@verica-app/cli 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,84 @@
1
+ # Changelog
2
+
3
+ All notable changes to `@verica-app/cli` are documented here.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
6
+ This package is **pre-1.0**: while on `0.x`, a breaking change bumps the **minor**
7
+ version and additive features/fixes bump the **patch** (see [Stability](./README.md#stability)).
8
+
9
+ ## [Unreleased]
10
+
11
+ ## [0.1.4] - 2026-06-19
12
+
13
+ ### Added
14
+
15
+ - `--reuse-if-unchanged` (with `--reuse-max-age <hrs>` and `--reuse-same-ref`) to
16
+ reuse a recent **completed** run instead of executing again when the config
17
+ (prompt version + model + sampling + dataset snapshot + graders) is unchanged.
18
+ Opt-in and freshness-bounded (default 24h, max 720h) — re-running stays the
19
+ default, since reuse can't see provider drift. On a cache hit the `--json`
20
+ element gains `"reused": true` and a `reusedFrom` block (additive — a normal
21
+ run's payload is unchanged); the API answers `200` instead of `202`. Cannot be
22
+ combined with `--threshold` / `--baseline-*` (the reused verdict was frozen under
23
+ the prior gate).
24
+
25
+ - `--git-repo-url <url>` (auto-detected from `GITHUB_SERVER_URL`/`GITHUB_REPOSITORY`,
26
+ else GitLab's `CI_PROJECT_URL`) sent as `git.repoUrl`, so the run UI can link the
27
+ commit SHA to `<repoUrl>/commit/<sha>` and the branch to `<repoUrl>/tree/<ref>`.
28
+
29
+ ## [0.1.3] - 2026-06-18
30
+
31
+ ### Added
32
+
33
+ - `--tools <file>` flag and `tools:` in `.verica.yml` to push a prompt's tool
34
+ definitions from the repo. The manifest accepts either a **path to a JSON file**
35
+ or an **inline array**. Each tool may use Verica's flat shape
36
+ (`{ name, description, parameters }`) **or** the OpenAI wrapper
37
+ (`{ type: "function", function: { … } }`), which is auto-unwrapped — paste your
38
+ real schemas as-is.
39
+
40
+ ### Changed
41
+
42
+ - **Prompt push is now field-level.** `--prompt` (user template), `--system-prompt`,
43
+ and `--tools` are independent: push only the fields you changed and every omitted
44
+ field is **inherited from the eval's current prompt version**. A new version is
45
+ created only if the merged content differs.
46
+ - Previously, a push that included `--prompt` without `--system-prompt` produced a
47
+ version with **no system prompt** (it was dropped, not inherited). If your CI
48
+ relied on that, pass all the fields you intend to set.
49
+ - `--system-prompt` (or `--tools`) can now be pushed **on its own** — earlier the
50
+ CLI sent a prompt block only when `--prompt` was present.
51
+ - Prompt templates reference dataset columns by their **bare name** (`{{ pais }}`),
52
+ and grader/judge prompts reference the model output via `{{ output.text }}` /
53
+ `{{ output.tool_calls }}`. The legacy `{{ item.* }}` / `{{ sample.* }}` forms still
54
+ resolve, so existing prompt files keep working.
55
+
56
+ ## [0.1.2] - 2026-06-18
57
+
58
+ ### Changed
59
+
60
+ - Default to the hosted API (`https://verica.app`). `VERICA_BASE_URL` and `--base-url`
61
+ are now overrides for local dev / self-hosting only — clients no longer configure a URL.
62
+
63
+ ## [0.1.1] - 2026-06-18
64
+
65
+ ### Changed
66
+
67
+ - Published under the `@verica-app` scope, with a tag-triggered release workflow
68
+ (`cli-v*` → `npm publish --provenance`). No behavior change.
69
+
70
+ ## [0.1.0] - 2026-06-18
71
+
72
+ ### Added
73
+
74
+ - Initial release. The `run` command:
75
+ - `--eval <id>` (single) or `--manifest <file>` (`.verica.yml`, multi-prompt).
76
+ - Pushes prompt content (`--prompt` / `--system-prompt`), versioned by content equality.
77
+ - `--model` · `--sampling <file.json>` for execution config.
78
+ - `--wait` polls to a terminal status; the exit code reflects the gate.
79
+ - `--junit <file>` · `--junit-mode rows|gate` for a JUnit report; `--json` for
80
+ machine-readable results.
81
+ - `--threshold` · `--baseline-ref` · `--baseline-run` to override the gate per branch.
82
+ - Git provenance (`--git-sha` / `--git-ref`) auto-detected from common CI env vars.
83
+ - Exit codes: `0` passed · `1` gate failed · `2` validation/transport error.
84
+ - `VERICA_TOKEN` is the only required secret; provider keys stay in Verica (BYOK).
package/README.md CHANGED
@@ -27,6 +27,8 @@ npm i -D @verica-app/cli
27
27
  verica run \
28
28
  --eval eval_8x2k9d \
29
29
  --prompt prompts/support-agent.txt \
30
+ --system-prompt prompts/support-agent.system.txt \
31
+ --tools prompts/support-agent.tools.json \
30
32
  --model gpt-4.1-mini \
31
33
  --wait \
32
34
  --junit verica-results.xml \
@@ -44,10 +46,16 @@ verica run --manifest .verica.yml --wait --junit report.xml
44
46
  evals:
45
47
  - id: eval_8x2k9d
46
48
  prompt: prompts/support-agent.txt
49
+ systemPrompt: prompts/support-agent.system.txt
50
+ tools: prompts/support-agent.tools.json # a path to a JSON file…
47
51
  sampling: { temperature: 0.2, maxTokens: 512 }
48
52
  model: gpt-4.1-mini
49
53
  - id: eval_3p1m7q
50
54
  prompt: prompts/triage.txt
55
+ tools: # …or an inline array
56
+ - name: get_order
57
+ description: Look up an order by id
58
+ parameters: { type: object, properties: { id: { type: string } }, required: [id] }
51
59
  model: claude-sonnet-4-6
52
60
  ```
53
61
 
@@ -63,20 +71,113 @@ default — you don't configure a URL.
63
71
  ## Key flags
64
72
 
65
73
  - `--eval <id>` / `--manifest <file>` — what to run.
66
- - `--prompt <file>` / `--system-prompt <file>` — prompt content to push (versioned by content).
74
+ - `--prompt <file>` / `--system-prompt <file>` / `--tools <file>` — prompt content to push (versioned by content). See [Prompt content](#prompt-content-what-you-push).
67
75
  - `--model <model>` · `--sampling <file.json>` — execution config.
68
76
  - `--wait` — poll to completion; the exit code reflects the gate.
69
77
  - `--junit <file>` · `--junit-mode rows|gate` — JUnit report (default `rows`).
70
78
  - `--json` — machine-readable results on stdout.
71
79
  - `--threshold <0..1>` · `--baseline-ref <ref>` · `--baseline-run <id>` — override the gate per branch.
72
- - `--git-sha` / `--git-ref` — provenance (auto-detected from CI env otherwise).
80
+ - `--reuse-if-unchanged` · `--reuse-max-age <hrs>` · `--reuse-same-ref` — reuse a recent completed run instead of re-executing an unchanged config. See [Reuse](#reuse-skip-re-running-an-unchanged-config).
81
+ - `--git-sha` / `--git-ref` / `--git-repo-url` — provenance + the repo web base that links the SHA in the run UI (all auto-detected from CI env otherwise).
73
82
 
74
83
  > Local dev / self-hosting only: point the CLI at another instance with `--base-url`
75
84
  > (or the `VERICA_BASE_URL` env var). Clients never need this.
76
85
 
86
+ ## Prompt content (what you push)
87
+
88
+ The repo owns the **prompt**: the user template (`--prompt`), the system prompt
89
+ (`--system-prompt`), and the tool definitions (`--tools`). The **dataset, graders,
90
+ gate, and any few-shot/simulated turns stay in Verica** — they're the test scenario,
91
+ managed by whoever owns the eval.
92
+
93
+ Each of the three prompt fields is **independent and optional**: push the ones you
94
+ changed and every omitted field is **inherited from the current version**. A push
95
+ creates a new prompt version only if the merged content actually differs.
96
+
97
+ ```bash
98
+ verica run --eval eval_8x2k9d --system-prompt prompts/agent.system.txt --model gpt-4.1-mini --wait
99
+ # user template + tools inherited; only the system prompt re-versions
100
+ ```
101
+
102
+ Templates reference dataset columns by name — e.g. `What is the capital of {{ pais }}?`
103
+ (the column is `pais`). Grader/judge prompts can also reference the model output via
104
+ `{{ output.text }}` / `{{ output.tool_calls }}`.
105
+
106
+ **Tools** are pushed as JSON — a `--tools <file>`, or a path / inline array under
107
+ `tools:` in the manifest. Each entry may be Verica's flat shape **or** the OpenAI
108
+ wrapper (auto-unwrapped), so you can paste your real schemas as-is:
109
+
110
+ ```json
111
+ [
112
+ {
113
+ "name": "get_order",
114
+ "description": "Look up an order by id",
115
+ "parameters": {
116
+ "type": "object",
117
+ "properties": { "id": { "type": "string" } },
118
+ "required": ["id"]
119
+ }
120
+ },
121
+ {
122
+ "type": "function",
123
+ "function": { "name": "cancel_order", "description": "…", "parameters": { "type": "object" } }
124
+ }
125
+ ]
126
+ ```
127
+
128
+ Tools are never executed — the model's _decision_ to call one (and with which
129
+ arguments) is what the eval grades.
130
+
131
+ ## Reuse (skip re-running an unchanged config)
132
+
133
+ By default every `verica run` executes — re-running an unchanged eval is often the
134
+ **point** in CI (it catches silent model drift and run-to-run variance, since an
135
+ eval's output isn't a pure function of its inputs). When you'd rather save the
136
+ tokens, opt in with `--reuse-if-unchanged`:
137
+
138
+ ```bash
139
+ verica run --eval eval_8x2k9d --model gpt-4.1-mini --reuse-if-unchanged --wait
140
+ # if the same config ran & completed in the last 24h, returns that verdict — no new run
141
+ ```
142
+
143
+ "Unchanged" means the **prompt version + model + sampling + dataset snapshot +
144
+ graders** all match a prior run (the gate is _not_ part of it — it decides the
145
+ verdict, not the output). On a hit the CLI exits on the prior run's frozen verdict
146
+ and the `--json` element carries `"reused": true` plus a `reusedFrom` block (the API
147
+ also answers `200` instead of `202`).
148
+
149
+ - `--reuse-max-age <hrs>` — how stale a reusable run may be (default **24**, max
150
+ **720**). There is no "forever": reuse can't see provider-side drift behind a
151
+ stable model id, so it's always bounded — that bound is your staleness budget.
152
+ - `--reuse-same-ref` — only reuse a run on the **same git ref**. Off by default: an
153
+ identical config produces the same output distribution regardless of branch.
154
+ - Only **completed** runs are reused (never a partial/failed one).
155
+ - Incompatible with `--threshold` / `--baseline-ref` / `--baseline-run` — a reused
156
+ verdict was frozen under its own gate, so it can't honor a new one.
157
+
158
+ Omit `--reuse-if-unchanged` (the default) any time you want a guaranteed fresh run.
159
+
77
160
  ## Exit codes
78
161
 
79
162
  `0` passed · `1` gate failed · `2` validation/transport error.
80
163
 
164
+ ## Stability
165
+
166
+ This CLI is **pre-1.0 (`0.x`)**. The command surface, the `--json` payload, the JUnit
167
+ output, and the prompt-push behavior are still settling and may change. Exit codes
168
+ (`0`/`1`/`2`) are stable.
169
+
170
+ During `0.x` the **minor** version is the breaking lever, so pin accordingly:
171
+
172
+ ```jsonc
173
+ // package.json
174
+ "@verica-app/cli": "~0.1" // >=0.1.0 <0.2.0 — gets patches, not breaking minors
175
+ ```
176
+
177
+ We bump the **minor** for any breaking change (flags, output shapes, push behavior) and
178
+ the **patch** for additive features and fixes. **1.0** will freeze the commands, flags,
179
+ exit codes, and output shapes under standard semver. See
180
+ [CHANGELOG.md](./CHANGELOG.md) for what changed in each release.
181
+
81
182
  MIT licensed. There's no IP in the client — the engine, graders, gate, and crypto all
82
183
  run server-side behind the token API.
package/dist/cli.js CHANGED
@@ -4067,9 +4067,16 @@ var samplingParamsSchema = external_exports.object({
4067
4067
  reasoning: external_exports.boolean().optional()
4068
4068
  });
4069
4069
  var runRequestSchema = external_exports.object({
4070
- /** Omit to run the eval's current (UI-managed) prompt unchanged. */
4070
+ /**
4071
+ * Prompt content to push. Omit the whole block to run the eval's current
4072
+ * (UI-managed) prompt unchanged. Each field is independent: supply only the
4073
+ * ones you're changing — every omitted field (template / systemPrompt / tools)
4074
+ * is inherited from the current version, and a new version is created only if
4075
+ * the merged result differs (e.g. push just `systemPrompt` to re-version the
4076
+ * system prompt while keeping the user template).
4077
+ */
4071
4078
  prompt: external_exports.object({
4072
- template: external_exports.string(),
4079
+ template: external_exports.string().optional(),
4073
4080
  systemPrompt: external_exports.string().optional(),
4074
4081
  tools: external_exports.array(toolDefinitionSchema).optional()
4075
4082
  }).optional(),
@@ -4079,7 +4086,13 @@ var runRequestSchema = external_exports.object({
4079
4086
  /** Commit provenance, stamped on the run. */
4080
4087
  git: external_exports.object({
4081
4088
  sha: external_exports.string().optional(),
4082
- ref: external_exports.string().optional()
4089
+ ref: external_exports.string().optional(),
4090
+ /**
4091
+ * The repository's web base URL (e.g. `https://github.com/acme/widgets`) so
4092
+ * the run UI can link the SHA → `<repoUrl>/commit/<sha>` and the branch →
4093
+ * `<repoUrl>/tree/<ref>`. The CLI auto-detects it from CI env.
4094
+ */
4095
+ repoUrl: external_exports.string().optional()
4083
4096
  }).optional(),
4084
4097
  /** CLI gate overrides (precedence over the eval's pass_condition). */
4085
4098
  gate: external_exports.object({
@@ -4089,6 +4102,23 @@ var runRequestSchema = external_exports.object({
4089
4102
  baselineRef: external_exports.string().optional(),
4090
4103
  /** Pin a specific baseline run (wins over baselineRef). */
4091
4104
  baselineRunId: external_exports.string().optional()
4105
+ }).optional(),
4106
+ /**
4107
+ * Opt-in cost control: when the merged config (prompt version + model +
4108
+ * sampling + dataset snapshot + graders) matches a recent COMPLETED run, the
4109
+ * server returns that run's frozen verdict instead of executing again. NOT a
4110
+ * default — an eval's output isn't a pure function of its config (generation +
4111
+ * judge are non-deterministic, the model endpoint drifts), so reuse is always
4112
+ * the caller's explicit choice and is bounded by `maxAgeHours`. Incompatible
4113
+ * with `gate` (the cached verdict was frozen under the old gate).
4114
+ */
4115
+ reuse: external_exports.object({
4116
+ /** Turn reuse on. The trigger — everything else is just tuning. */
4117
+ ifUnchanged: external_exports.boolean().optional(),
4118
+ /** Max age (hours) of a reusable run; server default 24, cap 720 (30d). No "infinite reuse". */
4119
+ maxAgeHours: external_exports.number().positive().max(720).optional(),
4120
+ /** Also require the prior run's git ref to match (per-branch isolation); default false. */
4121
+ sameRef: external_exports.boolean().optional()
4092
4122
  }).optional()
4093
4123
  });
4094
4124
  var runAcceptedSchema = external_exports.object({
@@ -4097,7 +4127,23 @@ var runAcceptedSchema = external_exports.object({
4097
4127
  promptVersion: external_exports.number().int(),
4098
4128
  /** Whether a NEW prompt version was created (vs. the current one reused). */
4099
4129
  created: external_exports.boolean(),
4100
- resultUrl: external_exports.string()
4130
+ resultUrl: external_exports.string(),
4131
+ /**
4132
+ * Whether this response reuses a prior run instead of executing a new one (a
4133
+ * cache hit on `reuse.ifUnchanged`). The HTTP status reflects it too: 200 when
4134
+ * reused, 202 when a fresh run was enqueued. Optional so an older API that
4135
+ * predates reuse (omitting it) reads as `false`.
4136
+ */
4137
+ reused: external_exports.boolean().optional(),
4138
+ /** Provenance of the reused run — present iff `reused` is true. */
4139
+ reusedFrom: external_exports.object({
4140
+ runId: external_exports.string(),
4141
+ /** ISO timestamp the reused run finished — shows how stale the verdict is. */
4142
+ finishedAt: external_exports.string(),
4143
+ status: external_exports.literal("completed"),
4144
+ gitSha: external_exports.string().nullable(),
4145
+ gitRef: external_exports.string().nullable()
4146
+ }).optional()
4101
4147
  });
4102
4148
  var runStatusSchema = external_exports.enum([
4103
4149
  "queued",
@@ -4289,6 +4335,7 @@ function parseManifest(raw, source = ".verica.yml") {
4289
4335
  id: e.id,
4290
4336
  prompt: typeof e.prompt === "string" ? e.prompt : void 0,
4291
4337
  systemPrompt: typeof e.systemPrompt === "string" ? e.systemPrompt : void 0,
4338
+ tools: typeof e.tools === "string" || Array.isArray(e.tools) ? e.tools : void 0,
4292
4339
  model: typeof e.model === "string" ? e.model : void 0,
4293
4340
  sampling: e.sampling ?? void 0
4294
4341
  };
@@ -4298,6 +4345,27 @@ async function loadManifest(path) {
4298
4345
  return parseManifest(await readFile(path, "utf8"), path);
4299
4346
  }
4300
4347
 
4348
+ // src/tools.ts
4349
+ function unwrap(entry) {
4350
+ if (entry !== null && typeof entry === "object" && entry.type === "function" && typeof entry.function === "object" && entry.function !== null) {
4351
+ return entry.function;
4352
+ }
4353
+ return entry;
4354
+ }
4355
+ function normalizeToolDefinitions(raw) {
4356
+ if (!Array.isArray(raw)) {
4357
+ throw new Error("tools must be a JSON array of tool definitions.");
4358
+ }
4359
+ return raw.map((entry, i) => {
4360
+ const parsed = toolDefinitionSchema.safeParse(unwrap(entry));
4361
+ if (!parsed.success) {
4362
+ const why = parsed.error.issues.map((issue) => issue.message).join("; ");
4363
+ throw new Error(`tools[${i}] is not a valid tool definition: ${why}`);
4364
+ }
4365
+ return parsed.data;
4366
+ });
4367
+ }
4368
+
4301
4369
  // src/junit.ts
4302
4370
  function esc(s) {
4303
4371
  return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
@@ -4380,19 +4448,24 @@ async function runCommand(opts) {
4380
4448
  const entries = await resolveEntries(opts);
4381
4449
  const git = resolveGit(opts);
4382
4450
  const gate = resolveGate(opts);
4451
+ const reuse = resolveReuse(opts);
4383
4452
  const suites = [];
4384
4453
  const summaries = [];
4385
4454
  let worst = EXIT.pass;
4386
4455
  for (const entry of entries) {
4387
4456
  try {
4388
- const body = await buildRequest(entry, { samplingFile: opts.samplingFile, git, gate });
4457
+ const body = await buildRequest(entry, { samplingFile: opts.samplingFile, git, gate, reuse });
4389
4458
  const accepted = await client.triggerRun(entry.id, body);
4390
- err(
4391
- `\u25B6 ${entry.id}: run ${accepted.runId} queued (prompt v${accepted.promptVersion}${accepted.created ? ", new version" : ", reused"})`
4392
- );
4459
+ const promptNote = `prompt v${accepted.promptVersion}${accepted.created ? ", new version" : ""}`;
4460
+ if (accepted.reused) {
4461
+ err(`\u267B ${entry.id}: run ${accepted.runId} reused (${promptNote})`);
4462
+ if (accepted.reusedFrom) err(` \u21B3 a completed run from ${accepted.reusedFrom.finishedAt}`);
4463
+ } else {
4464
+ err(`\u25B6 ${entry.id}: run ${accepted.runId} queued (${promptNote})`);
4465
+ }
4393
4466
  err(` ${accepted.resultUrl}`);
4394
4467
  if (!opts.wait) {
4395
- summaries.push({ evalId: entry.id, runId: accepted.runId, resultUrl: accepted.resultUrl });
4468
+ summaries.push(buildSummary(entry.id, { status: "queued", accepted }));
4396
4469
  continue;
4397
4470
  }
4398
4471
  const run = await pollUntilTerminal(client, accepted.runId, {
@@ -4406,12 +4479,12 @@ async function runCommand(opts) {
4406
4479
  opts.junitMode === "rows" ? rowsSuite(entry.id, await client.getResults(accepted.runId)) : gateSuite(entry.id, run)
4407
4480
  );
4408
4481
  }
4409
- summaries.push({ evalId: entry.id, runId: accepted.runId, ...run });
4482
+ summaries.push(buildSummary(entry.id, { status: "waited", accepted, run }));
4410
4483
  } catch (e) {
4411
4484
  worst = EXIT.error;
4412
4485
  const message = e instanceof Error ? e.message : String(e);
4413
4486
  err(`\u2717 ${entry.id}: ${message}`);
4414
- summaries.push({ evalId: entry.id, error: message });
4487
+ summaries.push(buildSummary(entry.id, { status: "error", message }));
4415
4488
  }
4416
4489
  }
4417
4490
  if (opts.junitFile && suites.length > 0) {
@@ -4432,6 +4505,7 @@ async function resolveEntries(opts) {
4432
4505
  id: opts.evalId,
4433
4506
  prompt: opts.promptFile,
4434
4507
  systemPrompt: opts.systemPromptFile,
4508
+ tools: opts.toolsFile,
4435
4509
  model: opts.model
4436
4510
  }
4437
4511
  ];
@@ -4442,23 +4516,72 @@ async function buildRequest(entry, ctx) {
4442
4516
  }
4443
4517
  const template = entry.prompt ? await readFile2(entry.prompt, "utf8") : void 0;
4444
4518
  const systemPrompt = entry.systemPrompt ? await readFile2(entry.systemPrompt, "utf8") : void 0;
4519
+ const tools = await resolveTools(entry.tools);
4445
4520
  let sampling = entry.sampling;
4446
4521
  if (!sampling && ctx.samplingFile) {
4447
4522
  sampling = JSON.parse(await readFile2(ctx.samplingFile, "utf8"));
4448
4523
  }
4524
+ const prompt = template !== void 0 || systemPrompt !== void 0 || tools !== void 0 ? {
4525
+ ...template !== void 0 ? { template } : {},
4526
+ ...systemPrompt !== void 0 ? { systemPrompt } : {},
4527
+ ...tools !== void 0 ? { tools } : {}
4528
+ } : void 0;
4449
4529
  return {
4450
4530
  model: entry.model,
4451
- ...template !== void 0 ? { prompt: { template, ...systemPrompt !== void 0 ? { systemPrompt } : {} } } : {},
4531
+ ...prompt ? { prompt } : {},
4452
4532
  ...sampling ? { samplingParams: sampling } : {},
4453
4533
  ...ctx.git ? { git: ctx.git } : {},
4454
- ...ctx.gate ? { gate: ctx.gate } : {}
4534
+ ...ctx.gate ? { gate: ctx.gate } : {},
4535
+ ...ctx.reuse ? { reuse: ctx.reuse } : {}
4455
4536
  };
4456
4537
  }
4538
+ function buildSummary(evalId, outcome) {
4539
+ switch (outcome.status) {
4540
+ case "queued":
4541
+ return {
4542
+ evalId,
4543
+ runId: outcome.accepted.runId,
4544
+ resultUrl: outcome.accepted.resultUrl,
4545
+ ...reuseFields(outcome.accepted)
4546
+ };
4547
+ case "waited":
4548
+ return {
4549
+ evalId,
4550
+ runId: outcome.accepted.runId,
4551
+ ...outcome.run,
4552
+ ...reuseFields(outcome.accepted)
4553
+ };
4554
+ case "error":
4555
+ return { evalId, error: outcome.message };
4556
+ }
4557
+ }
4558
+ function reuseFields(accepted) {
4559
+ if (!accepted.reused) return {};
4560
+ return { reused: true, ...accepted.reusedFrom ? { reusedFrom: accepted.reusedFrom } : {} };
4561
+ }
4562
+ async function resolveTools(tools) {
4563
+ if (tools === void 0) return void 0;
4564
+ const raw = typeof tools === "string" ? JSON.parse(await readFile2(tools, "utf8")) : tools;
4565
+ return normalizeToolDefinitions(raw);
4566
+ }
4457
4567
  function resolveGit(opts) {
4458
4568
  const sha = opts.gitSha ?? process.env.GITHUB_SHA ?? process.env.CI_COMMIT_SHA;
4459
4569
  const ref = opts.gitRef ?? process.env.GITHUB_REF ?? process.env.CI_COMMIT_REF_NAME;
4460
- if (!sha && !ref) return void 0;
4461
- return { ...sha ? { sha } : {}, ...ref ? { ref } : {} };
4570
+ const repoUrl = resolveRepoUrl(opts);
4571
+ if (!sha && !ref && !repoUrl) return void 0;
4572
+ return {
4573
+ ...sha ? { sha } : {},
4574
+ ...ref ? { ref } : {},
4575
+ ...repoUrl ? { repoUrl } : {}
4576
+ };
4577
+ }
4578
+ function resolveRepoUrl(opts) {
4579
+ if (opts.gitRepoUrl) return opts.gitRepoUrl.replace(/\/+$/, "");
4580
+ const server = process.env.GITHUB_SERVER_URL;
4581
+ const repo = process.env.GITHUB_REPOSITORY;
4582
+ if (server && repo) return `${server.replace(/\/+$/, "")}/${repo}`;
4583
+ if (process.env.CI_PROJECT_URL) return process.env.CI_PROJECT_URL.replace(/\/+$/, "");
4584
+ return void 0;
4462
4585
  }
4463
4586
  function resolveGate(opts) {
4464
4587
  const gate = {};
@@ -4467,6 +4590,14 @@ function resolveGate(opts) {
4467
4590
  if (opts.baselineRun !== void 0) gate.baselineRunId = opts.baselineRun;
4468
4591
  return Object.keys(gate).length > 0 ? gate : void 0;
4469
4592
  }
4593
+ function resolveReuse(opts) {
4594
+ if (!opts.reuseIfUnchanged) return void 0;
4595
+ return {
4596
+ ifUnchanged: true,
4597
+ ...opts.reuseMaxAgeHours !== void 0 ? { maxAgeHours: opts.reuseMaxAgeHours } : {},
4598
+ ...opts.reuseSameRef ? { sameRef: true } : {}
4599
+ };
4600
+ }
4470
4601
  function pct(n) {
4471
4602
  return n == null ? "?" : `${(n * 100).toFixed(1)}%`;
4472
4603
  }
@@ -4494,8 +4625,13 @@ Usage:
4494
4625
 
4495
4626
  Options:
4496
4627
  --eval <id> Eval to run (or use --manifest for many).
4497
- --prompt <file> Prompt template file to push (versioned by content).
4498
- --system-prompt <file> System-prompt file (optional).
4628
+ --prompt <file> User prompt (template) file to push (versioned by content).
4629
+ --system-prompt <file> System-prompt file. Either prompt file is optional and
4630
+ independent: push one and the other is inherited from
4631
+ the current version (omit both to run it unchanged).
4632
+ --tools <file> Tool definitions to push: a JSON file (Verica-flat or
4633
+ OpenAI {type:function,\u2026} entries). Omit to inherit the
4634
+ current version's tools. Inline arrays: .verica.yml only.
4499
4635
  --model <model> Model to sample under (overrides the manifest).
4500
4636
  --sampling <file> JSON sampling params (temperature, maxTokens, \u2026).
4501
4637
  --manifest <file> .verica.yml mapping prompts \u2192 eval IDs (multi-prompt).
@@ -4506,8 +4642,16 @@ Options:
4506
4642
  --threshold <0..1> Override the gate's minimum pass rate.
4507
4643
  --baseline-ref <ref> No-regression baseline = last run on this git ref.
4508
4644
  --baseline-run <id> No-regression baseline = this specific run.
4645
+ --reuse-if-unchanged Reuse a recent completed run instead of executing again
4646
+ when the config (prompt + model + sampling + dataset +
4647
+ graders) is unchanged. Off by default. Incompatible with
4648
+ --threshold / --baseline-*.
4649
+ --reuse-max-age <hrs> Max age of a reusable run (default 24, max 720).
4650
+ --reuse-same-ref Only reuse a run on the same git ref (default: any ref).
4509
4651
  --git-sha <sha> Commit SHA (else auto-detected from CI env).
4510
4652
  --git-ref <ref> Git ref (else auto-detected from CI env).
4653
+ --git-repo-url <url> Repo web base for the SHA link in the run UI (e.g.
4654
+ https://github.com/acme/widgets). Auto-detected from CI env.
4511
4655
  --base-url <url> Override the API base URL (dev/self-host only).
4512
4656
  --poll-interval <sec> Initial poll interval (default 3).
4513
4657
  --timeout <sec> Max wait (default 1800).
@@ -4531,6 +4675,7 @@ async function main() {
4531
4675
  eval: { type: "string" },
4532
4676
  prompt: { type: "string" },
4533
4677
  "system-prompt": { type: "string" },
4678
+ tools: { type: "string" },
4534
4679
  model: { type: "string" },
4535
4680
  sampling: { type: "string" },
4536
4681
  manifest: { type: "string" },
@@ -4541,8 +4686,12 @@ async function main() {
4541
4686
  threshold: { type: "string" },
4542
4687
  "baseline-ref": { type: "string" },
4543
4688
  "baseline-run": { type: "string" },
4689
+ "reuse-if-unchanged": { type: "boolean", default: false },
4690
+ "reuse-max-age": { type: "string" },
4691
+ "reuse-same-ref": { type: "boolean", default: false },
4544
4692
  "git-sha": { type: "string" },
4545
4693
  "git-ref": { type: "string" },
4694
+ "git-repo-url": { type: "string" },
4546
4695
  "base-url": { type: "string" },
4547
4696
  "poll-interval": { type: "string" },
4548
4697
  timeout: { type: "string" },
@@ -4561,12 +4710,24 @@ async function main() {
4561
4710
  if (values.threshold !== void 0 && threshold === void 0) {
4562
4711
  throw new Error(`--threshold must be a number between 0 and 1 (got "${values.threshold}").`);
4563
4712
  }
4713
+ const reuseMaxAge = finiteNumber(values["reuse-max-age"]);
4714
+ if (values["reuse-max-age"] !== void 0 && (reuseMaxAge === void 0 || reuseMaxAge <= 0 || reuseMaxAge > 720)) {
4715
+ throw new Error(
4716
+ `--reuse-max-age must be a number of hours in (0, 720] (got "${values["reuse-max-age"]}").`
4717
+ );
4718
+ }
4719
+ if (values["reuse-if-unchanged"] && (threshold !== void 0 || values["baseline-ref"] !== void 0 || values["baseline-run"] !== void 0)) {
4720
+ throw new Error(
4721
+ "--reuse-if-unchanged cannot be combined with --threshold / --baseline-ref / --baseline-run (a reused verdict was frozen under the prior gate)."
4722
+ );
4723
+ }
4564
4724
  const opts = {
4565
4725
  baseUrl,
4566
4726
  token,
4567
4727
  evalId: values.eval,
4568
4728
  promptFile: values.prompt,
4569
4729
  systemPromptFile: values["system-prompt"],
4730
+ toolsFile: values.tools,
4570
4731
  samplingFile: values.sampling,
4571
4732
  model: values.model,
4572
4733
  manifestFile: values.manifest,
@@ -4577,8 +4738,12 @@ async function main() {
4577
4738
  threshold,
4578
4739
  baselineRef: values["baseline-ref"],
4579
4740
  baselineRun: values["baseline-run"],
4741
+ reuseIfUnchanged: values["reuse-if-unchanged"] ?? false,
4742
+ reuseMaxAgeHours: reuseMaxAge,
4743
+ reuseSameRef: values["reuse-same-ref"] ?? false,
4580
4744
  gitSha: values["git-sha"],
4581
4745
  gitRef: values["git-ref"],
4746
+ gitRepoUrl: values["git-repo-url"],
4582
4747
  pollIntervalMs: (finiteNumber(values["poll-interval"]) ?? 3) * 1e3,
4583
4748
  timeoutMs: (finiteNumber(values.timeout) ?? 1800) * 1e3
4584
4749
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@verica-app/cli",
3
- "version": "0.1.2",
3
+ "version": "0.1.4",
4
4
  "private": false,
5
5
  "description": "Run a Verica eval from CI and block the merge on the result.",
6
6
  "license": "MIT",
@@ -12,18 +12,15 @@
12
12
  "prompt",
13
13
  "testing"
14
14
  ],
15
- "repository": {
16
- "type": "git",
17
- "url": "git+https://github.com/mtn-labs/evals.git",
18
- "directory": "packages/cli"
19
- },
20
- "homepage": "https://github.com/mtn-labs/evals/tree/main/packages/cli#readme",
15
+ "homepage": "https://verica.app",
21
16
  "type": "module",
22
17
  "bin": {
23
18
  "verica": "./dist/cli.js"
24
19
  },
25
20
  "files": [
26
- "dist"
21
+ "dist",
22
+ "README.md",
23
+ "CHANGELOG.md"
27
24
  ],
28
25
  "publishConfig": {
29
26
  "access": "public"