@runtypelabs/cli 2.23.1 → 2.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2):
  1. package/dist/index.js +237 -191
  2. package/package.json +3 -3
package/dist/index.js CHANGED
@@ -37591,39 +37591,46 @@ var BUILT_IN_GRADER_IDS = [
37591
37591
  "rightTone",
37592
37592
  "safeToSend"
37593
37593
  ];
37594
+ var graderSeveritySchema = external_exports.enum(["gate", "soft"]);
37595
+ var severityFields = { severity: graderSeveritySchema.optional() };
37594
37596
  var checkGraderSchema = external_exports.discriminatedUnion("kind", [
37595
37597
  external_exports.object({
37596
37598
  kind: external_exports.literal("contains"),
37597
37599
  value: external_exports.string(),
37598
- caseSensitive: external_exports.boolean().optional()
37600
+ caseSensitive: external_exports.boolean().optional(),
37601
+ ...severityFields
37599
37602
  }),
37600
37603
  external_exports.object({
37601
37604
  kind: external_exports.literal("not_contains"),
37602
37605
  value: external_exports.string(),
37603
- caseSensitive: external_exports.boolean().optional()
37606
+ caseSensitive: external_exports.boolean().optional(),
37607
+ ...severityFields
37604
37608
  }),
37605
37609
  // Exact/normalized match against `case.expected.text`.
37606
- external_exports.object({ kind: external_exports.literal("matches_expected") }),
37610
+ external_exports.object({ kind: external_exports.literal("matches_expected"), ...severityFields }),
37607
37611
  external_exports.object({
37608
37612
  kind: external_exports.literal("regex"),
37609
37613
  pattern: external_exports.string(),
37610
- flags: external_exports.string().optional()
37614
+ flags: external_exports.string().optional(),
37615
+ ...severityFields
37611
37616
  }),
37612
- external_exports.object({ kind: external_exports.literal("valid_json") }),
37617
+ external_exports.object({ kind: external_exports.literal("valid_json"), ...severityFields }),
37613
37618
  external_exports.object({
37614
37619
  kind: external_exports.literal("json_field"),
37615
37620
  path: external_exports.string(),
37616
37621
  equals: external_exports.unknown().optional(),
37617
- exists: external_exports.boolean().optional()
37622
+ exists: external_exports.boolean().optional(),
37623
+ ...severityFields
37618
37624
  }),
37619
37625
  external_exports.object({
37620
37626
  kind: external_exports.literal("length"),
37621
37627
  minChars: external_exports.number().int().nonnegative().optional(),
37622
- maxChars: external_exports.number().int().nonnegative().optional()
37628
+ maxChars: external_exports.number().int().nonnegative().optional(),
37629
+ ...severityFields
37623
37630
  }),
37624
- external_exports.object({ kind: external_exports.literal("latency"), maxMs: external_exports.number().int().positive() }),
37631
+ external_exports.object({ kind: external_exports.literal("latency"), maxMs: external_exports.number().int().positive(), ...severityFields }),
37625
37632
  // Today's implicit "success" made explicit: the case produced output without erroring.
37626
- external_exports.object({ kind: external_exports.literal("no_error") }),
37633
+ external_exports.object({ kind: external_exports.literal("no_error"), ...severityFields }),
37627
37634
  // -------------------------------------------------------------------------
37628
37635
  // Trace checks — deterministic, free, pure assertions over the run's
37629
37636
  // EXECUTION TRACE (which tools/steps ran, in what order, whether it
@@ -37641,25 +37648,30 @@ var checkGraderSchema = external_exports.discriminatedUnion("kind", [
37641
37648
  input: external_exports.unknown().optional(),
37642
37649
  output: external_exports.unknown().optional(),
37643
37650
  isError: external_exports.boolean().optional(),
37644
- times: external_exports.number().int().positive().optional()
37651
+ times: external_exports.number().int().positive().optional(),
37652
+ ...severityFields
37645
37653
  }),
37646
37654
  // No tool named `name` was called.
37647
- external_exports.object({ kind: external_exports.literal("not_called_tool"), name: external_exports.string().min(1) }),
37655
+ external_exports.object({ kind: external_exports.literal("not_called_tool"), name: external_exports.string().min(1), ...severityFields }),
37648
37656
  // The run made no tool calls at all.
37649
- external_exports.object({ kind: external_exports.literal("used_no_tools") }),
37657
+ external_exports.object({ kind: external_exports.literal("used_no_tools"), ...severityFields }),
37650
37658
  // The run made at most `max` tool calls.
37651
- external_exports.object({ kind: external_exports.literal("max_tool_calls"), max: external_exports.number().int().nonnegative() }),
37659
+ external_exports.object({
37660
+ kind: external_exports.literal("max_tool_calls"),
37661
+ max: external_exports.number().int().nonnegative(),
37662
+ ...severityFields
37663
+ }),
37652
37664
  // `tools` appears as an ordered SUBSEQUENCE of the tool-call names (other
37653
37665
  // calls may interleave; relative order of the listed tools must hold).
37654
- external_exports.object({ kind: external_exports.literal("tool_order"), tools: external_exports.array(external_exports.string()).min(1) }),
37666
+ external_exports.object({ kind: external_exports.literal("tool_order"), tools: external_exports.array(external_exports.string()).min(1), ...severityFields }),
37655
37667
  // A step named (or typed) `name` ran.
37656
- external_exports.object({ kind: external_exports.literal("ran_step"), name: external_exports.string().min(1) }),
37668
+ external_exports.object({ kind: external_exports.literal("ran_step"), name: external_exports.string().min(1), ...severityFields }),
37657
37669
  // `steps` appears as an ordered SUBSEQUENCE of the steps that ran.
37658
- external_exports.object({ kind: external_exports.literal("step_order"), steps: external_exports.array(external_exports.string()).min(1) }),
37670
+ external_exports.object({ kind: external_exports.literal("step_order"), steps: external_exports.array(external_exports.string()).min(1), ...severityFields }),
37659
37671
  // The run completed (finished without erroring and was not left paused).
37660
- external_exports.object({ kind: external_exports.literal("completed") }),
37672
+ external_exports.object({ kind: external_exports.literal("completed"), ...severityFields }),
37661
37673
  // Total run cost was within `maxUsd` (US dollars).
37662
- external_exports.object({ kind: external_exports.literal("cost"), maxUsd: external_exports.number().positive() })
37674
+ external_exports.object({ kind: external_exports.literal("cost"), maxUsd: external_exports.number().positive(), ...severityFields })
37663
37675
  ]);
37664
37676
  var aiGraderSchema = external_exports.object({
37665
37677
  kind: external_exports.literal("ai"),
@@ -37671,7 +37683,8 @@ var aiGraderSchema = external_exports.object({
37671
37683
  /** Defaults to a cheap routed model (e.g. claude-haiku-4-5) at execution time. */
37672
37684
  model: external_exports.string().optional(),
37673
37685
  /** Pass cutoff for the 1-5 scale. */
37674
- threshold: external_exports.number().min(1).max(5).optional()
37686
+ threshold: external_exports.number().min(1).max(5).optional(),
37687
+ ...severityFields
37675
37688
  });
37676
37689
  var graderConfigSchema = external_exports.union([checkGraderSchema, aiGraderSchema]);
37677
37690
  var gradersSchema = external_exports.array(graderConfigSchema);
@@ -63990,185 +64003,191 @@ function buildJUnitXml(suites) {
63990
64003
 
63991
64004
  // src/commands/eval.ts
63992
64005
  var evalCommand = new Command20("eval").description("Manage evaluations");
63993
- evalCommand.command("submit").description("Submit an eval batch").requiredOption("-f, --flow <id>", "Flow ID to evaluate").requiredOption("-r, --records <file>", "JSON file with record IDs").option("-n, --name <name>", "Eval batch name").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(async (options) => {
63994
- const apiKey = await ensureAuth();
63995
- if (!apiKey) return;
63996
- let recordIds;
63997
- try {
63998
- const content = readFileSync16(options.records, "utf-8");
63999
- const parsed = JSON.parse(content);
64000
- recordIds = Array.isArray(parsed) ? parsed : parsed.recordIds || parsed.records || [];
64001
- } catch (error51) {
64002
- const message = error51 instanceof Error ? error51.message : "Unknown error";
64003
- console.error(chalk27.red(`Failed to read records file: ${message}`));
64004
- process.exit(1);
64005
- return;
64006
- }
64007
- const client = createCliClient(apiKey);
64008
- if (!isTTY(options) || options.json) {
64006
+ evalCommand.command("submit").description("Submit an eval batch").requiredOption("-f, --flow <id>", "Flow ID to evaluate").requiredOption("-r, --records <file>", "JSON file with record IDs").option("-n, --name <name>", "Eval batch name").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(
64007
+ async (options) => {
64008
+ const apiKey = await ensureAuth();
64009
+ if (!apiKey) return;
64010
+ let recordIds;
64009
64011
  try {
64010
- const data = await client.post("/eval/submit", {
64011
- flowId: options.flow,
64012
- recordIds,
64013
- name: options.name
64014
- });
64015
- if (options.json) {
64016
- printJson(data);
64017
- } else {
64018
- console.log(chalk27.green("Eval submitted"));
64019
- console.log(` Batch ID: ${chalk27.green(data.id)}`);
64020
- if (data.name) console.log(` Name: ${data.name}`);
64021
- console.log(` Status: ${data.status}`);
64022
- console.log(` Records: ${data.totalRecords}`);
64023
- if (data.groupId) console.log(` Group: ${data.groupId}`);
64024
- }
64012
+ const content = readFileSync16(options.records, "utf-8");
64013
+ const parsed = JSON.parse(content);
64014
+ recordIds = Array.isArray(parsed) ? parsed : parsed.recordIds || parsed.records || [];
64025
64015
  } catch (error51) {
64026
64016
  const message = error51 instanceof Error ? error51.message : "Unknown error";
64027
- console.error(chalk27.red("Failed to submit eval"));
64028
- console.error(chalk27.red(message));
64017
+ console.error(chalk27.red(`Failed to read records file: ${message}`));
64029
64018
  process.exit(1);
64019
+ return;
64030
64020
  }
64031
- return;
64032
- }
64033
- const App = () => {
64034
- const [loading, setLoading] = useState36(true);
64035
- const [success2, setSuccess] = useState36(null);
64036
- const [error51, setError] = useState36(null);
64037
- const [resultNode, setResultNode] = useState36(void 0);
64038
- useEffect30(() => {
64039
- const run2 = async () => {
64040
- try {
64041
- const data = await client.post("/eval/submit", {
64042
- flowId: options.flow,
64043
- recordIds,
64044
- name: options.name
64045
- });
64046
- const fields = [
64047
- { label: "Batch ID", value: data.id, color: "green" }
64048
- ];
64049
- if (data.name) fields.push({ label: "Name", value: data.name });
64050
- fields.push({ label: "Status", value: data.status });
64051
- fields.push({ label: "Records", value: data.totalRecords });
64052
- if (data.groupId) fields.push({ label: "Group", value: data.groupId });
64053
- setResultNode(React19.createElement(EntityCard, { fields }));
64054
- setSuccess(true);
64055
- setLoading(false);
64056
- } catch (err) {
64057
- setError(err instanceof Error ? err : new Error(String(err)));
64058
- setSuccess(false);
64059
- setLoading(false);
64060
- }
64061
- };
64062
- run2();
64063
- }, []);
64064
- return React19.createElement(MutationResult, {
64065
- loading,
64066
- loadingLabel: `Submitting eval with ${recordIds.length} records...`,
64067
- success: success2,
64068
- successMessage: "Eval submitted",
64069
- error: error51,
64070
- result: resultNode
64071
- });
64072
- };
64073
- const { waitUntilExit } = render19(React19.createElement(App));
64074
- await waitUntilExit();
64075
- });
64076
- evalCommand.command("list").description("List eval batches").option("--flow <id>", "Filter by flow ID").option("--limit <n>", "Limit results", "20").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(async (options) => {
64077
- const apiKey = await ensureAuth();
64078
- if (!apiKey) return;
64079
- const client = createCliClient(apiKey);
64080
- const params = { limit: options.limit };
64081
- if (options.flow) params.flowId = options.flow;
64082
- if (!isTTY(options) || options.json) {
64083
- try {
64084
- const data = await client.get("/eval/batches", params);
64085
- if (options.json) {
64086
- printJson(data);
64087
- } else {
64088
- const batches = data.data ?? [];
64089
- if (batches.length === 0) {
64090
- console.log(chalk27.gray("No eval batches found"));
64091
- return;
64092
- }
64093
- console.log(chalk27.cyan("Eval Batches:"));
64094
- for (const batch of batches) {
64095
- const name = batch.name || batch.id;
64096
- const progress = batch.totalRecords ? `${batch.completedRecords ?? 0}/${batch.totalRecords}` : "";
64097
- const statusColor = batch.status === "completed" ? "green" : "yellow";
64098
- console.log(
64099
- ` ${chalk27.green(batch.id)} ${name} ${chalk27[statusColor](`[${batch.status}]`)} ${chalk27.gray(progress)}`
64100
- );
64101
- }
64102
- const total = getTotalCount(data.pagination);
64103
- if (total !== void 0) {
64104
- console.log(chalk27.dim(`
64105
- Total: ${total} batches`));
64021
+ const client = createCliClient(apiKey);
64022
+ if (!isTTY(options) || options.json) {
64023
+ try {
64024
+ const data = await client.post("/eval/submit", {
64025
+ flowId: options.flow,
64026
+ recordIds,
64027
+ name: options.name
64028
+ });
64029
+ if (options.json) {
64030
+ printJson(data);
64031
+ } else {
64032
+ console.log(chalk27.green("Eval submitted"));
64033
+ console.log(` Batch ID: ${chalk27.green(data.id)}`);
64034
+ if (data.name) console.log(` Name: ${data.name}`);
64035
+ console.log(` Status: ${data.status}`);
64036
+ console.log(` Records: ${data.totalRecords}`);
64037
+ if (data.groupId) console.log(` Group: ${data.groupId}`);
64106
64038
  }
64039
+ } catch (error51) {
64040
+ const message = error51 instanceof Error ? error51.message : "Unknown error";
64041
+ console.error(chalk27.red("Failed to submit eval"));
64042
+ console.error(chalk27.red(message));
64043
+ process.exit(1);
64107
64044
  }
64108
- } catch (error51) {
64109
- const message = error51 instanceof Error ? error51.message : "Unknown error";
64110
- console.error(chalk27.red("Failed to fetch eval batches"));
64111
- console.error(chalk27.red(message));
64112
- process.exit(1);
64045
+ return;
64113
64046
  }
64114
- return;
64047
+ const App = () => {
64048
+ const [loading, setLoading] = useState36(true);
64049
+ const [success2, setSuccess] = useState36(null);
64050
+ const [error51, setError] = useState36(null);
64051
+ const [resultNode, setResultNode] = useState36(void 0);
64052
+ useEffect30(() => {
64053
+ const run2 = async () => {
64054
+ try {
64055
+ const data = await client.post("/eval/submit", {
64056
+ flowId: options.flow,
64057
+ recordIds,
64058
+ name: options.name
64059
+ });
64060
+ const fields = [{ label: "Batch ID", value: data.id, color: "green" }];
64061
+ if (data.name) fields.push({ label: "Name", value: data.name });
64062
+ fields.push({ label: "Status", value: data.status });
64063
+ fields.push({ label: "Records", value: data.totalRecords });
64064
+ if (data.groupId) fields.push({ label: "Group", value: data.groupId });
64065
+ setResultNode(React19.createElement(EntityCard, { fields }));
64066
+ setSuccess(true);
64067
+ setLoading(false);
64068
+ } catch (err) {
64069
+ setError(err instanceof Error ? err : new Error(String(err)));
64070
+ setSuccess(false);
64071
+ setLoading(false);
64072
+ }
64073
+ };
64074
+ run2();
64075
+ }, []);
64076
+ return React19.createElement(MutationResult, {
64077
+ loading,
64078
+ loadingLabel: `Submitting eval with ${recordIds.length} records...`,
64079
+ success: success2,
64080
+ successMessage: "Eval submitted",
64081
+ error: error51,
64082
+ result: resultNode
64083
+ });
64084
+ };
64085
+ const { waitUntilExit } = render19(React19.createElement(App));
64086
+ await waitUntilExit();
64115
64087
  }
64116
- const App = () => {
64117
- const [loading, setLoading] = useState36(true);
64118
- const [items, setItems] = useState36(null);
64119
- const [total, setTotal] = useState36(void 0);
64120
- const [error51, setError] = useState36(null);
64121
- useEffect30(() => {
64122
- const run2 = async () => {
64123
- try {
64124
- const data = await client.get("/eval/batches", params);
64125
- setItems(data.data ?? []);
64126
- setTotal(getTotalCount(data.pagination));
64127
- setLoading(false);
64128
- } catch (err) {
64129
- setError(err instanceof Error ? err : new Error(String(err)));
64130
- setLoading(false);
64088
+ );
64089
+ evalCommand.command("list").description("List eval batches").option("--flow <id>", "Filter by flow ID").option("--limit <n>", "Limit results", "20").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(
64090
+ async (options) => {
64091
+ const apiKey = await ensureAuth();
64092
+ if (!apiKey) return;
64093
+ const client = createCliClient(apiKey);
64094
+ const params = { limit: options.limit };
64095
+ if (options.flow) params.flowId = options.flow;
64096
+ if (!isTTY(options) || options.json) {
64097
+ try {
64098
+ const data = await client.get("/eval/batches", params);
64099
+ if (options.json) {
64100
+ printJson(data);
64101
+ } else {
64102
+ const batches = data.data ?? [];
64103
+ if (batches.length === 0) {
64104
+ console.log(chalk27.gray("No eval batches found"));
64105
+ return;
64106
+ }
64107
+ console.log(chalk27.cyan("Eval Batches:"));
64108
+ for (const batch of batches) {
64109
+ const name = batch.name || batch.id;
64110
+ const progress = batch.totalRecords ? `${batch.completedRecords ?? 0}/${batch.totalRecords}` : "";
64111
+ const statusColor = batch.status === "completed" ? "green" : "yellow";
64112
+ console.log(
64113
+ ` ${chalk27.green(batch.id)} ${name} ${chalk27[statusColor](`[${batch.status}]`)} ${chalk27.gray(progress)}`
64114
+ );
64115
+ }
64116
+ const total = getTotalCount(data.pagination);
64117
+ if (total !== void 0) {
64118
+ console.log(chalk27.dim(`
64119
+ Total: ${total} batches`));
64120
+ }
64131
64121
  }
64132
- };
64133
- run2();
64134
- }, []);
64135
- return React19.createElement(DataList, {
64136
- title: "Eval Batches",
64137
- items,
64138
- error: error51,
64139
- loading,
64140
- total,
64141
- emptyMessage: "No eval batches found",
64142
- renderCard: (item) => {
64143
- const b = item;
64144
- const name = b.name || b.id;
64145
- const progress = b.totalRecords ? `${b.completedRecords ?? 0}/${b.totalRecords}` : "";
64146
- const statusColor = b.status === "completed" ? "green" : "yellow";
64147
- return React19.createElement(
64148
- Text34,
64149
- { color: statusColor },
64150
- ` ${b.id} ${name} [${b.status}] ${progress}`
64151
- );
64122
+ } catch (error51) {
64123
+ const message = error51 instanceof Error ? error51.message : "Unknown error";
64124
+ console.error(chalk27.red("Failed to fetch eval batches"));
64125
+ console.error(chalk27.red(message));
64126
+ process.exit(1);
64152
64127
  }
64153
- });
64154
- };
64155
- const { waitUntilExit } = render19(React19.createElement(App));
64156
- await waitUntilExit();
64157
- });
64128
+ return;
64129
+ }
64130
+ const App = () => {
64131
+ const [loading, setLoading] = useState36(true);
64132
+ const [items, setItems] = useState36(null);
64133
+ const [total, setTotal] = useState36(void 0);
64134
+ const [error51, setError] = useState36(null);
64135
+ useEffect30(() => {
64136
+ const run2 = async () => {
64137
+ try {
64138
+ const data = await client.get("/eval/batches", params);
64139
+ setItems(data.data ?? []);
64140
+ setTotal(getTotalCount(data.pagination));
64141
+ setLoading(false);
64142
+ } catch (err) {
64143
+ setError(err instanceof Error ? err : new Error(String(err)));
64144
+ setLoading(false);
64145
+ }
64146
+ };
64147
+ run2();
64148
+ }, []);
64149
+ return React19.createElement(DataList, {
64150
+ title: "Eval Batches",
64151
+ items,
64152
+ error: error51,
64153
+ loading,
64154
+ total,
64155
+ emptyMessage: "No eval batches found",
64156
+ renderCard: (item) => {
64157
+ const b = item;
64158
+ const name = b.name || b.id;
64159
+ const progress = b.totalRecords ? `${b.completedRecords ?? 0}/${b.totalRecords}` : "";
64160
+ const statusColor = b.status === "completed" ? "green" : "yellow";
64161
+ return React19.createElement(
64162
+ Text34,
64163
+ { color: statusColor },
64164
+ ` ${b.id} ${name} [${b.status}] ${progress}`
64165
+ );
64166
+ }
64167
+ });
64168
+ };
64169
+ const { waitUntilExit } = render19(React19.createElement(App));
64170
+ await waitUntilExit();
64171
+ }
64172
+ );
64158
64173
  evalCommand.command("results <id>").description("Get eval batch results").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(async (id, options) => {
64159
64174
  const apiKey = await ensureAuth();
64160
64175
  if (!apiKey) return;
64161
64176
  const client = createCliClient(apiKey);
64162
64177
  if (!isTTY(options) || options.json) {
64163
64178
  try {
64164
- const data = await client.get(`/eval/${id}/results`);
64179
+ const data = await client.get(
64180
+ `/eval/${id}/results`
64181
+ );
64165
64182
  if (options.json) {
64166
64183
  printJson(data);
64167
64184
  } else {
64168
64185
  if (data.batch) {
64169
64186
  console.log(chalk27.cyan(`Eval: ${data.batch.name || data.batch.id}`));
64170
64187
  console.log(` Status: ${data.batch.status}`);
64171
- console.log(` Progress: ${data.batch.completedRecords ?? 0}/${data.batch.totalRecords ?? 0}`);
64188
+ console.log(
64189
+ ` Progress: ${data.batch.completedRecords ?? 0}/${data.batch.totalRecords ?? 0}`
64190
+ );
64172
64191
  console.log();
64173
64192
  }
64174
64193
  const results = data.data ?? [];
@@ -64201,19 +64220,28 @@ evalCommand.command("results <id>").description("Get eval batch results").option
64201
64220
  useEffect30(() => {
64202
64221
  const run2 = async () => {
64203
64222
  try {
64204
- const data = await client.get(`/eval/${id}/results`);
64223
+ const data = await client.get(
64224
+ `/eval/${id}/results`
64225
+ );
64205
64226
  const results = data.data ?? [];
64206
64227
  const fields = [];
64207
64228
  if (data.batch) {
64208
64229
  fields.push({ label: "Eval", value: data.batch.name || data.batch.id });
64209
64230
  fields.push({ label: "Status", value: data.batch.status });
64210
- fields.push({ label: "Progress", value: `${data.batch.completedRecords ?? 0}/${data.batch.totalRecords ?? 0}` });
64231
+ fields.push({
64232
+ label: "Progress",
64233
+ value: `${data.batch.completedRecords ?? 0}/${data.batch.totalRecords ?? 0}`
64234
+ });
64211
64235
  }
64212
64236
  fields.push({ label: "Results", value: results.length });
64213
64237
  if (results.length > 0) {
64214
64238
  const completed = results.filter((r) => r.status === "completed").length;
64215
64239
  const avgScore = results.filter((r) => r.score !== void 0).reduce((sum, r) => sum + (r.score ?? 0), 0) / (results.filter((r) => r.score !== void 0).length || 1);
64216
- fields.push({ label: "Completed", value: `${completed}/${results.length}`, color: "green" });
64240
+ fields.push({
64241
+ label: "Completed",
64242
+ value: `${completed}/${results.length}`,
64243
+ color: "green"
64244
+ });
64217
64245
  if (results.some((r) => r.score !== void 0)) {
64218
64246
  fields.push({ label: "Avg Score", value: avgScore.toFixed(2) });
64219
64247
  }
@@ -64309,6 +64337,13 @@ function printSuiteResult(rootDir, outcome) {
64309
64337
  if (testCase.errored) {
64310
64338
  console.log(chalk27.red(` \u2717 errored: ${testCase.outputExcerpt.slice(0, 200)}`));
64311
64339
  }
64340
+ } else {
64341
+ for (const outcomeItem of testCase.outcomes.filter(
64342
+ (o) => !o.passed && o.severity === "soft"
64343
+ )) {
64344
+ const reason = outcomeItem.reasoning ? `: ${outcomeItem.reasoning}` : "";
64345
+ console.log(chalk27.yellow(` \u26A0 soft ${outcomeItem.kind}${reason}`));
64346
+ }
64312
64347
  }
64313
64348
  }
64314
64349
  }
@@ -64323,7 +64358,12 @@ function toJUnitSuite(outcome) {
64323
64358
  }))
64324
64359
  };
64325
64360
  }
64326
- evalCommand.command("run [idOrDirPrefix]").description("Run code-colocated eval suites (**/*.eval.ts) as a CI gate (exit 0 pass / 1 fail / 2 config)").option("--strict", "Fail on soft-threshold misses too (no-op until severity lands)").option("--virtual", "Run inline without persisting a suite/batch to the dashboard").option("--junit <path>", "Write JUnit XML results to <path>").option("--url <api>", "Override the API base URL (e.g. staging)").option("--cwd <dir>", "Directory to discover *.eval.ts under (default: current directory)").action(
64361
+ evalCommand.command("run [idOrDirPrefix]").description(
64362
+ "Run code-colocated eval suites (**/*.eval.ts) as a CI gate (exit 0 pass / 1 fail / 2 config)"
64363
+ ).option(
64364
+ "--strict",
64365
+ "Fail the exit code on soft grader misses too (default: soft misses are reported but do not fail)"
64366
+ ).option("--virtual", "Run inline without persisting a suite/batch to the dashboard").option("--junit <path>", "Write JUnit XML results to <path>").option("--url <api>", "Override the API base URL (e.g. staging)").option("--cwd <dir>", "Directory to discover *.eval.ts under (default: current directory)").action(
64327
64367
  async (idOrDirPrefix, options) => {
64328
64368
  const apiKey = await ensureAuth();
64329
64369
  if (!apiKey) {
@@ -64361,11 +64401,6 @@ evalCommand.command("run [idOrDirPrefix]").description("Run code-colocated eval
64361
64401
  } else {
64362
64402
  loaded = await loadAll(allFiles);
64363
64403
  }
64364
- if (options.strict) {
64365
- console.log(
64366
- chalk27.gray("Note: --strict has no effect yet (grader severity lands in a later increment).")
64367
- );
64368
- }
64369
64404
  const client = createCliClient(apiKey, options.url);
64370
64405
  const outcomes = [];
64371
64406
  for (const { file: file2, def } of loaded) {
@@ -64373,16 +64408,25 @@ evalCommand.command("run [idOrDirPrefix]").description("Run code-colocated eval
64373
64408
  try {
64374
64409
  let result;
64375
64410
  if (runVirtual) {
64376
- result = await client.post("/eval/run", { definition: def });
64377
- } else {
64378
- const ensured = await client.post("/eval/ensure", {
64379
- name: def.name,
64380
- definition: def
64411
+ result = await client.post("/eval/run", {
64412
+ definition: def,
64413
+ strict: options.strict ?? false
64381
64414
  });
64415
+ } else {
64416
+ const ensured = await client.post(
64417
+ "/eval/ensure",
64418
+ {
64419
+ name: def.name,
64420
+ definition: def
64421
+ }
64422
+ );
64382
64423
  if (!ensured.suiteId) {
64383
64424
  throw new Error(`ensure did not return a suiteId (result: ${ensured.result})`);
64384
64425
  }
64385
- result = await client.post("/eval/run", { suiteId: ensured.suiteId });
64426
+ result = await client.post("/eval/run", {
64427
+ suiteId: ensured.suiteId,
64428
+ strict: options.strict ?? false
64429
+ });
64386
64430
  }
64387
64431
  outcomes.push({ file: file2, definition: def, result });
64388
64432
  printSuiteResult(rootDir, { file: file2, definition: def, result });
@@ -64400,7 +64444,9 @@ evalCommand.command("run [idOrDirPrefix]").description("Run code-colocated eval
64400
64444
  writeFileSync6(outPath, xml, "utf-8");
64401
64445
  console.log(chalk27.gray(`JUnit results written to ${options.junit}`));
64402
64446
  } catch (error51) {
64403
- failConfig(`Failed to write JUnit report: ${error51 instanceof Error ? error51.message : String(error51)}`);
64447
+ failConfig(
64448
+ `Failed to write JUnit report: ${error51 instanceof Error ? error51.message : String(error51)}`
64449
+ );
64404
64450
  }
64405
64451
  }
64406
64452
  const failedSuites = outcomes.filter((o) => !o.result.passed);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@runtypelabs/cli",
3
- "version": "2.23.1",
3
+ "version": "2.24.0",
4
4
  "description": "Command-line interface for Runtype AI platform",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -24,7 +24,7 @@
24
24
  "rosie-skills": "0.8.1",
25
25
  "yaml": "^2.9.0",
26
26
  "@runtypelabs/ink-components": "0.3.4",
27
- "@runtypelabs/sdk": "5.5.0",
27
+ "@runtypelabs/sdk": "5.6.0",
28
28
  "@runtypelabs/terminal-animations": "0.2.1"
29
29
  },
30
30
  "devDependencies": {
@@ -39,7 +39,7 @@
39
39
  "tsx": "^4.7.1",
40
40
  "typescript": "^6.0.3",
41
41
  "vitest": "^4.1.0",
42
- "@runtypelabs/shared": "1.42.5"
42
+ "@runtypelabs/shared": "1.42.6"
43
43
  },
44
44
  "engines": {
45
45
  "node": ">=22.0.0"