@runtypelabs/cli 2.23.1 → 2.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +237 -191
- package/package.json +3 -3
package/dist/index.js
CHANGED
|
@@ -37591,39 +37591,46 @@ var BUILT_IN_GRADER_IDS = [
|
|
|
37591
37591
|
"rightTone",
|
|
37592
37592
|
"safeToSend"
|
|
37593
37593
|
];
|
|
37594
|
+
var graderSeveritySchema = external_exports.enum(["gate", "soft"]);
|
|
37595
|
+
var severityFields = { severity: graderSeveritySchema.optional() };
|
|
37594
37596
|
var checkGraderSchema = external_exports.discriminatedUnion("kind", [
|
|
37595
37597
|
external_exports.object({
|
|
37596
37598
|
kind: external_exports.literal("contains"),
|
|
37597
37599
|
value: external_exports.string(),
|
|
37598
|
-
caseSensitive: external_exports.boolean().optional()
|
|
37600
|
+
caseSensitive: external_exports.boolean().optional(),
|
|
37601
|
+
...severityFields
|
|
37599
37602
|
}),
|
|
37600
37603
|
external_exports.object({
|
|
37601
37604
|
kind: external_exports.literal("not_contains"),
|
|
37602
37605
|
value: external_exports.string(),
|
|
37603
|
-
caseSensitive: external_exports.boolean().optional()
|
|
37606
|
+
caseSensitive: external_exports.boolean().optional(),
|
|
37607
|
+
...severityFields
|
|
37604
37608
|
}),
|
|
37605
37609
|
// Exact/normalized match against `case.expected.text`.
|
|
37606
|
-
external_exports.object({ kind: external_exports.literal("matches_expected") }),
|
|
37610
|
+
external_exports.object({ kind: external_exports.literal("matches_expected"), ...severityFields }),
|
|
37607
37611
|
external_exports.object({
|
|
37608
37612
|
kind: external_exports.literal("regex"),
|
|
37609
37613
|
pattern: external_exports.string(),
|
|
37610
|
-
flags: external_exports.string().optional()
|
|
37614
|
+
flags: external_exports.string().optional(),
|
|
37615
|
+
...severityFields
|
|
37611
37616
|
}),
|
|
37612
|
-
external_exports.object({ kind: external_exports.literal("valid_json") }),
|
|
37617
|
+
external_exports.object({ kind: external_exports.literal("valid_json"), ...severityFields }),
|
|
37613
37618
|
external_exports.object({
|
|
37614
37619
|
kind: external_exports.literal("json_field"),
|
|
37615
37620
|
path: external_exports.string(),
|
|
37616
37621
|
equals: external_exports.unknown().optional(),
|
|
37617
|
-
exists: external_exports.boolean().optional()
|
|
37622
|
+
exists: external_exports.boolean().optional(),
|
|
37623
|
+
...severityFields
|
|
37618
37624
|
}),
|
|
37619
37625
|
external_exports.object({
|
|
37620
37626
|
kind: external_exports.literal("length"),
|
|
37621
37627
|
minChars: external_exports.number().int().nonnegative().optional(),
|
|
37622
|
-
maxChars: external_exports.number().int().nonnegative().optional()
|
|
37628
|
+
maxChars: external_exports.number().int().nonnegative().optional(),
|
|
37629
|
+
...severityFields
|
|
37623
37630
|
}),
|
|
37624
|
-
external_exports.object({ kind: external_exports.literal("latency"), maxMs: external_exports.number().int().positive() }),
|
|
37631
|
+
external_exports.object({ kind: external_exports.literal("latency"), maxMs: external_exports.number().int().positive(), ...severityFields }),
|
|
37625
37632
|
// Today's implicit "success" made explicit: the case produced output without erroring.
|
|
37626
|
-
external_exports.object({ kind: external_exports.literal("no_error") }),
|
|
37633
|
+
external_exports.object({ kind: external_exports.literal("no_error"), ...severityFields }),
|
|
37627
37634
|
// -------------------------------------------------------------------------
|
|
37628
37635
|
// Trace checks — deterministic, free, pure assertions over the run's
|
|
37629
37636
|
// EXECUTION TRACE (which tools/steps ran, in what order, whether it
|
|
@@ -37641,25 +37648,30 @@ var checkGraderSchema = external_exports.discriminatedUnion("kind", [
|
|
|
37641
37648
|
input: external_exports.unknown().optional(),
|
|
37642
37649
|
output: external_exports.unknown().optional(),
|
|
37643
37650
|
isError: external_exports.boolean().optional(),
|
|
37644
|
-
times: external_exports.number().int().positive().optional()
|
|
37651
|
+
times: external_exports.number().int().positive().optional(),
|
|
37652
|
+
...severityFields
|
|
37645
37653
|
}),
|
|
37646
37654
|
// No tool named `name` was called.
|
|
37647
|
-
external_exports.object({ kind: external_exports.literal("not_called_tool"), name: external_exports.string().min(1) }),
|
|
37655
|
+
external_exports.object({ kind: external_exports.literal("not_called_tool"), name: external_exports.string().min(1), ...severityFields }),
|
|
37648
37656
|
// The run made no tool calls at all.
|
|
37649
|
-
external_exports.object({ kind: external_exports.literal("used_no_tools") }),
|
|
37657
|
+
external_exports.object({ kind: external_exports.literal("used_no_tools"), ...severityFields }),
|
|
37650
37658
|
// The run made at most `max` tool calls.
|
|
37651
|
-
external_exports.object({
|
|
37659
|
+
external_exports.object({
|
|
37660
|
+
kind: external_exports.literal("max_tool_calls"),
|
|
37661
|
+
max: external_exports.number().int().nonnegative(),
|
|
37662
|
+
...severityFields
|
|
37663
|
+
}),
|
|
37652
37664
|
// `tools` appears as an ordered SUBSEQUENCE of the tool-call names (other
|
|
37653
37665
|
// calls may interleave; relative order of the listed tools must hold).
|
|
37654
|
-
external_exports.object({ kind: external_exports.literal("tool_order"), tools: external_exports.array(external_exports.string()).min(1) }),
|
|
37666
|
+
external_exports.object({ kind: external_exports.literal("tool_order"), tools: external_exports.array(external_exports.string()).min(1), ...severityFields }),
|
|
37655
37667
|
// A step named (or typed) `name` ran.
|
|
37656
|
-
external_exports.object({ kind: external_exports.literal("ran_step"), name: external_exports.string().min(1) }),
|
|
37668
|
+
external_exports.object({ kind: external_exports.literal("ran_step"), name: external_exports.string().min(1), ...severityFields }),
|
|
37657
37669
|
// `steps` appears as an ordered SUBSEQUENCE of the steps that ran.
|
|
37658
|
-
external_exports.object({ kind: external_exports.literal("step_order"), steps: external_exports.array(external_exports.string()).min(1) }),
|
|
37670
|
+
external_exports.object({ kind: external_exports.literal("step_order"), steps: external_exports.array(external_exports.string()).min(1), ...severityFields }),
|
|
37659
37671
|
// The run completed (finished without erroring and was not left paused).
|
|
37660
|
-
external_exports.object({ kind: external_exports.literal("completed") }),
|
|
37672
|
+
external_exports.object({ kind: external_exports.literal("completed"), ...severityFields }),
|
|
37661
37673
|
// Total run cost was within `maxUsd` (US dollars).
|
|
37662
|
-
external_exports.object({ kind: external_exports.literal("cost"), maxUsd: external_exports.number().positive() })
|
|
37674
|
+
external_exports.object({ kind: external_exports.literal("cost"), maxUsd: external_exports.number().positive(), ...severityFields })
|
|
37663
37675
|
]);
|
|
37664
37676
|
var aiGraderSchema = external_exports.object({
|
|
37665
37677
|
kind: external_exports.literal("ai"),
|
|
@@ -37671,7 +37683,8 @@ var aiGraderSchema = external_exports.object({
|
|
|
37671
37683
|
/** Defaults to a cheap routed model (e.g. claude-haiku-4-5) at execution time. */
|
|
37672
37684
|
model: external_exports.string().optional(),
|
|
37673
37685
|
/** Pass cutoff for the 1-5 scale. */
|
|
37674
|
-
threshold: external_exports.number().min(1).max(5).optional()
|
|
37686
|
+
threshold: external_exports.number().min(1).max(5).optional(),
|
|
37687
|
+
...severityFields
|
|
37675
37688
|
});
|
|
37676
37689
|
var graderConfigSchema = external_exports.union([checkGraderSchema, aiGraderSchema]);
|
|
37677
37690
|
var gradersSchema = external_exports.array(graderConfigSchema);
|
|
@@ -63990,185 +64003,191 @@ function buildJUnitXml(suites) {
|
|
|
63990
64003
|
|
|
63991
64004
|
// src/commands/eval.ts
|
|
63992
64005
|
var evalCommand = new Command20("eval").description("Manage evaluations");
|
|
63993
|
-
evalCommand.command("submit").description("Submit an eval batch").requiredOption("-f, --flow <id>", "Flow ID to evaluate").requiredOption("-r, --records <file>", "JSON file with record IDs").option("-n, --name <name>", "Eval batch name").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(
|
|
63994
|
-
|
|
63995
|
-
|
|
63996
|
-
|
|
63997
|
-
|
|
63998
|
-
const content = readFileSync16(options.records, "utf-8");
|
|
63999
|
-
const parsed = JSON.parse(content);
|
|
64000
|
-
recordIds = Array.isArray(parsed) ? parsed : parsed.recordIds || parsed.records || [];
|
|
64001
|
-
} catch (error51) {
|
|
64002
|
-
const message = error51 instanceof Error ? error51.message : "Unknown error";
|
|
64003
|
-
console.error(chalk27.red(`Failed to read records file: ${message}`));
|
|
64004
|
-
process.exit(1);
|
|
64005
|
-
return;
|
|
64006
|
-
}
|
|
64007
|
-
const client = createCliClient(apiKey);
|
|
64008
|
-
if (!isTTY(options) || options.json) {
|
|
64006
|
+
evalCommand.command("submit").description("Submit an eval batch").requiredOption("-f, --flow <id>", "Flow ID to evaluate").requiredOption("-r, --records <file>", "JSON file with record IDs").option("-n, --name <name>", "Eval batch name").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(
|
|
64007
|
+
async (options) => {
|
|
64008
|
+
const apiKey = await ensureAuth();
|
|
64009
|
+
if (!apiKey) return;
|
|
64010
|
+
let recordIds;
|
|
64009
64011
|
try {
|
|
64010
|
-
const
|
|
64011
|
-
|
|
64012
|
-
|
|
64013
|
-
name: options.name
|
|
64014
|
-
});
|
|
64015
|
-
if (options.json) {
|
|
64016
|
-
printJson(data);
|
|
64017
|
-
} else {
|
|
64018
|
-
console.log(chalk27.green("Eval submitted"));
|
|
64019
|
-
console.log(` Batch ID: ${chalk27.green(data.id)}`);
|
|
64020
|
-
if (data.name) console.log(` Name: ${data.name}`);
|
|
64021
|
-
console.log(` Status: ${data.status}`);
|
|
64022
|
-
console.log(` Records: ${data.totalRecords}`);
|
|
64023
|
-
if (data.groupId) console.log(` Group: ${data.groupId}`);
|
|
64024
|
-
}
|
|
64012
|
+
const content = readFileSync16(options.records, "utf-8");
|
|
64013
|
+
const parsed = JSON.parse(content);
|
|
64014
|
+
recordIds = Array.isArray(parsed) ? parsed : parsed.recordIds || parsed.records || [];
|
|
64025
64015
|
} catch (error51) {
|
|
64026
64016
|
const message = error51 instanceof Error ? error51.message : "Unknown error";
|
|
64027
|
-
console.error(chalk27.red(
|
|
64028
|
-
console.error(chalk27.red(message));
|
|
64017
|
+
console.error(chalk27.red(`Failed to read records file: ${message}`));
|
|
64029
64018
|
process.exit(1);
|
|
64019
|
+
return;
|
|
64030
64020
|
}
|
|
64031
|
-
|
|
64032
|
-
|
|
64033
|
-
|
|
64034
|
-
|
|
64035
|
-
|
|
64036
|
-
|
|
64037
|
-
|
|
64038
|
-
|
|
64039
|
-
|
|
64040
|
-
|
|
64041
|
-
|
|
64042
|
-
|
|
64043
|
-
|
|
64044
|
-
|
|
64045
|
-
});
|
|
64046
|
-
|
|
64047
|
-
|
|
64048
|
-
];
|
|
64049
|
-
if (data.name) fields.push({ label: "Name", value: data.name });
|
|
64050
|
-
fields.push({ label: "Status", value: data.status });
|
|
64051
|
-
fields.push({ label: "Records", value: data.totalRecords });
|
|
64052
|
-
if (data.groupId) fields.push({ label: "Group", value: data.groupId });
|
|
64053
|
-
setResultNode(React19.createElement(EntityCard, { fields }));
|
|
64054
|
-
setSuccess(true);
|
|
64055
|
-
setLoading(false);
|
|
64056
|
-
} catch (err) {
|
|
64057
|
-
setError(err instanceof Error ? err : new Error(String(err)));
|
|
64058
|
-
setSuccess(false);
|
|
64059
|
-
setLoading(false);
|
|
64060
|
-
}
|
|
64061
|
-
};
|
|
64062
|
-
run2();
|
|
64063
|
-
}, []);
|
|
64064
|
-
return React19.createElement(MutationResult, {
|
|
64065
|
-
loading,
|
|
64066
|
-
loadingLabel: `Submitting eval with ${recordIds.length} records...`,
|
|
64067
|
-
success: success2,
|
|
64068
|
-
successMessage: "Eval submitted",
|
|
64069
|
-
error: error51,
|
|
64070
|
-
result: resultNode
|
|
64071
|
-
});
|
|
64072
|
-
};
|
|
64073
|
-
const { waitUntilExit } = render19(React19.createElement(App));
|
|
64074
|
-
await waitUntilExit();
|
|
64075
|
-
});
|
|
64076
|
-
evalCommand.command("list").description("List eval batches").option("--flow <id>", "Filter by flow ID").option("--limit <n>", "Limit results", "20").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(async (options) => {
|
|
64077
|
-
const apiKey = await ensureAuth();
|
|
64078
|
-
if (!apiKey) return;
|
|
64079
|
-
const client = createCliClient(apiKey);
|
|
64080
|
-
const params = { limit: options.limit };
|
|
64081
|
-
if (options.flow) params.flowId = options.flow;
|
|
64082
|
-
if (!isTTY(options) || options.json) {
|
|
64083
|
-
try {
|
|
64084
|
-
const data = await client.get("/eval/batches", params);
|
|
64085
|
-
if (options.json) {
|
|
64086
|
-
printJson(data);
|
|
64087
|
-
} else {
|
|
64088
|
-
const batches = data.data ?? [];
|
|
64089
|
-
if (batches.length === 0) {
|
|
64090
|
-
console.log(chalk27.gray("No eval batches found"));
|
|
64091
|
-
return;
|
|
64092
|
-
}
|
|
64093
|
-
console.log(chalk27.cyan("Eval Batches:"));
|
|
64094
|
-
for (const batch of batches) {
|
|
64095
|
-
const name = batch.name || batch.id;
|
|
64096
|
-
const progress = batch.totalRecords ? `${batch.completedRecords ?? 0}/${batch.totalRecords}` : "";
|
|
64097
|
-
const statusColor = batch.status === "completed" ? "green" : "yellow";
|
|
64098
|
-
console.log(
|
|
64099
|
-
` ${chalk27.green(batch.id)} ${name} ${chalk27[statusColor](`[${batch.status}]`)} ${chalk27.gray(progress)}`
|
|
64100
|
-
);
|
|
64101
|
-
}
|
|
64102
|
-
const total = getTotalCount(data.pagination);
|
|
64103
|
-
if (total !== void 0) {
|
|
64104
|
-
console.log(chalk27.dim(`
|
|
64105
|
-
Total: ${total} batches`));
|
|
64021
|
+
const client = createCliClient(apiKey);
|
|
64022
|
+
if (!isTTY(options) || options.json) {
|
|
64023
|
+
try {
|
|
64024
|
+
const data = await client.post("/eval/submit", {
|
|
64025
|
+
flowId: options.flow,
|
|
64026
|
+
recordIds,
|
|
64027
|
+
name: options.name
|
|
64028
|
+
});
|
|
64029
|
+
if (options.json) {
|
|
64030
|
+
printJson(data);
|
|
64031
|
+
} else {
|
|
64032
|
+
console.log(chalk27.green("Eval submitted"));
|
|
64033
|
+
console.log(` Batch ID: ${chalk27.green(data.id)}`);
|
|
64034
|
+
if (data.name) console.log(` Name: ${data.name}`);
|
|
64035
|
+
console.log(` Status: ${data.status}`);
|
|
64036
|
+
console.log(` Records: ${data.totalRecords}`);
|
|
64037
|
+
if (data.groupId) console.log(` Group: ${data.groupId}`);
|
|
64106
64038
|
}
|
|
64039
|
+
} catch (error51) {
|
|
64040
|
+
const message = error51 instanceof Error ? error51.message : "Unknown error";
|
|
64041
|
+
console.error(chalk27.red("Failed to submit eval"));
|
|
64042
|
+
console.error(chalk27.red(message));
|
|
64043
|
+
process.exit(1);
|
|
64107
64044
|
}
|
|
64108
|
-
|
|
64109
|
-
const message = error51 instanceof Error ? error51.message : "Unknown error";
|
|
64110
|
-
console.error(chalk27.red("Failed to fetch eval batches"));
|
|
64111
|
-
console.error(chalk27.red(message));
|
|
64112
|
-
process.exit(1);
|
|
64045
|
+
return;
|
|
64113
64046
|
}
|
|
64114
|
-
|
|
64047
|
+
const App = () => {
|
|
64048
|
+
const [loading, setLoading] = useState36(true);
|
|
64049
|
+
const [success2, setSuccess] = useState36(null);
|
|
64050
|
+
const [error51, setError] = useState36(null);
|
|
64051
|
+
const [resultNode, setResultNode] = useState36(void 0);
|
|
64052
|
+
useEffect30(() => {
|
|
64053
|
+
const run2 = async () => {
|
|
64054
|
+
try {
|
|
64055
|
+
const data = await client.post("/eval/submit", {
|
|
64056
|
+
flowId: options.flow,
|
|
64057
|
+
recordIds,
|
|
64058
|
+
name: options.name
|
|
64059
|
+
});
|
|
64060
|
+
const fields = [{ label: "Batch ID", value: data.id, color: "green" }];
|
|
64061
|
+
if (data.name) fields.push({ label: "Name", value: data.name });
|
|
64062
|
+
fields.push({ label: "Status", value: data.status });
|
|
64063
|
+
fields.push({ label: "Records", value: data.totalRecords });
|
|
64064
|
+
if (data.groupId) fields.push({ label: "Group", value: data.groupId });
|
|
64065
|
+
setResultNode(React19.createElement(EntityCard, { fields }));
|
|
64066
|
+
setSuccess(true);
|
|
64067
|
+
setLoading(false);
|
|
64068
|
+
} catch (err) {
|
|
64069
|
+
setError(err instanceof Error ? err : new Error(String(err)));
|
|
64070
|
+
setSuccess(false);
|
|
64071
|
+
setLoading(false);
|
|
64072
|
+
}
|
|
64073
|
+
};
|
|
64074
|
+
run2();
|
|
64075
|
+
}, []);
|
|
64076
|
+
return React19.createElement(MutationResult, {
|
|
64077
|
+
loading,
|
|
64078
|
+
loadingLabel: `Submitting eval with ${recordIds.length} records...`,
|
|
64079
|
+
success: success2,
|
|
64080
|
+
successMessage: "Eval submitted",
|
|
64081
|
+
error: error51,
|
|
64082
|
+
result: resultNode
|
|
64083
|
+
});
|
|
64084
|
+
};
|
|
64085
|
+
const { waitUntilExit } = render19(React19.createElement(App));
|
|
64086
|
+
await waitUntilExit();
|
|
64115
64087
|
}
|
|
64116
|
-
|
|
64117
|
-
|
|
64118
|
-
|
|
64119
|
-
const
|
|
64120
|
-
|
|
64121
|
-
|
|
64122
|
-
|
|
64123
|
-
|
|
64124
|
-
|
|
64125
|
-
|
|
64126
|
-
|
|
64127
|
-
|
|
64128
|
-
|
|
64129
|
-
|
|
64130
|
-
|
|
64088
|
+
);
|
|
64089
|
+
evalCommand.command("list").description("List eval batches").option("--flow <id>", "Filter by flow ID").option("--limit <n>", "Limit results", "20").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(
|
|
64090
|
+
async (options) => {
|
|
64091
|
+
const apiKey = await ensureAuth();
|
|
64092
|
+
if (!apiKey) return;
|
|
64093
|
+
const client = createCliClient(apiKey);
|
|
64094
|
+
const params = { limit: options.limit };
|
|
64095
|
+
if (options.flow) params.flowId = options.flow;
|
|
64096
|
+
if (!isTTY(options) || options.json) {
|
|
64097
|
+
try {
|
|
64098
|
+
const data = await client.get("/eval/batches", params);
|
|
64099
|
+
if (options.json) {
|
|
64100
|
+
printJson(data);
|
|
64101
|
+
} else {
|
|
64102
|
+
const batches = data.data ?? [];
|
|
64103
|
+
if (batches.length === 0) {
|
|
64104
|
+
console.log(chalk27.gray("No eval batches found"));
|
|
64105
|
+
return;
|
|
64106
|
+
}
|
|
64107
|
+
console.log(chalk27.cyan("Eval Batches:"));
|
|
64108
|
+
for (const batch of batches) {
|
|
64109
|
+
const name = batch.name || batch.id;
|
|
64110
|
+
const progress = batch.totalRecords ? `${batch.completedRecords ?? 0}/${batch.totalRecords}` : "";
|
|
64111
|
+
const statusColor = batch.status === "completed" ? "green" : "yellow";
|
|
64112
|
+
console.log(
|
|
64113
|
+
` ${chalk27.green(batch.id)} ${name} ${chalk27[statusColor](`[${batch.status}]`)} ${chalk27.gray(progress)}`
|
|
64114
|
+
);
|
|
64115
|
+
}
|
|
64116
|
+
const total = getTotalCount(data.pagination);
|
|
64117
|
+
if (total !== void 0) {
|
|
64118
|
+
console.log(chalk27.dim(`
|
|
64119
|
+
Total: ${total} batches`));
|
|
64120
|
+
}
|
|
64131
64121
|
}
|
|
64132
|
-
}
|
|
64133
|
-
|
|
64134
|
-
|
|
64135
|
-
|
|
64136
|
-
|
|
64137
|
-
items,
|
|
64138
|
-
error: error51,
|
|
64139
|
-
loading,
|
|
64140
|
-
total,
|
|
64141
|
-
emptyMessage: "No eval batches found",
|
|
64142
|
-
renderCard: (item) => {
|
|
64143
|
-
const b = item;
|
|
64144
|
-
const name = b.name || b.id;
|
|
64145
|
-
const progress = b.totalRecords ? `${b.completedRecords ?? 0}/${b.totalRecords}` : "";
|
|
64146
|
-
const statusColor = b.status === "completed" ? "green" : "yellow";
|
|
64147
|
-
return React19.createElement(
|
|
64148
|
-
Text34,
|
|
64149
|
-
{ color: statusColor },
|
|
64150
|
-
` ${b.id} ${name} [${b.status}] ${progress}`
|
|
64151
|
-
);
|
|
64122
|
+
} catch (error51) {
|
|
64123
|
+
const message = error51 instanceof Error ? error51.message : "Unknown error";
|
|
64124
|
+
console.error(chalk27.red("Failed to fetch eval batches"));
|
|
64125
|
+
console.error(chalk27.red(message));
|
|
64126
|
+
process.exit(1);
|
|
64152
64127
|
}
|
|
64153
|
-
|
|
64154
|
-
|
|
64155
|
-
|
|
64156
|
-
|
|
64157
|
-
|
|
64128
|
+
return;
|
|
64129
|
+
}
|
|
64130
|
+
const App = () => {
|
|
64131
|
+
const [loading, setLoading] = useState36(true);
|
|
64132
|
+
const [items, setItems] = useState36(null);
|
|
64133
|
+
const [total, setTotal] = useState36(void 0);
|
|
64134
|
+
const [error51, setError] = useState36(null);
|
|
64135
|
+
useEffect30(() => {
|
|
64136
|
+
const run2 = async () => {
|
|
64137
|
+
try {
|
|
64138
|
+
const data = await client.get("/eval/batches", params);
|
|
64139
|
+
setItems(data.data ?? []);
|
|
64140
|
+
setTotal(getTotalCount(data.pagination));
|
|
64141
|
+
setLoading(false);
|
|
64142
|
+
} catch (err) {
|
|
64143
|
+
setError(err instanceof Error ? err : new Error(String(err)));
|
|
64144
|
+
setLoading(false);
|
|
64145
|
+
}
|
|
64146
|
+
};
|
|
64147
|
+
run2();
|
|
64148
|
+
}, []);
|
|
64149
|
+
return React19.createElement(DataList, {
|
|
64150
|
+
title: "Eval Batches",
|
|
64151
|
+
items,
|
|
64152
|
+
error: error51,
|
|
64153
|
+
loading,
|
|
64154
|
+
total,
|
|
64155
|
+
emptyMessage: "No eval batches found",
|
|
64156
|
+
renderCard: (item) => {
|
|
64157
|
+
const b = item;
|
|
64158
|
+
const name = b.name || b.id;
|
|
64159
|
+
const progress = b.totalRecords ? `${b.completedRecords ?? 0}/${b.totalRecords}` : "";
|
|
64160
|
+
const statusColor = b.status === "completed" ? "green" : "yellow";
|
|
64161
|
+
return React19.createElement(
|
|
64162
|
+
Text34,
|
|
64163
|
+
{ color: statusColor },
|
|
64164
|
+
` ${b.id} ${name} [${b.status}] ${progress}`
|
|
64165
|
+
);
|
|
64166
|
+
}
|
|
64167
|
+
});
|
|
64168
|
+
};
|
|
64169
|
+
const { waitUntilExit } = render19(React19.createElement(App));
|
|
64170
|
+
await waitUntilExit();
|
|
64171
|
+
}
|
|
64172
|
+
);
|
|
64158
64173
|
evalCommand.command("results <id>").description("Get eval batch results").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(async (id, options) => {
|
|
64159
64174
|
const apiKey = await ensureAuth();
|
|
64160
64175
|
if (!apiKey) return;
|
|
64161
64176
|
const client = createCliClient(apiKey);
|
|
64162
64177
|
if (!isTTY(options) || options.json) {
|
|
64163
64178
|
try {
|
|
64164
|
-
const data = await client.get(
|
|
64179
|
+
const data = await client.get(
|
|
64180
|
+
`/eval/${id}/results`
|
|
64181
|
+
);
|
|
64165
64182
|
if (options.json) {
|
|
64166
64183
|
printJson(data);
|
|
64167
64184
|
} else {
|
|
64168
64185
|
if (data.batch) {
|
|
64169
64186
|
console.log(chalk27.cyan(`Eval: ${data.batch.name || data.batch.id}`));
|
|
64170
64187
|
console.log(` Status: ${data.batch.status}`);
|
|
64171
|
-
console.log(
|
|
64188
|
+
console.log(
|
|
64189
|
+
` Progress: ${data.batch.completedRecords ?? 0}/${data.batch.totalRecords ?? 0}`
|
|
64190
|
+
);
|
|
64172
64191
|
console.log();
|
|
64173
64192
|
}
|
|
64174
64193
|
const results = data.data ?? [];
|
|
@@ -64201,19 +64220,28 @@ evalCommand.command("results <id>").description("Get eval batch results").option
|
|
|
64201
64220
|
useEffect30(() => {
|
|
64202
64221
|
const run2 = async () => {
|
|
64203
64222
|
try {
|
|
64204
|
-
const data = await client.get(
|
|
64223
|
+
const data = await client.get(
|
|
64224
|
+
`/eval/${id}/results`
|
|
64225
|
+
);
|
|
64205
64226
|
const results = data.data ?? [];
|
|
64206
64227
|
const fields = [];
|
|
64207
64228
|
if (data.batch) {
|
|
64208
64229
|
fields.push({ label: "Eval", value: data.batch.name || data.batch.id });
|
|
64209
64230
|
fields.push({ label: "Status", value: data.batch.status });
|
|
64210
|
-
fields.push({
|
|
64231
|
+
fields.push({
|
|
64232
|
+
label: "Progress",
|
|
64233
|
+
value: `${data.batch.completedRecords ?? 0}/${data.batch.totalRecords ?? 0}`
|
|
64234
|
+
});
|
|
64211
64235
|
}
|
|
64212
64236
|
fields.push({ label: "Results", value: results.length });
|
|
64213
64237
|
if (results.length > 0) {
|
|
64214
64238
|
const completed = results.filter((r) => r.status === "completed").length;
|
|
64215
64239
|
const avgScore = results.filter((r) => r.score !== void 0).reduce((sum, r) => sum + (r.score ?? 0), 0) / (results.filter((r) => r.score !== void 0).length || 1);
|
|
64216
|
-
fields.push({
|
|
64240
|
+
fields.push({
|
|
64241
|
+
label: "Completed",
|
|
64242
|
+
value: `${completed}/${results.length}`,
|
|
64243
|
+
color: "green"
|
|
64244
|
+
});
|
|
64217
64245
|
if (results.some((r) => r.score !== void 0)) {
|
|
64218
64246
|
fields.push({ label: "Avg Score", value: avgScore.toFixed(2) });
|
|
64219
64247
|
}
|
|
@@ -64309,6 +64337,13 @@ function printSuiteResult(rootDir, outcome) {
|
|
|
64309
64337
|
if (testCase.errored) {
|
|
64310
64338
|
console.log(chalk27.red(` \u2717 errored: ${testCase.outputExcerpt.slice(0, 200)}`));
|
|
64311
64339
|
}
|
|
64340
|
+
} else {
|
|
64341
|
+
for (const outcomeItem of testCase.outcomes.filter(
|
|
64342
|
+
(o) => !o.passed && o.severity === "soft"
|
|
64343
|
+
)) {
|
|
64344
|
+
const reason = outcomeItem.reasoning ? `: ${outcomeItem.reasoning}` : "";
|
|
64345
|
+
console.log(chalk27.yellow(` \u26A0 soft ${outcomeItem.kind}${reason}`));
|
|
64346
|
+
}
|
|
64312
64347
|
}
|
|
64313
64348
|
}
|
|
64314
64349
|
}
|
|
@@ -64323,7 +64358,12 @@ function toJUnitSuite(outcome) {
|
|
|
64323
64358
|
}))
|
|
64324
64359
|
};
|
|
64325
64360
|
}
|
|
64326
|
-
evalCommand.command("run [idOrDirPrefix]").description(
|
|
64361
|
+
evalCommand.command("run [idOrDirPrefix]").description(
|
|
64362
|
+
"Run code-colocated eval suites (**/*.eval.ts) as a CI gate (exit 0 pass / 1 fail / 2 config)"
|
|
64363
|
+
).option(
|
|
64364
|
+
"--strict",
|
|
64365
|
+
"Fail the exit code on soft grader misses too (default: soft misses are reported but do not fail)"
|
|
64366
|
+
).option("--virtual", "Run inline without persisting a suite/batch to the dashboard").option("--junit <path>", "Write JUnit XML results to <path>").option("--url <api>", "Override the API base URL (e.g. staging)").option("--cwd <dir>", "Directory to discover *.eval.ts under (default: current directory)").action(
|
|
64327
64367
|
async (idOrDirPrefix, options) => {
|
|
64328
64368
|
const apiKey = await ensureAuth();
|
|
64329
64369
|
if (!apiKey) {
|
|
@@ -64361,11 +64401,6 @@ evalCommand.command("run [idOrDirPrefix]").description("Run code-colocated eval
|
|
|
64361
64401
|
} else {
|
|
64362
64402
|
loaded = await loadAll(allFiles);
|
|
64363
64403
|
}
|
|
64364
|
-
if (options.strict) {
|
|
64365
|
-
console.log(
|
|
64366
|
-
chalk27.gray("Note: --strict has no effect yet (grader severity lands in a later increment).")
|
|
64367
|
-
);
|
|
64368
|
-
}
|
|
64369
64404
|
const client = createCliClient(apiKey, options.url);
|
|
64370
64405
|
const outcomes = [];
|
|
64371
64406
|
for (const { file: file2, def } of loaded) {
|
|
@@ -64373,16 +64408,25 @@ evalCommand.command("run [idOrDirPrefix]").description("Run code-colocated eval
|
|
|
64373
64408
|
try {
|
|
64374
64409
|
let result;
|
|
64375
64410
|
if (runVirtual) {
|
|
64376
|
-
result = await client.post("/eval/run", {
|
|
64377
|
-
|
|
64378
|
-
|
|
64379
|
-
name: def.name,
|
|
64380
|
-
definition: def
|
|
64411
|
+
result = await client.post("/eval/run", {
|
|
64412
|
+
definition: def,
|
|
64413
|
+
strict: options.strict ?? false
|
|
64381
64414
|
});
|
|
64415
|
+
} else {
|
|
64416
|
+
const ensured = await client.post(
|
|
64417
|
+
"/eval/ensure",
|
|
64418
|
+
{
|
|
64419
|
+
name: def.name,
|
|
64420
|
+
definition: def
|
|
64421
|
+
}
|
|
64422
|
+
);
|
|
64382
64423
|
if (!ensured.suiteId) {
|
|
64383
64424
|
throw new Error(`ensure did not return a suiteId (result: ${ensured.result})`);
|
|
64384
64425
|
}
|
|
64385
|
-
result = await client.post("/eval/run", {
|
|
64426
|
+
result = await client.post("/eval/run", {
|
|
64427
|
+
suiteId: ensured.suiteId,
|
|
64428
|
+
strict: options.strict ?? false
|
|
64429
|
+
});
|
|
64386
64430
|
}
|
|
64387
64431
|
outcomes.push({ file: file2, definition: def, result });
|
|
64388
64432
|
printSuiteResult(rootDir, { file: file2, definition: def, result });
|
|
@@ -64400,7 +64444,9 @@ evalCommand.command("run [idOrDirPrefix]").description("Run code-colocated eval
|
|
|
64400
64444
|
writeFileSync6(outPath, xml, "utf-8");
|
|
64401
64445
|
console.log(chalk27.gray(`JUnit results written to ${options.junit}`));
|
|
64402
64446
|
} catch (error51) {
|
|
64403
|
-
failConfig(
|
|
64447
|
+
failConfig(
|
|
64448
|
+
`Failed to write JUnit report: ${error51 instanceof Error ? error51.message : String(error51)}`
|
|
64449
|
+
);
|
|
64404
64450
|
}
|
|
64405
64451
|
}
|
|
64406
64452
|
const failedSuites = outcomes.filter((o) => !o.result.passed);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@runtypelabs/cli",
|
|
3
|
-
"version": "2.
|
|
3
|
+
"version": "2.24.0",
|
|
4
4
|
"description": "Command-line interface for Runtype AI platform",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -24,7 +24,7 @@
|
|
|
24
24
|
"rosie-skills": "0.8.1",
|
|
25
25
|
"yaml": "^2.9.0",
|
|
26
26
|
"@runtypelabs/ink-components": "0.3.4",
|
|
27
|
-
"@runtypelabs/sdk": "5.
|
|
27
|
+
"@runtypelabs/sdk": "5.6.0",
|
|
28
28
|
"@runtypelabs/terminal-animations": "0.2.1"
|
|
29
29
|
},
|
|
30
30
|
"devDependencies": {
|
|
@@ -39,7 +39,7 @@
|
|
|
39
39
|
"tsx": "^4.7.1",
|
|
40
40
|
"typescript": "^6.0.3",
|
|
41
41
|
"vitest": "^4.1.0",
|
|
42
|
-
"@runtypelabs/shared": "1.42.
|
|
42
|
+
"@runtypelabs/shared": "1.42.6"
|
|
43
43
|
},
|
|
44
44
|
"engines": {
|
|
45
45
|
"node": ">=22.0.0"
|