@runtypelabs/cli 2.23.0 → 2.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +303 -185
- package/package.json +3 -3
package/dist/index.js
CHANGED
|
@@ -20404,6 +20404,41 @@ function validateUpsertRecordSourceShape(flowSteps, buckets) {
|
|
|
20404
20404
|
}
|
|
20405
20405
|
}
|
|
20406
20406
|
}
|
|
20407
|
+
function validateStoreVectorSource(flowSteps, buckets, declaredFlowInputs) {
|
|
20408
|
+
const declaredVariables = collectDeclaredFlowInputs(flowSteps, declaredFlowInputs);
|
|
20409
|
+
for (const [stepIndex, step] of flowSteps.entries()) {
|
|
20410
|
+
if (step.enabled === false) continue;
|
|
20411
|
+
if (!isObjectRecord(step.config)) continue;
|
|
20412
|
+
const config3 = step.config;
|
|
20413
|
+
if (step.type === "store-vector") {
|
|
20414
|
+
const rawSource = typeof config3.vectorsSource === "string" ? config3.vectorsSource.trim() : "";
|
|
20415
|
+
if (rawSource) {
|
|
20416
|
+
const templateMatch = rawSource.match(/^\s*\{\{\s*([^}]+?)\s*\}\}\s*$/);
|
|
20417
|
+
const reference = (templateMatch?.[1] ?? rawSource).trim();
|
|
20418
|
+
const classification = classifyVariableReference(reference);
|
|
20419
|
+
if (classification.namespace === "plain" || classification.namespace === "flow") {
|
|
20420
|
+
const baseName = classification.baseName;
|
|
20421
|
+
const rootVariable = baseName.split(".")[0] || "";
|
|
20422
|
+
if (rootVariable && !declaredVariables.has(rootVariable) && !declaredVariables.has(baseName)) {
|
|
20423
|
+
addIssue(
|
|
20424
|
+
"warning",
|
|
20425
|
+
{
|
|
20426
|
+
code: "STORE_VECTOR_SOURCE_UNRESOLVED",
|
|
20427
|
+
message: `Vectors source "${rawSource}" references variable "${rootVariable}", but no earlier step declares an output variable "${rootVariable}" (and it is not a flow input). This will fail at runtime with "Could not resolve vectors". Set a prior step's outputVariable to "${rootVariable}" (typically a generate-embedding step), or declare "${rootVariable}" as a flow input.`,
|
|
20428
|
+
path: `flowSteps[${stepIndex}].config.vectorsSource`,
|
|
20429
|
+
step: { index: stepIndex, name: step.name, type: step.type },
|
|
20430
|
+
details: { vectorsSource: rawSource, rootVariable }
|
|
20431
|
+
},
|
|
20432
|
+
buckets
|
|
20433
|
+
);
|
|
20434
|
+
}
|
|
20435
|
+
}
|
|
20436
|
+
}
|
|
20437
|
+
}
|
|
20438
|
+
const outputVar = getStepOutputVariable(step);
|
|
20439
|
+
if (outputVar) declaredVariables.add(outputVar);
|
|
20440
|
+
}
|
|
20441
|
+
}
|
|
20407
20442
|
function checkConditionExpression(expr, path19, stepRef, buckets) {
|
|
20408
20443
|
if (typeof expr !== "string" || !expr.includes("{{")) return;
|
|
20409
20444
|
const match = UNQUOTED_TEMPLATE_BEFORE_OP.exec(expr) || UNQUOTED_TEMPLATE_AFTER_OP.exec(expr);
|
|
@@ -21061,6 +21096,7 @@ function collectFlowStructureIssues(flowData, deps, buckets) {
|
|
|
21061
21096
|
deps.declaredFlowInputs
|
|
21062
21097
|
);
|
|
21063
21098
|
validateUpsertRecordSourceShape(flowData.flowSteps, buckets);
|
|
21099
|
+
validateStoreVectorSource(flowData.flowSteps, buckets, deps.declaredFlowInputs);
|
|
21064
21100
|
validateConditionExpressions(flowData.flowSteps, buckets, conditionalStepsExceedingDepth);
|
|
21065
21101
|
return { pendingChecks };
|
|
21066
21102
|
}
|
|
@@ -37555,39 +37591,87 @@ var BUILT_IN_GRADER_IDS = [
|
|
|
37555
37591
|
"rightTone",
|
|
37556
37592
|
"safeToSend"
|
|
37557
37593
|
];
|
|
37594
|
+
var graderSeveritySchema = external_exports.enum(["gate", "soft"]);
|
|
37595
|
+
var severityFields = { severity: graderSeveritySchema.optional() };
|
|
37558
37596
|
var checkGraderSchema = external_exports.discriminatedUnion("kind", [
|
|
37559
37597
|
external_exports.object({
|
|
37560
37598
|
kind: external_exports.literal("contains"),
|
|
37561
37599
|
value: external_exports.string(),
|
|
37562
|
-
caseSensitive: external_exports.boolean().optional()
|
|
37600
|
+
caseSensitive: external_exports.boolean().optional(),
|
|
37601
|
+
...severityFields
|
|
37563
37602
|
}),
|
|
37564
37603
|
external_exports.object({
|
|
37565
37604
|
kind: external_exports.literal("not_contains"),
|
|
37566
37605
|
value: external_exports.string(),
|
|
37567
|
-
caseSensitive: external_exports.boolean().optional()
|
|
37606
|
+
caseSensitive: external_exports.boolean().optional(),
|
|
37607
|
+
...severityFields
|
|
37568
37608
|
}),
|
|
37569
37609
|
// Exact/normalized match against `case.expected.text`.
|
|
37570
|
-
external_exports.object({ kind: external_exports.literal("matches_expected") }),
|
|
37610
|
+
external_exports.object({ kind: external_exports.literal("matches_expected"), ...severityFields }),
|
|
37571
37611
|
external_exports.object({
|
|
37572
37612
|
kind: external_exports.literal("regex"),
|
|
37573
37613
|
pattern: external_exports.string(),
|
|
37574
|
-
flags: external_exports.string().optional()
|
|
37614
|
+
flags: external_exports.string().optional(),
|
|
37615
|
+
...severityFields
|
|
37575
37616
|
}),
|
|
37576
|
-
external_exports.object({ kind: external_exports.literal("valid_json") }),
|
|
37617
|
+
external_exports.object({ kind: external_exports.literal("valid_json"), ...severityFields }),
|
|
37577
37618
|
external_exports.object({
|
|
37578
37619
|
kind: external_exports.literal("json_field"),
|
|
37579
37620
|
path: external_exports.string(),
|
|
37580
37621
|
equals: external_exports.unknown().optional(),
|
|
37581
|
-
exists: external_exports.boolean().optional()
|
|
37622
|
+
exists: external_exports.boolean().optional(),
|
|
37623
|
+
...severityFields
|
|
37582
37624
|
}),
|
|
37583
37625
|
external_exports.object({
|
|
37584
37626
|
kind: external_exports.literal("length"),
|
|
37585
37627
|
minChars: external_exports.number().int().nonnegative().optional(),
|
|
37586
|
-
maxChars: external_exports.number().int().nonnegative().optional()
|
|
37628
|
+
maxChars: external_exports.number().int().nonnegative().optional(),
|
|
37629
|
+
...severityFields
|
|
37587
37630
|
}),
|
|
37588
|
-
external_exports.object({ kind: external_exports.literal("latency"), maxMs: external_exports.number().int().positive() }),
|
|
37631
|
+
external_exports.object({ kind: external_exports.literal("latency"), maxMs: external_exports.number().int().positive(), ...severityFields }),
|
|
37589
37632
|
// Today's implicit "success" made explicit: the case produced output without erroring.
|
|
37590
|
-
external_exports.object({ kind: external_exports.literal("no_error") })
|
|
37633
|
+
external_exports.object({ kind: external_exports.literal("no_error"), ...severityFields }),
|
|
37634
|
+
// -------------------------------------------------------------------------
|
|
37635
|
+
// Trace checks — deterministic, free, pure assertions over the run's
|
|
37636
|
+
// EXECUTION TRACE (which tools/steps ran, in what order, whether it
|
|
37637
|
+
// completed, what it cost) rather than its final output text. Scored by the
|
|
37638
|
+
// same pure `runCheck` engine against `GradingTarget.trace`. These are the
|
|
37639
|
+
// assertions a string/JSON check can't express (planning doc §3.1).
|
|
37640
|
+
// -------------------------------------------------------------------------
|
|
37641
|
+
// At least one tool call named `name` happened. Optional filters narrow the
|
|
37642
|
+
// match: `input`/`output` deep-equal a call's resolved input/result,
|
|
37643
|
+
// `isError` matches a call's error flag, and `times` asserts the matching
|
|
37644
|
+
// count EXACTLY (omit `times` for "at least once").
|
|
37645
|
+
external_exports.object({
|
|
37646
|
+
kind: external_exports.literal("called_tool"),
|
|
37647
|
+
name: external_exports.string().min(1),
|
|
37648
|
+
input: external_exports.unknown().optional(),
|
|
37649
|
+
output: external_exports.unknown().optional(),
|
|
37650
|
+
isError: external_exports.boolean().optional(),
|
|
37651
|
+
times: external_exports.number().int().positive().optional(),
|
|
37652
|
+
...severityFields
|
|
37653
|
+
}),
|
|
37654
|
+
// No tool named `name` was called.
|
|
37655
|
+
external_exports.object({ kind: external_exports.literal("not_called_tool"), name: external_exports.string().min(1), ...severityFields }),
|
|
37656
|
+
// The run made no tool calls at all.
|
|
37657
|
+
external_exports.object({ kind: external_exports.literal("used_no_tools"), ...severityFields }),
|
|
37658
|
+
// The run made at most `max` tool calls.
|
|
37659
|
+
external_exports.object({
|
|
37660
|
+
kind: external_exports.literal("max_tool_calls"),
|
|
37661
|
+
max: external_exports.number().int().nonnegative(),
|
|
37662
|
+
...severityFields
|
|
37663
|
+
}),
|
|
37664
|
+
// `tools` appears as an ordered SUBSEQUENCE of the tool-call names (other
|
|
37665
|
+
// calls may interleave; relative order of the listed tools must hold).
|
|
37666
|
+
external_exports.object({ kind: external_exports.literal("tool_order"), tools: external_exports.array(external_exports.string()).min(1), ...severityFields }),
|
|
37667
|
+
// A step named (or typed) `name` ran.
|
|
37668
|
+
external_exports.object({ kind: external_exports.literal("ran_step"), name: external_exports.string().min(1), ...severityFields }),
|
|
37669
|
+
// `steps` appears as an ordered SUBSEQUENCE of the steps that ran.
|
|
37670
|
+
external_exports.object({ kind: external_exports.literal("step_order"), steps: external_exports.array(external_exports.string()).min(1), ...severityFields }),
|
|
37671
|
+
// The run completed (finished without erroring and was not left paused).
|
|
37672
|
+
external_exports.object({ kind: external_exports.literal("completed"), ...severityFields }),
|
|
37673
|
+
// Total run cost was within `maxUsd` (US dollars).
|
|
37674
|
+
external_exports.object({ kind: external_exports.literal("cost"), maxUsd: external_exports.number().positive(), ...severityFields })
|
|
37591
37675
|
]);
|
|
37592
37676
|
var aiGraderSchema = external_exports.object({
|
|
37593
37677
|
kind: external_exports.literal("ai"),
|
|
@@ -37599,7 +37683,8 @@ var aiGraderSchema = external_exports.object({
|
|
|
37599
37683
|
/** Defaults to a cheap routed model (e.g. claude-haiku-4-5) at execution time. */
|
|
37600
37684
|
model: external_exports.string().optional(),
|
|
37601
37685
|
/** Pass cutoff for the 1-5 scale. */
|
|
37602
|
-
threshold: external_exports.number().min(1).max(5).optional()
|
|
37686
|
+
threshold: external_exports.number().min(1).max(5).optional(),
|
|
37687
|
+
...severityFields
|
|
37603
37688
|
});
|
|
37604
37689
|
var graderConfigSchema = external_exports.union([checkGraderSchema, aiGraderSchema]);
|
|
37605
37690
|
var gradersSchema = external_exports.array(graderConfigSchema);
|
|
@@ -42043,7 +42128,7 @@ var FLOW_STEP_TYPE_METADATA = {
|
|
|
42043
42128
|
configHints: "provider, query, maxResults, outputVariable"
|
|
42044
42129
|
},
|
|
42045
42130
|
"generate-embedding": {
|
|
42046
|
-
description: "Create a vector embedding from text using an embedding model.",
|
|
42131
|
+
description: "Create a vector embedding from text using an embedding model. Writes { embedding, model, dimensions, textLength, metadata } to outputVariable; feed that variable into a store-vector step via vectorsSource.",
|
|
42047
42132
|
category: "vector",
|
|
42048
42133
|
isPrompt: false,
|
|
42049
42134
|
configHints: "inputSource, text, embeddingModel, outputVariable"
|
|
@@ -42055,10 +42140,10 @@ var FLOW_STEP_TYPE_METADATA = {
|
|
|
42055
42140
|
configHints: "query, limit, threshold, outputVariable"
|
|
42056
42141
|
},
|
|
42057
42142
|
"store-vector": {
|
|
42058
|
-
description: "Store vector embeddings in a vector database.",
|
|
42143
|
+
description: "Store vector embeddings in a vector database (pgvector, Weaviate, or Vectorize). vectorsSource accepts a bare variable name, a dot-path, or a {{var}} template, and must resolve to a number[] or an { embedding: number[] } object (the output of a prior generate-embedding step). The vector length must match the target index dimension.",
|
|
42059
42144
|
category: "vector",
|
|
42060
42145
|
isPrompt: false,
|
|
42061
|
-
configHints: "vectorsSource, destination, outputVariable"
|
|
42146
|
+
configHints: "vectorsSource, destination, idTemplate, outputVariable"
|
|
42062
42147
|
},
|
|
42063
42148
|
crawl: {
|
|
42064
42149
|
description: "Crawl a website and extract content from pages.",
|
|
@@ -63918,185 +64003,191 @@ function buildJUnitXml(suites) {
|
|
|
63918
64003
|
|
|
63919
64004
|
// src/commands/eval.ts
|
|
63920
64005
|
var evalCommand = new Command20("eval").description("Manage evaluations");
|
|
63921
|
-
evalCommand.command("submit").description("Submit an eval batch").requiredOption("-f, --flow <id>", "Flow ID to evaluate").requiredOption("-r, --records <file>", "JSON file with record IDs").option("-n, --name <name>", "Eval batch name").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(
|
|
63922
|
-
|
|
63923
|
-
|
|
63924
|
-
|
|
63925
|
-
|
|
63926
|
-
const content = readFileSync16(options.records, "utf-8");
|
|
63927
|
-
const parsed = JSON.parse(content);
|
|
63928
|
-
recordIds = Array.isArray(parsed) ? parsed : parsed.recordIds || parsed.records || [];
|
|
63929
|
-
} catch (error51) {
|
|
63930
|
-
const message = error51 instanceof Error ? error51.message : "Unknown error";
|
|
63931
|
-
console.error(chalk27.red(`Failed to read records file: ${message}`));
|
|
63932
|
-
process.exit(1);
|
|
63933
|
-
return;
|
|
63934
|
-
}
|
|
63935
|
-
const client = createCliClient(apiKey);
|
|
63936
|
-
if (!isTTY(options) || options.json) {
|
|
64006
|
+
evalCommand.command("submit").description("Submit an eval batch").requiredOption("-f, --flow <id>", "Flow ID to evaluate").requiredOption("-r, --records <file>", "JSON file with record IDs").option("-n, --name <name>", "Eval batch name").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(
|
|
64007
|
+
async (options) => {
|
|
64008
|
+
const apiKey = await ensureAuth();
|
|
64009
|
+
if (!apiKey) return;
|
|
64010
|
+
let recordIds;
|
|
63937
64011
|
try {
|
|
63938
|
-
const
|
|
63939
|
-
|
|
63940
|
-
|
|
63941
|
-
name: options.name
|
|
63942
|
-
});
|
|
63943
|
-
if (options.json) {
|
|
63944
|
-
printJson(data);
|
|
63945
|
-
} else {
|
|
63946
|
-
console.log(chalk27.green("Eval submitted"));
|
|
63947
|
-
console.log(` Batch ID: ${chalk27.green(data.id)}`);
|
|
63948
|
-
if (data.name) console.log(` Name: ${data.name}`);
|
|
63949
|
-
console.log(` Status: ${data.status}`);
|
|
63950
|
-
console.log(` Records: ${data.totalRecords}`);
|
|
63951
|
-
if (data.groupId) console.log(` Group: ${data.groupId}`);
|
|
63952
|
-
}
|
|
64012
|
+
const content = readFileSync16(options.records, "utf-8");
|
|
64013
|
+
const parsed = JSON.parse(content);
|
|
64014
|
+
recordIds = Array.isArray(parsed) ? parsed : parsed.recordIds || parsed.records || [];
|
|
63953
64015
|
} catch (error51) {
|
|
63954
64016
|
const message = error51 instanceof Error ? error51.message : "Unknown error";
|
|
63955
|
-
console.error(chalk27.red("Failed to submit eval"));
|
|
63956
|
-
console.error(chalk27.red(message));
|
|
64017
|
+
console.error(chalk27.red(`Failed to read records file: ${message}`));
|
|
63957
64018
|
process.exit(1);
|
|
64019
|
+
return;
|
|
63958
64020
|
}
|
|
63959
|
-
|
|
63960
|
-
|
|
63961
|
-
|
|
63962
|
-
|
|
63963
|
-
|
|
63964
|
-
|
|
63965
|
-
|
|
63966
|
-
|
|
63967
|
-
|
|
63968
|
-
|
|
63969
|
-
|
|
63970
|
-
|
|
63971
|
-
|
|
63972
|
-
|
|
63973
|
-
});
|
|
63974
|
-
|
|
63975
|
-
|
|
63976
|
-
];
|
|
63977
|
-
if (data.name) fields.push({ label: "Name", value: data.name });
|
|
63978
|
-
fields.push({ label: "Status", value: data.status });
|
|
63979
|
-
fields.push({ label: "Records", value: data.totalRecords });
|
|
63980
|
-
if (data.groupId) fields.push({ label: "Group", value: data.groupId });
|
|
63981
|
-
setResultNode(React19.createElement(EntityCard, { fields }));
|
|
63982
|
-
setSuccess(true);
|
|
63983
|
-
setLoading(false);
|
|
63984
|
-
} catch (err) {
|
|
63985
|
-
setError(err instanceof Error ? err : new Error(String(err)));
|
|
63986
|
-
setSuccess(false);
|
|
63987
|
-
setLoading(false);
|
|
63988
|
-
}
|
|
63989
|
-
};
|
|
63990
|
-
run2();
|
|
63991
|
-
}, []);
|
|
63992
|
-
return React19.createElement(MutationResult, {
|
|
63993
|
-
loading,
|
|
63994
|
-
loadingLabel: `Submitting eval with ${recordIds.length} records...`,
|
|
63995
|
-
success: success2,
|
|
63996
|
-
successMessage: "Eval submitted",
|
|
63997
|
-
error: error51,
|
|
63998
|
-
result: resultNode
|
|
63999
|
-
});
|
|
64000
|
-
};
|
|
64001
|
-
const { waitUntilExit } = render19(React19.createElement(App));
|
|
64002
|
-
await waitUntilExit();
|
|
64003
|
-
});
|
|
64004
|
-
evalCommand.command("list").description("List eval batches").option("--flow <id>", "Filter by flow ID").option("--limit <n>", "Limit results", "20").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(async (options) => {
|
|
64005
|
-
const apiKey = await ensureAuth();
|
|
64006
|
-
if (!apiKey) return;
|
|
64007
|
-
const client = createCliClient(apiKey);
|
|
64008
|
-
const params = { limit: options.limit };
|
|
64009
|
-
if (options.flow) params.flowId = options.flow;
|
|
64010
|
-
if (!isTTY(options) || options.json) {
|
|
64011
|
-
try {
|
|
64012
|
-
const data = await client.get("/eval/batches", params);
|
|
64013
|
-
if (options.json) {
|
|
64014
|
-
printJson(data);
|
|
64015
|
-
} else {
|
|
64016
|
-
const batches = data.data ?? [];
|
|
64017
|
-
if (batches.length === 0) {
|
|
64018
|
-
console.log(chalk27.gray("No eval batches found"));
|
|
64019
|
-
return;
|
|
64020
|
-
}
|
|
64021
|
-
console.log(chalk27.cyan("Eval Batches:"));
|
|
64022
|
-
for (const batch of batches) {
|
|
64023
|
-
const name = batch.name || batch.id;
|
|
64024
|
-
const progress = batch.totalRecords ? `${batch.completedRecords ?? 0}/${batch.totalRecords}` : "";
|
|
64025
|
-
const statusColor = batch.status === "completed" ? "green" : "yellow";
|
|
64026
|
-
console.log(
|
|
64027
|
-
` ${chalk27.green(batch.id)} ${name} ${chalk27[statusColor](`[${batch.status}]`)} ${chalk27.gray(progress)}`
|
|
64028
|
-
);
|
|
64029
|
-
}
|
|
64030
|
-
const total = getTotalCount(data.pagination);
|
|
64031
|
-
if (total !== void 0) {
|
|
64032
|
-
console.log(chalk27.dim(`
|
|
64033
|
-
Total: ${total} batches`));
|
|
64021
|
+
const client = createCliClient(apiKey);
|
|
64022
|
+
if (!isTTY(options) || options.json) {
|
|
64023
|
+
try {
|
|
64024
|
+
const data = await client.post("/eval/submit", {
|
|
64025
|
+
flowId: options.flow,
|
|
64026
|
+
recordIds,
|
|
64027
|
+
name: options.name
|
|
64028
|
+
});
|
|
64029
|
+
if (options.json) {
|
|
64030
|
+
printJson(data);
|
|
64031
|
+
} else {
|
|
64032
|
+
console.log(chalk27.green("Eval submitted"));
|
|
64033
|
+
console.log(` Batch ID: ${chalk27.green(data.id)}`);
|
|
64034
|
+
if (data.name) console.log(` Name: ${data.name}`);
|
|
64035
|
+
console.log(` Status: ${data.status}`);
|
|
64036
|
+
console.log(` Records: ${data.totalRecords}`);
|
|
64037
|
+
if (data.groupId) console.log(` Group: ${data.groupId}`);
|
|
64034
64038
|
}
|
|
64039
|
+
} catch (error51) {
|
|
64040
|
+
const message = error51 instanceof Error ? error51.message : "Unknown error";
|
|
64041
|
+
console.error(chalk27.red("Failed to submit eval"));
|
|
64042
|
+
console.error(chalk27.red(message));
|
|
64043
|
+
process.exit(1);
|
|
64035
64044
|
}
|
|
64036
|
-
|
|
64037
|
-
const message = error51 instanceof Error ? error51.message : "Unknown error";
|
|
64038
|
-
console.error(chalk27.red("Failed to fetch eval batches"));
|
|
64039
|
-
console.error(chalk27.red(message));
|
|
64040
|
-
process.exit(1);
|
|
64045
|
+
return;
|
|
64041
64046
|
}
|
|
64042
|
-
|
|
64047
|
+
const App = () => {
|
|
64048
|
+
const [loading, setLoading] = useState36(true);
|
|
64049
|
+
const [success2, setSuccess] = useState36(null);
|
|
64050
|
+
const [error51, setError] = useState36(null);
|
|
64051
|
+
const [resultNode, setResultNode] = useState36(void 0);
|
|
64052
|
+
useEffect30(() => {
|
|
64053
|
+
const run2 = async () => {
|
|
64054
|
+
try {
|
|
64055
|
+
const data = await client.post("/eval/submit", {
|
|
64056
|
+
flowId: options.flow,
|
|
64057
|
+
recordIds,
|
|
64058
|
+
name: options.name
|
|
64059
|
+
});
|
|
64060
|
+
const fields = [{ label: "Batch ID", value: data.id, color: "green" }];
|
|
64061
|
+
if (data.name) fields.push({ label: "Name", value: data.name });
|
|
64062
|
+
fields.push({ label: "Status", value: data.status });
|
|
64063
|
+
fields.push({ label: "Records", value: data.totalRecords });
|
|
64064
|
+
if (data.groupId) fields.push({ label: "Group", value: data.groupId });
|
|
64065
|
+
setResultNode(React19.createElement(EntityCard, { fields }));
|
|
64066
|
+
setSuccess(true);
|
|
64067
|
+
setLoading(false);
|
|
64068
|
+
} catch (err) {
|
|
64069
|
+
setError(err instanceof Error ? err : new Error(String(err)));
|
|
64070
|
+
setSuccess(false);
|
|
64071
|
+
setLoading(false);
|
|
64072
|
+
}
|
|
64073
|
+
};
|
|
64074
|
+
run2();
|
|
64075
|
+
}, []);
|
|
64076
|
+
return React19.createElement(MutationResult, {
|
|
64077
|
+
loading,
|
|
64078
|
+
loadingLabel: `Submitting eval with ${recordIds.length} records...`,
|
|
64079
|
+
success: success2,
|
|
64080
|
+
successMessage: "Eval submitted",
|
|
64081
|
+
error: error51,
|
|
64082
|
+
result: resultNode
|
|
64083
|
+
});
|
|
64084
|
+
};
|
|
64085
|
+
const { waitUntilExit } = render19(React19.createElement(App));
|
|
64086
|
+
await waitUntilExit();
|
|
64043
64087
|
}
|
|
64044
|
-
|
|
64045
|
-
|
|
64046
|
-
|
|
64047
|
-
const
|
|
64048
|
-
|
|
64049
|
-
|
|
64050
|
-
|
|
64051
|
-
|
|
64052
|
-
|
|
64053
|
-
|
|
64054
|
-
|
|
64055
|
-
|
|
64056
|
-
|
|
64057
|
-
|
|
64058
|
-
|
|
64088
|
+
);
|
|
64089
|
+
evalCommand.command("list").description("List eval batches").option("--flow <id>", "Filter by flow ID").option("--limit <n>", "Limit results", "20").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(
|
|
64090
|
+
async (options) => {
|
|
64091
|
+
const apiKey = await ensureAuth();
|
|
64092
|
+
if (!apiKey) return;
|
|
64093
|
+
const client = createCliClient(apiKey);
|
|
64094
|
+
const params = { limit: options.limit };
|
|
64095
|
+
if (options.flow) params.flowId = options.flow;
|
|
64096
|
+
if (!isTTY(options) || options.json) {
|
|
64097
|
+
try {
|
|
64098
|
+
const data = await client.get("/eval/batches", params);
|
|
64099
|
+
if (options.json) {
|
|
64100
|
+
printJson(data);
|
|
64101
|
+
} else {
|
|
64102
|
+
const batches = data.data ?? [];
|
|
64103
|
+
if (batches.length === 0) {
|
|
64104
|
+
console.log(chalk27.gray("No eval batches found"));
|
|
64105
|
+
return;
|
|
64106
|
+
}
|
|
64107
|
+
console.log(chalk27.cyan("Eval Batches:"));
|
|
64108
|
+
for (const batch of batches) {
|
|
64109
|
+
const name = batch.name || batch.id;
|
|
64110
|
+
const progress = batch.totalRecords ? `${batch.completedRecords ?? 0}/${batch.totalRecords}` : "";
|
|
64111
|
+
const statusColor = batch.status === "completed" ? "green" : "yellow";
|
|
64112
|
+
console.log(
|
|
64113
|
+
` ${chalk27.green(batch.id)} ${name} ${chalk27[statusColor](`[${batch.status}]`)} ${chalk27.gray(progress)}`
|
|
64114
|
+
);
|
|
64115
|
+
}
|
|
64116
|
+
const total = getTotalCount(data.pagination);
|
|
64117
|
+
if (total !== void 0) {
|
|
64118
|
+
console.log(chalk27.dim(`
|
|
64119
|
+
Total: ${total} batches`));
|
|
64120
|
+
}
|
|
64059
64121
|
}
|
|
64060
|
-
}
|
|
64061
|
-
|
|
64062
|
-
|
|
64063
|
-
|
|
64064
|
-
|
|
64065
|
-
items,
|
|
64066
|
-
error: error51,
|
|
64067
|
-
loading,
|
|
64068
|
-
total,
|
|
64069
|
-
emptyMessage: "No eval batches found",
|
|
64070
|
-
renderCard: (item) => {
|
|
64071
|
-
const b = item;
|
|
64072
|
-
const name = b.name || b.id;
|
|
64073
|
-
const progress = b.totalRecords ? `${b.completedRecords ?? 0}/${b.totalRecords}` : "";
|
|
64074
|
-
const statusColor = b.status === "completed" ? "green" : "yellow";
|
|
64075
|
-
return React19.createElement(
|
|
64076
|
-
Text34,
|
|
64077
|
-
{ color: statusColor },
|
|
64078
|
-
` ${b.id} ${name} [${b.status}] ${progress}`
|
|
64079
|
-
);
|
|
64122
|
+
} catch (error51) {
|
|
64123
|
+
const message = error51 instanceof Error ? error51.message : "Unknown error";
|
|
64124
|
+
console.error(chalk27.red("Failed to fetch eval batches"));
|
|
64125
|
+
console.error(chalk27.red(message));
|
|
64126
|
+
process.exit(1);
|
|
64080
64127
|
}
|
|
64081
|
-
|
|
64082
|
-
|
|
64083
|
-
|
|
64084
|
-
|
|
64085
|
-
|
|
64128
|
+
return;
|
|
64129
|
+
}
|
|
64130
|
+
const App = () => {
|
|
64131
|
+
const [loading, setLoading] = useState36(true);
|
|
64132
|
+
const [items, setItems] = useState36(null);
|
|
64133
|
+
const [total, setTotal] = useState36(void 0);
|
|
64134
|
+
const [error51, setError] = useState36(null);
|
|
64135
|
+
useEffect30(() => {
|
|
64136
|
+
const run2 = async () => {
|
|
64137
|
+
try {
|
|
64138
|
+
const data = await client.get("/eval/batches", params);
|
|
64139
|
+
setItems(data.data ?? []);
|
|
64140
|
+
setTotal(getTotalCount(data.pagination));
|
|
64141
|
+
setLoading(false);
|
|
64142
|
+
} catch (err) {
|
|
64143
|
+
setError(err instanceof Error ? err : new Error(String(err)));
|
|
64144
|
+
setLoading(false);
|
|
64145
|
+
}
|
|
64146
|
+
};
|
|
64147
|
+
run2();
|
|
64148
|
+
}, []);
|
|
64149
|
+
return React19.createElement(DataList, {
|
|
64150
|
+
title: "Eval Batches",
|
|
64151
|
+
items,
|
|
64152
|
+
error: error51,
|
|
64153
|
+
loading,
|
|
64154
|
+
total,
|
|
64155
|
+
emptyMessage: "No eval batches found",
|
|
64156
|
+
renderCard: (item) => {
|
|
64157
|
+
const b = item;
|
|
64158
|
+
const name = b.name || b.id;
|
|
64159
|
+
const progress = b.totalRecords ? `${b.completedRecords ?? 0}/${b.totalRecords}` : "";
|
|
64160
|
+
const statusColor = b.status === "completed" ? "green" : "yellow";
|
|
64161
|
+
return React19.createElement(
|
|
64162
|
+
Text34,
|
|
64163
|
+
{ color: statusColor },
|
|
64164
|
+
` ${b.id} ${name} [${b.status}] ${progress}`
|
|
64165
|
+
);
|
|
64166
|
+
}
|
|
64167
|
+
});
|
|
64168
|
+
};
|
|
64169
|
+
const { waitUntilExit } = render19(React19.createElement(App));
|
|
64170
|
+
await waitUntilExit();
|
|
64171
|
+
}
|
|
64172
|
+
);
|
|
64086
64173
|
evalCommand.command("results <id>").description("Get eval batch results").option("--json", "Output as JSON").option("--tty", "Force TTY mode").option("--no-tty", "Force non-TTY mode").action(async (id, options) => {
|
|
64087
64174
|
const apiKey = await ensureAuth();
|
|
64088
64175
|
if (!apiKey) return;
|
|
64089
64176
|
const client = createCliClient(apiKey);
|
|
64090
64177
|
if (!isTTY(options) || options.json) {
|
|
64091
64178
|
try {
|
|
64092
|
-
const data = await client.get(`/eval/${id}/results`);
|
|
64179
|
+
const data = await client.get(
|
|
64180
|
+
`/eval/${id}/results`
|
|
64181
|
+
);
|
|
64093
64182
|
if (options.json) {
|
|
64094
64183
|
printJson(data);
|
|
64095
64184
|
} else {
|
|
64096
64185
|
if (data.batch) {
|
|
64097
64186
|
console.log(chalk27.cyan(`Eval: ${data.batch.name || data.batch.id}`));
|
|
64098
64187
|
console.log(` Status: ${data.batch.status}`);
|
|
64099
|
-
console.log(` Progress: ${data.batch.completedRecords ?? 0}/${data.batch.totalRecords ?? 0}`);
|
|
64188
|
+
console.log(
|
|
64189
|
+
` Progress: ${data.batch.completedRecords ?? 0}/${data.batch.totalRecords ?? 0}`
|
|
64190
|
+
);
|
|
64100
64191
|
console.log();
|
|
64101
64192
|
}
|
|
64102
64193
|
const results = data.data ?? [];
|
|
@@ -64129,19 +64220,28 @@ evalCommand.command("results <id>").description("Get eval batch results").option
|
|
|
64129
64220
|
useEffect30(() => {
|
|
64130
64221
|
const run2 = async () => {
|
|
64131
64222
|
try {
|
|
64132
|
-
const data = await client.get(`/eval/${id}/results`);
|
|
64223
|
+
const data = await client.get(
|
|
64224
|
+
`/eval/${id}/results`
|
|
64225
|
+
);
|
|
64133
64226
|
const results = data.data ?? [];
|
|
64134
64227
|
const fields = [];
|
|
64135
64228
|
if (data.batch) {
|
|
64136
64229
|
fields.push({ label: "Eval", value: data.batch.name || data.batch.id });
|
|
64137
64230
|
fields.push({ label: "Status", value: data.batch.status });
|
|
64138
|
-
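fields.push({ label: "Progress", value: `${data.batch.completedRecords ?? 0}/${data.batch.totalRecords ?? 0}` });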
|
|
64231
|
+
fields.push({
|
|
64232
|
+
label: "Progress",
|
|
64233
|
+
value: `${data.batch.completedRecords ?? 0}/${data.batch.totalRecords ?? 0}`
|
|
64234
|
+
});
|
|
64139
64235
|
}
|
|
64140
64236
|
fields.push({ label: "Results", value: results.length });
|
|
64141
64237
|
if (results.length > 0) {
|
|
64142
64238
|
const completed = results.filter((r) => r.status === "completed").length;
|
|
64143
64239
|
const avgScore = results.filter((r) => r.score !== void 0).reduce((sum, r) => sum + (r.score ?? 0), 0) / (results.filter((r) => r.score !== void 0).length || 1);
|
|
64144
|
-
fields.push({ label: "Completed", value: `${completed}/${results.length}`, color: "green" });
|
|
64240
|
+
fields.push({
|
|
64241
|
+
label: "Completed",
|
|
64242
|
+
value: `${completed}/${results.length}`,
|
|
64243
|
+
color: "green"
|
|
64244
|
+
});
|
|
64145
64245
|
if (results.some((r) => r.score !== void 0)) {
|
|
64146
64246
|
fields.push({ label: "Avg Score", value: avgScore.toFixed(2) });
|
|
64147
64247
|
}
|
|
@@ -64237,6 +64337,13 @@ function printSuiteResult(rootDir, outcome) {
|
|
|
64237
64337
|
if (testCase.errored) {
|
|
64238
64338
|
console.log(chalk27.red(` \u2717 errored: ${testCase.outputExcerpt.slice(0, 200)}`));
|
|
64239
64339
|
}
|
|
64340
|
+
} else {
|
|
64341
|
+
for (const outcomeItem of testCase.outcomes.filter(
|
|
64342
|
+
(o) => !o.passed && o.severity === "soft"
|
|
64343
|
+
)) {
|
|
64344
|
+
const reason = outcomeItem.reasoning ? `: ${outcomeItem.reasoning}` : "";
|
|
64345
|
+
console.log(chalk27.yellow(` \u26A0 soft ${outcomeItem.kind}${reason}`));
|
|
64346
|
+
}
|
|
64240
64347
|
}
|
|
64241
64348
|
}
|
|
64242
64349
|
}
|
|
@@ -64251,7 +64358,12 @@ function toJUnitSuite(outcome) {
|
|
|
64251
64358
|
}))
|
|
64252
64359
|
};
|
|
64253
64360
|
}
|
|
64254
|
-
evalCommand.command("run [idOrDirPrefix]").description(
|
|
64361
|
+
evalCommand.command("run [idOrDirPrefix]").description(
|
|
64362
|
+
"Run code-colocated eval suites (**/*.eval.ts) as a CI gate (exit 0 pass / 1 fail / 2 config)"
|
|
64363
|
+
).option(
|
|
64364
|
+
"--strict",
|
|
64365
|
+
"Fail the exit code on soft grader misses too (default: soft misses are reported but do not fail)"
|
|
64366
|
+
).option("--virtual", "Run inline without persisting a suite/batch to the dashboard").option("--junit <path>", "Write JUnit XML results to <path>").option("--url <api>", "Override the API base URL (e.g. staging)").option("--cwd <dir>", "Directory to discover *.eval.ts under (default: current directory)").action(
|
|
64255
64367
|
async (idOrDirPrefix, options) => {
|
|
64256
64368
|
const apiKey = await ensureAuth();
|
|
64257
64369
|
if (!apiKey) {
|
|
@@ -64289,11 +64401,6 @@ evalCommand.command("run [idOrDirPrefix]").description("Run code-colocated eval
|
|
|
64289
64401
|
} else {
|
|
64290
64402
|
loaded = await loadAll(allFiles);
|
|
64291
64403
|
}
|
|
64292
|
-
if (options.strict) {
|
|
64293
|
-
console.log(
|
|
64294
|
-
chalk27.gray("Note: --strict has no effect yet (grader severity lands in a later increment).")
|
|
64295
|
-
);
|
|
64296
|
-
}
|
|
64297
64404
|
const client = createCliClient(apiKey, options.url);
|
|
64298
64405
|
const outcomes = [];
|
|
64299
64406
|
for (const { file: file2, def } of loaded) {
|
|
@@ -64301,16 +64408,25 @@ evalCommand.command("run [idOrDirPrefix]").description("Run code-colocated eval
|
|
|
64301
64408
|
try {
|
|
64302
64409
|
let result;
|
|
64303
64410
|
if (runVirtual) {
|
|
64304
|
-
result = await client.post("/eval/run", {
|
|
64305
|
-
|
|
64306
|
-
|
|
64307
|
-
name: def.name,
|
|
64308
|
-
definition: def
|
|
64411
|
+
result = await client.post("/eval/run", {
|
|
64412
|
+
definition: def,
|
|
64413
|
+
strict: options.strict ?? false
|
|
64309
64414
|
});
|
|
64415
|
+
} else {
|
|
64416
|
+
const ensured = await client.post(
|
|
64417
|
+
"/eval/ensure",
|
|
64418
|
+
{
|
|
64419
|
+
name: def.name,
|
|
64420
|
+
definition: def
|
|
64421
|
+
}
|
|
64422
|
+
);
|
|
64310
64423
|
if (!ensured.suiteId) {
|
|
64311
64424
|
throw new Error(`ensure did not return a suiteId (result: ${ensured.result})`);
|
|
64312
64425
|
}
|
|
64313
|
-
result = await client.post("/eval/run", {
|
|
64426
|
+
result = await client.post("/eval/run", {
|
|
64427
|
+
suiteId: ensured.suiteId,
|
|
64428
|
+
strict: options.strict ?? false
|
|
64429
|
+
});
|
|
64314
64430
|
}
|
|
64315
64431
|
outcomes.push({ file: file2, definition: def, result });
|
|
64316
64432
|
printSuiteResult(rootDir, { file: file2, definition: def, result });
|
|
@@ -64328,7 +64444,9 @@ evalCommand.command("run [idOrDirPrefix]").description("Run code-colocated eval
|
|
|
64328
64444
|
writeFileSync6(outPath, xml, "utf-8");
|
|
64329
64445
|
console.log(chalk27.gray(`JUnit results written to ${options.junit}`));
|
|
64330
64446
|
} catch (error51) {
|
|
64331
|
-
failConfig(`Failed to write JUnit report: ${error51 instanceof Error ? error51.message : String(error51)}`);
|
|
64447
|
+
failConfig(
|
|
64448
|
+
`Failed to write JUnit report: ${error51 instanceof Error ? error51.message : String(error51)}`
|
|
64449
|
+
);
|
|
64332
64450
|
}
|
|
64333
64451
|
}
|
|
64334
64452
|
const failedSuites = outcomes.filter((o) => !o.result.passed);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@runtypelabs/cli",
|
|
3
|
-
"version": "2.23.0",
|
|
3
|
+
"version": "2.24.0",
|
|
4
4
|
"description": "Command-line interface for Runtype AI platform",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -24,7 +24,7 @@
|
|
|
24
24
|
"rosie-skills": "0.8.1",
|
|
25
25
|
"yaml": "^2.9.0",
|
|
26
26
|
"@runtypelabs/ink-components": "0.3.4",
|
|
27
|
-
"@runtypelabs/sdk": "5.
|
|
27
|
+
"@runtypelabs/sdk": "5.6.0",
|
|
28
28
|
"@runtypelabs/terminal-animations": "0.2.1"
|
|
29
29
|
},
|
|
30
30
|
"devDependencies": {
|
|
@@ -39,7 +39,7 @@
|
|
|
39
39
|
"tsx": "^4.7.1",
|
|
40
40
|
"typescript": "^6.0.3",
|
|
41
41
|
"vitest": "^4.1.0",
|
|
42
|
-
"@runtypelabs/shared": "1.42.
|
|
42
|
+
"@runtypelabs/shared": "1.42.6"
|
|
43
43
|
},
|
|
44
44
|
"engines": {
|
|
45
45
|
"node": ">=22.0.0"
|