@kradle/cli 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -1
- package/dist/commands/challenge/watch.js +2 -1
- package/dist/commands/evaluation/init.d.ts +9 -0
- package/dist/commands/evaluation/init.js +58 -0
- package/dist/commands/evaluation/list.d.ts +7 -0
- package/dist/commands/evaluation/list.js +55 -0
- package/dist/commands/evaluation/run.d.ts +13 -0
- package/dist/commands/evaluation/run.js +60 -0
- package/dist/lib/api-client.d.ts +14 -1
- package/dist/lib/api-client.js +31 -5
- package/dist/lib/challenge.js +5 -0
- package/dist/lib/config.d.ts +0 -1
- package/dist/lib/config.js +0 -2
- package/dist/lib/evaluation/evaluator.d.ts +88 -0
- package/dist/lib/evaluation/evaluator.js +275 -0
- package/dist/lib/evaluation/index.d.ts +4 -0
- package/dist/lib/evaluation/index.js +4 -0
- package/dist/lib/evaluation/runner.d.ts +80 -0
- package/dist/lib/evaluation/runner.js +280 -0
- package/dist/lib/evaluation/tui.d.ts +20 -0
- package/dist/lib/evaluation/tui.js +129 -0
- package/dist/lib/evaluation/types.d.ts +127 -0
- package/dist/lib/evaluation/types.js +86 -0
- package/dist/lib/schemas.d.ts +14 -0
- package/dist/lib/schemas.js +10 -0
- package/oclif.manifest.json +104 -1
- package/package.json +8 -1
- package/static/evaluation_template.ts +69 -0
- package/static/project_template/dev.env +0 -1
- package/static/project_template/prod.env +0 -1
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import { jsx as _jsx, Fragment as _Fragment, jsxs as _jsxs } from "react/jsx-runtime";
|
|
2
|
+
import { Box, render, Text, useInput } from "ink";
|
|
3
|
+
import { useEffect, useMemo, useState } from "react";
|
|
4
|
+
import { STATUS_ICONS } from "./types.js";
|
|
5
|
+
/**
 * Formats the time elapsed between `startTime` and now as a compact label.
 *
 * Output shapes: "42s", "3m 7s", or "2h 15m" (seconds are dropped once the
 * duration reaches an hour).
 *
 * @param {number} startTime Millisecond timestamp on the same clock as `Date.now()`.
 * @returns {string} Human-readable elapsed time.
 */
const formatElapsed = (startTime) => {
    // Clamp at zero: a start time that lies slightly in the future would
    // otherwise be rendered as a negative duration such as "-1s".
    const totalSeconds = Math.max(0, Math.floor((Date.now() - startTime) / 1000));
    const totalMinutes = Math.floor(totalSeconds / 60);
    const hours = Math.floor(totalMinutes / 60);
    if (hours > 0) {
        return `${hours}h ${totalMinutes % 60}m`;
    }
    if (totalMinutes > 0) {
        return `${totalMinutes}m ${totalSeconds % 60}s`;
    }
    return `${totalSeconds}s`;
};
|
|
16
|
+
/**
 * Number of run rows that fit on screen.
 *
 * Reserves 3 header lines, 3 footer lines and 1 line of offset; falls back to
 * a 24-line terminal when stdout does not report a height. Never returns less
 * than 1.
 *
 * @returns {number} Count of list rows available for runs.
 */
const getVisibleRows = () => {
    const headerLines = 3;
    const footerLines = 3;
    const offsetLines = 1;
    const height = process.stdout.rows || 24;
    const available = height - (headerLines + footerLines) - offsetLines;
    return available < 1 ? 1 : available;
};
|
|
20
|
+
/**
 * Usable line width: the terminal width minus one column, with an 80-column
 * fallback when stdout does not report a width. Never returns less than 1.
 *
 * @returns {number} Count of columns available for a rendered line.
 */
const getVisibleColumns = () => {
    const width = process.stdout.columns || 80;
    const usable = width - 1;
    return usable < 1 ? 1 : usable;
};
|
|
24
|
+
/**
 * Renders one run as a single line:
 * `<icon> <index>/<total> <status> [<elapsed>] <challenge (agents)>`.
 *
 * The summary is truncated with an ellipsis so the line fits the visible
 * terminal width; when no room is left for it at all, it is omitted.
 *
 * @param {object} props
 * @param {object} props.state Run state (`index`, `status`, `startTime`, `config`).
 * @param {number} props.total Total number of runs, shown after the index.
 * @param {boolean} props.isSelected Whether the line is drawn inverted.
 * @param {number} props.padding Width the 1-based index is left-padded to.
 */
const RenderRunLine = ({ state, total, isSelected, padding, }) => {
    const { icon, color } = STATUS_ICONS[state.status] ?? STATUS_ICONS.queued;
    const indexLabel = `${String(state.index + 1).padStart(padding, " ")}/${total}`;
    const statusLabel = state.status.padEnd(12);
    const startTime = state.startTime ?? null;
    // Elapsed time is only shown while a started run has not reached one of these statuses.
    const settledStatuses = ["completed", "finished", "game_over", "error"];
    const elapsedLabel = startTime !== null && !settledStatuses.includes(state.status) ? formatElapsed(startTime) : null;
    const agents = state.config.participants.map((p) => p.agent.split(":").pop() ?? p.agent).join(", ");
    const summary = `${state.config.challenge_slug} (${agents})`;
    // Fixed-width parts: the icon plus three separating spaces (4 columns),
    // and one extra separating space in front of the elapsed label when shown.
    const elapsedWidth = elapsedLabel ? elapsedLabel.length + 1 : 0;
    const maxSummaryLength = getVisibleColumns() - indexLabel.length - statusLabel.length - elapsedWidth - 4;
    let summaryText = summary;
    if (summary.length > maxSummaryLength) {
        // With no budget left, slice(0, negative) would keep most of the text
        // instead of shortening it, so drop the summary entirely in that case.
        summaryText = maxSummaryLength > 0 ? `${summary.slice(0, maxSummaryLength - 1)}…` : "";
    }
    return (_jsxs(Text, { inverse: isSelected, children: [_jsx(Text, { color: color, children: icon }), " ", indexLabel, " ", _jsx(Text, { color: color, children: statusLabel }), elapsedLabel ? (_jsxs(_Fragment, { children: [" ", _jsx(Text, { dimColor: true, children: elapsedLabel })] })) : null, " ", _jsx(Text, { dimColor: true, children: summaryText })] }));
};
|
|
41
|
+
/**
 * Root Ink component of the evaluation screen: a title and key hints, a
 * scrollable list of runs with one selected row, and a footer with status
 * counters.
 *
 * Keys handled: `q` / Ctrl+C call `onQuit`; up/`k` and down/`j` move the
 * selection; `o` calls `onOpenRun` with the selected list position.
 *
 * @param {object} props
 * @param {string} props.evaluationName Name shown in the title line.
 * @param {Array<object>} props.states Run states to list.
 * @param {object} props.statusCounts `completed`, `active`, `queued`, `errors` counters.
 * @param {Function} props.onQuit Called when the user asks to quit.
 * @param {Function} props.onOpenRun Called with the selected position on `o`.
 */
const EvaluationUI = ({ evaluationName, states, statusCounts, onQuit, onOpenRun }) => {
    const runCount = states.length;
    const [selectedIndex, setSelectedIndex] = useState(0);
    const [scrollOffset, setScrollOffset] = useState(0);
    // The counter value itself is never read; bumping it once per second
    // forces a re-render so elapsed-time labels keep advancing.
    const [, setTick] = useState(0);
    // Keep the selection inside the list whenever the number of runs changes.
    useEffect(() => {
        if (runCount > 0) {
            setSelectedIndex((current) => Math.min(current, runCount - 1));
            return;
        }
        setSelectedIndex(0);
        setScrollOffset(0);
    }, [runCount]);
    // Scroll just enough to keep the selected row inside the visible window.
    useEffect(() => {
        const rows = getVisibleRows();
        const lastOffset = Math.max(0, runCount - rows);
        setScrollOffset((offset) => {
            if (selectedIndex < offset) {
                return selectedIndex;
            }
            const wanted = selectedIndex >= offset + rows ? selectedIndex - rows + 1 : offset;
            return Math.min(wanted, lastOffset);
        });
    }, [selectedIndex, runCount]);
    useEffect(() => {
        const timer = setInterval(() => setTick((count) => count + 1), 1000);
        return () => clearInterval(timer);
    }, []);
    useInput((input, key) => {
        const quitRequested = input === "q" || (key.ctrl && input === "c");
        if (quitRequested) {
            onQuit();
            return;
        }
        if (runCount === 0) {
            return;
        }
        if (key.upArrow || input === "k") {
            setSelectedIndex((current) => Math.max(0, current - 1));
            return;
        }
        if (key.downArrow || input === "j") {
            setSelectedIndex((current) => Math.min(runCount - 1, current + 1));
            return;
        }
        if (input === "o") {
            onOpenRun(selectedIndex);
        }
    });
    const rowsAvailable = getVisibleRows();
    const visibleRuns = useMemo(() => states.slice(scrollOffset, scrollOffset + rowsAvailable), [states, scrollOffset, rowsAvailable]);
    const padding = String(runCount).length;
    const horizontalRule = "─".repeat(Math.min(process.stdout.columns || 80, 80));
    const runLines = visibleRuns.map((state, index) => (_jsx(RenderRunLine, { state: state, total: runCount, isSelected: scrollOffset + index === selectedIndex, padding: padding }, state.index)));
    // Blank filler rows keep the footer at a fixed position when the list is short.
    const fillerLines = visibleRuns.length < rowsAvailable
        ? Array.from({ length: rowsAvailable - visibleRuns.length }).map((_, index) => (_jsx(Text, { children: " " }, `empty-${index}`)))
        : null;
    let scrollIndicator;
    if (runCount > rowsAvailable) {
        const rangeStart = runCount === 0 ? 0 : scrollOffset + 1;
        const rangeEnd = Math.min(scrollOffset + rowsAvailable, runCount);
        scrollIndicator = _jsx(Text, { dimColor: true, children: `[${rangeStart}-${rangeEnd} of ${runCount}]` });
    }
    else {
        scrollIndicator = _jsx(Text, { children: " " });
    }
    // The error counter is only shown once at least one run has failed.
    const errorSummary = statusCounts.errors > 0
        ? (_jsxs(_Fragment, { children: [_jsx(Text, { children: ` | Errors: ` }), _jsx(Text, { color: "red", children: statusCounts.errors })] }))
        : null;
    const footer = _jsxs(Text, {
        children: [
            _jsx(Text, { children: "Completed: " }),
            _jsx(Text, { color: "green", children: statusCounts.completed }),
            _jsx(Text, { children: ` | Active: ` }),
            _jsx(Text, { color: "yellow", children: statusCounts.active }),
            _jsx(Text, { children: ` | Queued: ` }),
            _jsx(Text, { dimColor: true, children: statusCounts.queued }),
            errorSummary,
        ],
    });
    return (_jsxs(Box, {
        flexDirection: "column",
        children: [
            _jsx(Text, { bold: true, children: `Evaluation: ${evaluationName}` }),
            _jsx(Text, { dimColor: true, children: horizontalRule }),
            _jsx(Text, { dimColor: true, children: "q:quit \u2191/\u2193/j/k:select o:open in browser" }),
            _jsx(Text, { children: " " }),
            _jsxs(Box, { flexDirection: "column", children: [runLines, fillerLines] }),
            scrollIndicator,
            _jsx(Text, { dimColor: true, children: horizontalRule }),
            footer,
        ],
    }));
};
|
|
95
|
+
/**
 * Imperative wrapper around the Ink-rendered evaluation screen.
 *
 * Holds the latest run states and status counters and re-renders the
 * `EvaluationUI` component whenever either is replaced while the UI is running.
 */
export class TUI {
    /** Options given to the constructor (`evaluationName`, `onQuit`, `onOpenRun`). */
    options;
    /** Run states currently displayed. */
    states = [];
    /** Footer counters currently displayed. */
    statusCounts = { completed: 0, active: 0, queued: 0, errors: 0 };
    /** Ink instance returned by `render`; undefined while stopped. */
    app;
    /** True between `start()` and `stop()`. */
    running = false;
    /**
     * @param {object} options `evaluationName` plus the `onQuit` and `onOpenRun` callbacks.
     */
    constructor(options) {
        this.options = options;
    }
    /** Mounts the UI. */
    start() {
        this.running = true;
        // By default Ink intercepts Ctrl+C itself and does not forward it to
        // useInput handlers, so the component's Ctrl+C -> onQuit branch would
        // never run. Disabling exitOnCtrlC routes Ctrl+C through onQuit, the
        // same path as the "q" key.
        this.app = render(this.renderApp(), { exitOnCtrlC: false });
    }
    /** Unmounts the UI and drops the Ink instance. Safe to call when not started. */
    stop() {
        this.running = false;
        this.app?.unmount();
        this.app = undefined;
    }
    /**
     * Replaces the displayed run states and re-renders.
     * @param {Array<object>} states New list of run states.
     */
    updateStates(states) {
        this.states = states;
        this.rerender();
    }
    /**
     * Replaces the footer counters and re-renders.
     * @param {object} counts New `completed` / `active` / `queued` / `errors` counters.
     */
    updateStatusCounts(counts) {
        this.statusCounts = counts;
        this.rerender();
    }
    /** Re-renders with the current data; does nothing unless the UI is mounted. */
    rerender() {
        if (!this.running || !this.app)
            return;
        this.app.rerender(this.renderApp());
    }
    /** Builds the root element from the current options, states and counters. */
    renderApp() {
        return (_jsx(EvaluationUI, { evaluationName: this.options.evaluationName, states: this.states, statusCounts: this.statusCounts, onQuit: this.options.onQuit, onOpenRun: this.options.onOpenRun }));
    }
}
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
/** Schema for one run participant: a required `agent` string and an optional `role` string. */
export declare const ParticipantSchema: z.ZodObject<
    {
        agent: z.ZodString;
        role: z.ZodOptional<z.ZodString>;
    },
    z.core.$strip
>;
/** Static type inferred from `ParticipantSchema`. */
export type Participant = z.infer<typeof ParticipantSchema>;
|
|
7
|
+
/** Schema for a single run: the challenge slug and the list of participants. */
export declare const RunConfigSchema: z.ZodObject<
    {
        challenge_slug: z.ZodString;
        participants: z.ZodArray<typeof ParticipantSchema>;
    },
    z.core.$strip
>;
/** Static type inferred from `RunConfigSchema`. */
export type RunConfig = z.infer<typeof RunConfigSchema>;
|
|
15
|
+
/** Schema for an evaluation manifest: the runs to execute and optional string tags. */
export declare const ManifestSchema: z.ZodObject<
    {
        runs: z.ZodArray<typeof RunConfigSchema>;
        tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
    },
    z.core.$strip
>;
/** Static type inferred from `ManifestSchema`. */
export type Manifest = z.infer<typeof ManifestSchema>;
|
|
26
|
+
/** Every status a run can be reported in. */
export type RunStatus =
    | "queued"
    | "initializing"
    | "watcher_connected"
    | "participants_connected"
    | "started"
    | "running"
    | "recovering"
    | "completed"
    | "game_over"
    | "finished"
    | "error";
|
|
27
|
+
/**
 * Schema for the progress record of one run: its index and status, plus
 * optional run id, start/end timestamps and error message.
 */
export declare const ProgressEntrySchema: z.ZodObject<
    {
        index: z.ZodNumber;
        // One enum member per RunStatus value, each mapped to itself.
        status: z.ZodEnum<{ [S in RunStatus]: S }>;
        runId: z.ZodOptional<z.ZodString>;
        startTime: z.ZodOptional<z.ZodNumber>;
        endTime: z.ZodOptional<z.ZodNumber>;
        error: z.ZodOptional<z.ZodString>;
    },
    z.core.$strip
>;
/** Static type inferred from `ProgressEntrySchema`. */
export type ProgressEntry = z.infer<typeof ProgressEntrySchema>;
|
|
48
|
+
/** Schema for the progress file: one entry per run plus a numeric `lastUpdated` value. */
export declare const ProgressSchema: z.ZodObject<
    {
        entries: z.ZodArray<typeof ProgressEntrySchema>;
        lastUpdated: z.ZodNumber;
    },
    z.core.$strip
>;
/** Static type inferred from `ProgressSchema`. */
export type Progress = z.infer<typeof ProgressSchema>;
|
|
72
|
+
/**
 * Schema for the stored result of a run: identity, participants, status,
 * timing, and optional logs, summary, error and outcome.
 */
export declare const RunResultSchema: z.ZodObject<
    {
        index: z.ZodNumber;
        runId: z.ZodString;
        challenge_slug: z.ZodString;
        participants: z.ZodArray<typeof ParticipantSchema>;
        status: z.ZodString;
        startTime: z.ZodNumber;
        endTime: z.ZodNumber;
        duration: z.ZodNumber;
        logs: z.ZodOptional<z.ZodArray<z.ZodUnknown>>;
        summary: z.ZodOptional<z.ZodString>;
        error: z.ZodOptional<z.ZodString>;
        outcome: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
    },
    z.core.$strip
>;
/** Static type inferred from `RunResultSchema`. */
export type RunResult = z.infer<typeof RunResultSchema>;
|
|
90
|
+
/** In-memory state of one run. */
export interface RunState {
    /** Zero-based position of the run. */
    index: number;
    /** Configuration the run was created from. */
    config: RunConfig;
    /** Current status of the run. */
    status: RunStatus;
    /** Run identifier, when one is known. */
    runId?: string;
    /** Numeric start timestamp, when the run has one. */
    startTime?: number;
    /** Error message, when one was recorded. */
    error?: string;
}
|
|
98
|
+
/** Aggregate counters of runs per status group. */
export interface StatusCounts {
    /** Number of completed runs. */
    completed: number;
    /** Number of runs currently active. */
    active: number;
    /** Number of runs still queued. */
    queued: number;
    /** Number of runs that ended in an error. */
    errors: number;
}
|
|
104
|
+
/** Schema for a run-status API response: id and status, plus optional timestamps and outcome. */
export declare const RunStatusResponseSchema: z.ZodObject<
    {
        id: z.ZodString;
        status: z.ZodString;
        createdAt: z.ZodOptional<z.ZodString>;
        updatedAt: z.ZodOptional<z.ZodString>;
        outcome: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
    },
    z.core.$strip
>;
/** Static type inferred from `RunStatusResponseSchema`. */
export type RunStatusResponse = z.infer<typeof RunStatusResponseSchema>;
|
|
112
|
+
/** Schema for a run-logs API response: an array of log entries of unknown shape. */
export declare const RunLogsResponseSchema: z.ZodObject<
    {
        logs: z.ZodArray<z.ZodUnknown>;
    },
    z.core.$strip
>;
/** Static type inferred from `RunLogsResponseSchema`. */
export type RunLogsResponse = z.infer<typeof RunLogsResponseSchema>;
|
|
116
|
+
/** Schema for evaluation metadata: the numeric `currentIteration`. */
export declare const EvaluationMetadataSchema: z.ZodObject<
    {
        currentIteration: z.ZodNumber;
    },
    z.core.$strip
>;
/** Static type inferred from `EvaluationMetadataSchema`. */
export type EvaluationMetadata = z.infer<typeof EvaluationMetadataSchema>;
|
|
120
|
+
/** Options controlling how an evaluation is run. */
export interface EvaluationOptions {
    /** The `new` option flag. */
    new: boolean;
    /** Maximum number of runs executed concurrently. */
    maxConcurrent: number;
}
|
|
124
|
+
/** Icon and colour used to display each run status. */
export declare const STATUS_ICONS: Record<
    RunStatus,
    {
        icon: string;
        color: "white" | "yellow" | "blue" | "magenta" | "cyan" | "green" | "red";
    }
>;
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
// One participant of a run: a required agent string and an optional role.
export const ParticipantSchema = z.object({
    agent: z.string(),
    role: z.optional(z.string()),
});
|
|
7
|
+
// Configuration of a single run: which challenge to play and who takes part.
export const RunConfigSchema = z.object({
    challenge_slug: z.string(),
    participants: z.array(ParticipantSchema),
});
|
|
12
|
+
// Manifest returned by config.ts main(): the runs to execute and optional tags.
export const ManifestSchema = z.object({
    runs: z.array(RunConfigSchema),
    tags: z.optional(z.array(z.string())),
});
|
|
17
|
+
// Progress entry for a single run: its index and status, plus optional
// run id, start/end timestamps and error message.
export const ProgressEntrySchema = z.object({
    index: z.number(),
    // prettier-ignore
    status: z.enum([
        "queued", "initializing", "watcher_connected", "participants_connected",
        "started", "running", "recovering",
        "completed", "game_over", "finished",
        "error",
    ]),
    runId: z.optional(z.string()),
    startTime: z.optional(z.number()),
    endTime: z.optional(z.number()),
    error: z.optional(z.string()),
});
|
|
38
|
+
// Progress file: one entry per run plus a numeric lastUpdated value.
export const ProgressSchema = z.object({
    entries: z.array(ProgressEntrySchema),
    lastUpdated: z.number(),
});
|
|
43
|
+
// Result of a run: identity, participants, status and timing are required;
// logs, summary, error and outcome are optional.
export const RunResultSchema = z.object({
    index: z.number(),
    runId: z.string(),
    challenge_slug: z.string(),
    participants: z.array(ParticipantSchema),
    status: z.string(),
    startTime: z.number(),
    endTime: z.number(),
    duration: z.number(),
    logs: z.optional(z.array(z.unknown())),
    summary: z.optional(z.string()),
    error: z.optional(z.string()),
    outcome: z.optional(z.record(z.string(), z.unknown())),
});
|
|
58
|
+
// API response for a run's status: id and status are required; timestamps
// and outcome are optional.
export const RunStatusResponseSchema = z.object({
    id: z.string(),
    status: z.string(),
    createdAt: z.optional(z.string()),
    updatedAt: z.optional(z.string()),
    outcome: z.optional(z.record(z.string(), z.unknown())),
});
|
|
66
|
+
// API response for a run's logs: an array of entries of unknown shape.
export const RunLogsResponseSchema = z.object({
    logs: z.array(z.unknown()),
});
|
|
69
|
+
// Evaluation metadata stored in .evaluation.json: the current iteration number.
export const EvaluationMetadataSchema = z.object({
    currentIteration: z.number(),
});
|
|
73
|
+
// Icon and colour shown in the TUI for each run status.
export const STATUS_ICONS = {
    // Not started yet
    queued: { icon: "·", color: "white" },
    // Starting up and connecting
    initializing: { icon: "○", color: "yellow" },
    watcher_connected: { icon: "◐", color: "blue" },
    participants_connected: { icon: "◉", color: "blue" },
    // In progress
    started: { icon: "▶", color: "magenta" },
    running: { icon: "▶", color: "magenta" },
    recovering: { icon: "⟳", color: "cyan" },
    // Ended normally
    completed: { icon: "✓", color: "green" },
    game_over: { icon: "✓", color: "green" },
    finished: { icon: "✓", color: "green" },
    // Ended with an error
    error: { icon: "✗", color: "red" },
};
|
package/dist/lib/schemas.d.ts
CHANGED
|
@@ -20,10 +20,13 @@ export declare const ChallengeSchema: z.ZodObject<{
|
|
|
20
20
|
spectator: "spectator";
|
|
21
21
|
}>;
|
|
22
22
|
}, z.core.$strip>;
|
|
23
|
+
description: z.ZodOptional<z.ZodString>;
|
|
23
24
|
task: z.ZodString;
|
|
24
25
|
roles: z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
25
26
|
description: z.ZodString;
|
|
26
27
|
specificTask: z.ZodString;
|
|
28
|
+
minParticipants: z.ZodOptional<z.ZodNumber>;
|
|
29
|
+
maxParticipants: z.ZodOptional<z.ZodNumber>;
|
|
27
30
|
}, z.core.$strip>>;
|
|
28
31
|
objective: z.ZodObject<{
|
|
29
32
|
fieldName: z.ZodString;
|
|
@@ -58,10 +61,13 @@ export declare const ChallengesResponseSchema: z.ZodObject<{
|
|
|
58
61
|
spectator: "spectator";
|
|
59
62
|
}>;
|
|
60
63
|
}, z.core.$strip>;
|
|
64
|
+
description: z.ZodOptional<z.ZodString>;
|
|
61
65
|
task: z.ZodString;
|
|
62
66
|
roles: z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
63
67
|
description: z.ZodString;
|
|
64
68
|
specificTask: z.ZodString;
|
|
69
|
+
minParticipants: z.ZodOptional<z.ZodNumber>;
|
|
70
|
+
maxParticipants: z.ZodOptional<z.ZodNumber>;
|
|
65
71
|
}, z.core.$strip>>;
|
|
66
72
|
objective: z.ZodObject<{
|
|
67
73
|
fieldName: z.ZodString;
|
|
@@ -82,6 +88,13 @@ export declare const HumanSchema: z.ZodObject<{
|
|
|
82
88
|
export declare const RunResponseSchema: z.ZodObject<{
|
|
83
89
|
runIds: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
84
90
|
}, z.core.$strip>;
|
|
91
|
+
/** Schema for a run's status: id and status, plus optional timestamps and outcome. */
export declare const RunStatusSchema: z.ZodObject<
    {
        id: z.ZodString;
        status: z.ZodString;
        createdAt: z.ZodOptional<z.ZodString>;
        updatedAt: z.ZodOptional<z.ZodString>;
        outcome: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
    },
    z.core.$strip
>;
|
|
85
98
|
export declare const UploadUrlResponseSchema: z.ZodObject<{
|
|
86
99
|
uploadUrl: z.ZodString;
|
|
87
100
|
expiresAt: z.ZodString;
|
|
@@ -123,5 +136,6 @@ export type ChallengeSchemaType = z.infer<typeof ChallengeSchema>;
|
|
|
123
136
|
export type ChallengesResponseType = z.infer<typeof ChallengesResponseSchema>;
|
|
124
137
|
export type HumanSchemaType = z.infer<typeof HumanSchema>;
|
|
125
138
|
export type RunResponseType = z.infer<typeof RunResponseSchema>;
|
|
139
|
+
export type RunStatusSchemaType = z.infer<typeof RunStatusSchema>;
|
|
126
140
|
export type AgentSchemaType = z.infer<typeof AgentSchema>;
|
|
127
141
|
export type AgentsResponseType = z.infer<typeof AgentsResponseSchema>;
|
package/dist/lib/schemas.js
CHANGED
|
@@ -11,10 +11,13 @@ export const ChallengeSchema = z.object({
|
|
|
11
11
|
datapack: z.boolean(),
|
|
12
12
|
gameMode: z.enum(["survival", "creative", "adventure", "spectator"]),
|
|
13
13
|
}),
|
|
14
|
+
description: z.string().optional(),
|
|
14
15
|
task: z.string(),
|
|
15
16
|
roles: z.record(z.string(), z.object({
|
|
16
17
|
description: z.string(),
|
|
17
18
|
specificTask: z.string(),
|
|
19
|
+
minParticipants: z.number().optional(),
|
|
20
|
+
maxParticipants: z.number().optional(),
|
|
18
21
|
})),
|
|
19
22
|
objective: z.object({
|
|
20
23
|
fieldName: z.string(),
|
|
@@ -34,6 +37,13 @@ export const HumanSchema = z.object({
|
|
|
34
37
|
export const RunResponseSchema = z.object({
|
|
35
38
|
runIds: z.array(z.string()).optional(),
|
|
36
39
|
});
|
|
40
|
+
// Status of a run: id and status are required; timestamps and outcome are optional.
export const RunStatusSchema = z.object({
    id: z.string(),
    status: z.string(),
    createdAt: z.optional(z.string()),
    updatedAt: z.optional(z.string()),
    outcome: z.optional(z.record(z.string(), z.unknown())),
});
|
|
37
47
|
export const UploadUrlResponseSchema = z.object({
|
|
38
48
|
uploadUrl: z.string(),
|
|
39
49
|
expiresAt: z.string(),
|
package/oclif.manifest.json
CHANGED
|
@@ -304,7 +304,110 @@
|
|
|
304
304
|
"challenge",
|
|
305
305
|
"watch.js"
|
|
306
306
|
]
|
|
307
|
+
},
|
|
308
|
+
"evaluation:init": {
|
|
309
|
+
"aliases": [],
|
|
310
|
+
"args": {
|
|
311
|
+
"name": {
|
|
312
|
+
"description": "Name of the evaluation",
|
|
313
|
+
"name": "name",
|
|
314
|
+
"required": true
|
|
315
|
+
}
|
|
316
|
+
},
|
|
317
|
+
"description": "Initialize a new evaluation",
|
|
318
|
+
"examples": [
|
|
319
|
+
"<%= config.bin %> <%= command.id %> my-evaluation"
|
|
320
|
+
],
|
|
321
|
+
"flags": {},
|
|
322
|
+
"hasDynamicHelp": false,
|
|
323
|
+
"hiddenAliases": [],
|
|
324
|
+
"id": "evaluation:init",
|
|
325
|
+
"pluginAlias": "@kradle/cli",
|
|
326
|
+
"pluginName": "@kradle/cli",
|
|
327
|
+
"pluginType": "core",
|
|
328
|
+
"strict": true,
|
|
329
|
+
"enableJsonFlag": false,
|
|
330
|
+
"isESM": true,
|
|
331
|
+
"relativePath": [
|
|
332
|
+
"dist",
|
|
333
|
+
"commands",
|
|
334
|
+
"evaluation",
|
|
335
|
+
"init.js"
|
|
336
|
+
]
|
|
337
|
+
},
|
|
338
|
+
"evaluation:list": {
|
|
339
|
+
"aliases": [],
|
|
340
|
+
"args": {},
|
|
341
|
+
"description": "List all evaluations",
|
|
342
|
+
"examples": [
|
|
343
|
+
"<%= config.bin %> <%= command.id %>"
|
|
344
|
+
],
|
|
345
|
+
"flags": {},
|
|
346
|
+
"hasDynamicHelp": false,
|
|
347
|
+
"hiddenAliases": [],
|
|
348
|
+
"id": "evaluation:list",
|
|
349
|
+
"pluginAlias": "@kradle/cli",
|
|
350
|
+
"pluginName": "@kradle/cli",
|
|
351
|
+
"pluginType": "core",
|
|
352
|
+
"strict": true,
|
|
353
|
+
"enableJsonFlag": false,
|
|
354
|
+
"isESM": true,
|
|
355
|
+
"relativePath": [
|
|
356
|
+
"dist",
|
|
357
|
+
"commands",
|
|
358
|
+
"evaluation",
|
|
359
|
+
"list.js"
|
|
360
|
+
]
|
|
361
|
+
},
|
|
362
|
+
"evaluation:run": {
|
|
363
|
+
"aliases": [],
|
|
364
|
+
"args": {
|
|
365
|
+
"name": {
|
|
366
|
+
"description": "Name of the evaluation to run",
|
|
367
|
+
"name": "name",
|
|
368
|
+
"required": true
|
|
369
|
+
}
|
|
370
|
+
},
|
|
371
|
+
"description": "Run an evaluation. If the evaluation had an ongoing iteration, it will resume from the last state.",
|
|
372
|
+
"examples": [
|
|
373
|
+
"<%= config.bin %> <%= command.id %> my-evaluation",
|
|
374
|
+
"<%= config.bin %> <%= command.id %> my-evaluation --new",
|
|
375
|
+
"<%= config.bin %> <%= command.id %> my-evaluation --max-concurrent 10"
|
|
376
|
+
],
|
|
377
|
+
"flags": {
|
|
378
|
+
"new": {
|
|
379
|
+
"char": "n",
|
|
380
|
+
"description": "Start a new iteration of the evaluation",
|
|
381
|
+
"name": "new",
|
|
382
|
+
"allowNo": false,
|
|
383
|
+
"type": "boolean"
|
|
384
|
+
},
|
|
385
|
+
"max-concurrent": {
|
|
386
|
+
"char": "m",
|
|
387
|
+
"description": "Maximum concurrent runs",
|
|
388
|
+
"name": "max-concurrent",
|
|
389
|
+
"default": 5,
|
|
390
|
+
"hasDynamicHelp": false,
|
|
391
|
+
"multiple": false,
|
|
392
|
+
"type": "option"
|
|
393
|
+
}
|
|
394
|
+
},
|
|
395
|
+
"hasDynamicHelp": false,
|
|
396
|
+
"hiddenAliases": [],
|
|
397
|
+
"id": "evaluation:run",
|
|
398
|
+
"pluginAlias": "@kradle/cli",
|
|
399
|
+
"pluginName": "@kradle/cli",
|
|
400
|
+
"pluginType": "core",
|
|
401
|
+
"strict": true,
|
|
402
|
+
"enableJsonFlag": false,
|
|
403
|
+
"isESM": true,
|
|
404
|
+
"relativePath": [
|
|
405
|
+
"dist",
|
|
406
|
+
"commands",
|
|
407
|
+
"evaluation",
|
|
408
|
+
"run.js"
|
|
409
|
+
]
|
|
307
410
|
}
|
|
308
411
|
},
|
|
309
|
-
"version": "0.0.4"
|
|
412
|
+
"version": "0.0.5"
|
|
310
413
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@kradle/cli",
|
|
3
|
-
"version": "0.0.4",
|
|
3
|
+
"version": "0.0.5",
|
|
4
4
|
"description": "Kradle's CLI. Manage challenges, evaluations, agents and more!",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"cli"
|
|
@@ -38,8 +38,10 @@
|
|
|
38
38
|
"chokidar": "^4.0.3",
|
|
39
39
|
"dotenv": "^17.2.3",
|
|
40
40
|
"enquirer": "^2.4.1",
|
|
41
|
+
"ink": "^4.4.1",
|
|
41
42
|
"listr2": "^9.0.5",
|
|
42
43
|
"picocolors": "^1.1.1",
|
|
44
|
+
"react": "^18.2.0",
|
|
43
45
|
"tar": "^7.5.2",
|
|
44
46
|
"zod": "^4.1.12"
|
|
45
47
|
},
|
|
@@ -48,6 +50,8 @@
|
|
|
48
50
|
"@oclif/test": "^4",
|
|
49
51
|
"@types/chai": "^4",
|
|
50
52
|
"@types/node": "^18",
|
|
53
|
+
"@types/react": "^19.2.7",
|
|
54
|
+
"@types/react-dom": "^19.2.3",
|
|
51
55
|
"@types/tar": "^6.1.13",
|
|
52
56
|
"chai": "^4",
|
|
53
57
|
"oclif": "^4",
|
|
@@ -72,6 +76,9 @@
|
|
|
72
76
|
},
|
|
73
77
|
"agent": {
|
|
74
78
|
"description": "Manage agents"
|
|
79
|
+
},
|
|
80
|
+
"evaluation": {
|
|
81
|
+
"description": "Manage and run evaluations"
|
|
75
82
|
}
|
|
76
83
|
}
|
|
77
84
|
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
 * Builds the evaluation manifest.
 *
 * Produces `NUM_RUNS` runs of the same challenge; each run gets
 * `NUM_AGENTS_PER_RUN` distinct agents drawn at random from `AGENTS`.
 * Edit the constants below to configure the evaluation.
 *
 * @returns The manifest with the generated runs and the additional tags.
 */
export function main(): Manifest {
  const CHALLENGE_SLUG = "[INSERT CHALLENGE SLUG HERE]";

  const AGENTS: string[] = [
    "team-kradle:claude-sonnet-4",
    "team-kradle:qwen3-coder",
    "team-kradle:deepseek-chat-v3-1",
    "team-kradle:grok-4",
    "team-kradle:grok-code-fast-1",
    "team-kradle:gpt-5",
    "team-kradle:kimi-k2",
    "team-kradle:gemini-2-5-flash",
    "team-kradle:gemini-2-5-pro",
    "team-kradle:glm-4-5-air",
    "team-kradle:gpt-5-mini",
    "team-kradle:o3-mini",
    "team-kradle:codestral-2508",
  ];

  const NUM_RUNS = 200;
  const NUM_AGENTS_PER_RUN = 4;

  const ADDITIONAL_TAGS: string[] = [];

  // Each run draws its own independent sample of agents.
  const runs: RunConfig[] = Array.from({ length: NUM_RUNS }, () => {
    const lineup = sampleWithoutReplacement(AGENTS, NUM_AGENTS_PER_RUN);
    return {
      challenge_slug: CHALLENGE_SLUG,
      participants: lineup.map((agent) => ({ agent })),
    };
  });

  return { runs, tags: ADDITIONAL_TAGS };
}
|
|
38
|
+
|
|
39
|
+
/**
 * Draws `count` distinct elements from `arr` uniformly at random.
 *
 * The input array is not modified.
 *
 * @param arr Elements to sample from.
 * @param count Number of elements to draw; a non-negative integer no larger than `arr.length`.
 * @returns A new array holding the drawn elements in draw order.
 * @throws Error if `count` is negative, not an integer, or larger than `arr.length`.
 */
function sampleWithoutReplacement<T>(arr: T[], count: number): T[] {
  // A fractional count would make the loop below draw one element too many
  // (2.5 -> 3), and a negative one would silently yield an empty sample.
  if (!Number.isInteger(count) || count < 0) {
    throw new Error("Sample size must be a non-negative integer.");
  }
  if (count > arr.length) {
    throw new Error("Sample size cannot be larger than array length.");
  }

  const pool = [...arr];
  const picked: T[] = [];

  for (let i = 0; i < count; i++) {
    const idx = Math.floor(Math.random() * pool.length);
    // splice removes the drawn element from the pool and returns it, so it
    // cannot be drawn again.
    picked.push(...pool.splice(idx, 1));
  }

  return picked;
}
|
|
55
|
+
|
|
56
|
+
/** One participant of a run. */
type Participant = {
  /** Agent string, e.g. "team-kradle:gpt-5". */
  agent: string;
  /** Optional role of the agent in the run. */
  role?: string;
};
|
|
60
|
+
|
|
61
|
+
/** Configuration of a single run. */
type RunConfig = {
  /** Slug of the challenge to run. */
  challenge_slug: string;
  /** Participants taking part in the run. */
  participants: Participant[];
};
|
|
65
|
+
|
|
66
|
+
/** Value returned by `main()`: the runs to execute and optional tags. */
type Manifest = {
  /** Runs that make up the evaluation. */
  runs: RunConfig[];
  /** Optional list of tags. */
  tags?: string[];
};
|
|
@@ -2,5 +2,4 @@ WEB_API_URL=https://dev-api.kradle.ai/v0 #https://api.kradle.ai/v0
|
|
|
2
2
|
WEB_URL=https://dev.kradle.ai #https:/.kradle.ai/workbench
|
|
3
3
|
STUDIO_API_URL=http://localhost:2999/api/v0
|
|
4
4
|
STUDIO_URL=kradle-dev://open #kradle://://open
|
|
5
|
-
GCS_BUCKET=mckradle-3c267.firebasestorage.app #kradle-prod-storage
|
|
6
5
|
KRADLE_CHALLENGES_PATH=~/Documents/kradle-studio/challenges
|
|
@@ -2,5 +2,4 @@ WEB_API_URL=https://api.kradle.ai/v0 #https://dev-api.kradle.ai/v0
|
|
|
2
2
|
WEB_URL=https://kradle.ai #https://dev.kradle.ai
|
|
3
3
|
STUDIO_API_URL=http://localhost:2999/api/v0
|
|
4
4
|
STUDIO_URL=kradle://open #kradle-dev://://open
|
|
5
|
-
GCS_BUCKET=kradle-prod-storage #mckradle-3c267.firebasestorage.app
|
|
6
5
|
KRADLE_CHALLENGES_PATH=~/Documents/kradle-studio/challenges
|