@kradle/cli 0.0.17 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +93 -65
- package/dist/commands/agent/list.d.ts +4 -0
- package/dist/commands/agent/list.js +6 -4
- package/dist/commands/challenge/build.d.ts +9 -1
- package/dist/commands/challenge/build.js +40 -12
- package/dist/commands/challenge/create.d.ts +5 -1
- package/dist/commands/challenge/create.js +17 -18
- package/dist/commands/challenge/delete.d.ts +4 -1
- package/dist/commands/challenge/delete.js +5 -5
- package/dist/commands/challenge/list.d.ts +5 -0
- package/dist/commands/challenge/list.js +11 -10
- package/dist/commands/challenge/run.d.ts +8 -1
- package/dist/commands/challenge/run.js +13 -8
- package/dist/commands/challenge/watch.d.ts +4 -1
- package/dist/commands/challenge/watch.js +8 -8
- package/dist/commands/{evaluation → experiment}/create.d.ts +4 -0
- package/dist/commands/{evaluation → experiment}/create.js +22 -21
- package/dist/commands/{evaluation → experiment}/list.js +17 -19
- package/dist/commands/experiment/recordings.d.ts +19 -0
- package/dist/commands/experiment/recordings.js +416 -0
- package/dist/commands/experiment/run.d.ts +17 -0
- package/dist/commands/experiment/run.js +67 -0
- package/dist/commands/init.js +2 -2
- package/dist/lib/api-client.d.ts +51 -10
- package/dist/lib/api-client.js +108 -39
- package/dist/lib/arguments.d.ts +3 -2
- package/dist/lib/arguments.js +5 -3
- package/dist/lib/challenge.d.ts +13 -18
- package/dist/lib/challenge.js +58 -62
- package/dist/lib/experiment/experimenter.d.ts +92 -0
- package/dist/lib/experiment/experimenter.js +368 -0
- package/dist/lib/{evaluation → experiment}/index.d.ts +1 -1
- package/dist/lib/{evaluation → experiment}/index.js +1 -1
- package/dist/lib/{evaluation → experiment}/runner.d.ts +2 -0
- package/dist/lib/{evaluation → experiment}/runner.js +21 -2
- package/dist/lib/{evaluation → experiment}/tui.d.ts +1 -1
- package/dist/lib/{evaluation → experiment}/tui.js +3 -3
- package/dist/lib/{evaluation → experiment}/types.d.ts +10 -4
- package/dist/lib/{evaluation → experiment}/types.js +5 -3
- package/dist/lib/flags.d.ts +47 -0
- package/dist/lib/flags.js +63 -0
- package/dist/lib/schemas.d.ts +63 -2
- package/dist/lib/schemas.js +27 -1
- package/dist/lib/utils.d.ts +9 -10
- package/dist/lib/utils.js +12 -12
- package/oclif.manifest.json +423 -64
- package/package.json +11 -8
- package/static/challenge.ts +12 -13
- package/static/experiment_template.ts +114 -0
- package/static/project_template/dev.env +5 -5
- package/static/project_template/prod.env +4 -4
- package/static/project_template/tsconfig.json +1 -1
- package/dist/commands/challenge/multi-upload.d.ts +0 -6
- package/dist/commands/challenge/multi-upload.js +0 -80
- package/dist/commands/evaluation/run.d.ts +0 -13
- package/dist/commands/evaluation/run.js +0 -61
- package/dist/lib/config.d.ts +0 -12
- package/dist/lib/config.js +0 -49
- package/dist/lib/evaluation/evaluator.d.ts +0 -88
- package/dist/lib/evaluation/evaluator.js +0 -268
- package/static/evaluation_template.ts +0 -69
- /package/dist/commands/{evaluation → experiment}/list.d.ts +0 -0
|
@@ -1,268 +0,0 @@
|
|
|
1
|
-
import fs from "node:fs/promises";
|
|
2
|
-
import path from "node:path";
|
|
3
|
-
import { executeNodeCommand, openInBrowser } from "../utils.js";
|
|
4
|
-
import { Runner } from "./runner.js";
|
|
5
|
-
import { TUI } from "./tui.js";
|
|
6
|
-
import { EvaluationMetadataSchema, ManifestSchema, ProgressSchema } from "./types.js";
|
|
7
|
-
/**
 * Manages one named evaluation stored on disk under
 * `<cwd>/evaluations/<name>`.
 *
 * Layout used by this class:
 *   - `config.ts`                      master config (must export `main()`)
 *   - `.evaluation.json`               metadata holding `currentIteration`
 *   - `iterations/NNN/config.ts`       snapshot of the master config
 *   - `iterations/NNN/manifest.json`   manifest produced by running the snapshot
 *   - `iterations/NNN/progress.json`   progress entries reported by the runner
 */
export class Evaluator {
    name;
    config;
    api;
    evaluationDir;
    metadataPath;
    runner;
    tui;
    currentIteration;

    /**
     * @param name   Evaluation name; also the directory name under `evaluations/`.
     * @param config CLI configuration; `WEB_URL` is read from it and the whole
     *               object is forwarded to `executeNodeCommand`.
     * @param api    API client handed to the `Runner`.
     */
    constructor(name, config, api) {
        this.name = name;
        this.config = config;
        this.api = api;
        this.evaluationDir = path.resolve(process.cwd(), "evaluations", name);
        this.metadataPath = path.join(this.evaluationDir, ".evaluation.json");
    }

    /**
     * Build the inline script passed to `node -e` that loads a config file,
     * calls its `main()` and prints the result as JSON.
     *
     * The path is emitted with `JSON.stringify` so that backslashes (Windows
     * paths) and quotes end up as a valid, correctly escaped string literal
     * instead of corrupting the script.
     *
     * @param configPath Path of the config file to load.
     * @returns The JavaScript source to execute.
     */
    static buildConfigScript(configPath) {
        return `console.log(JSON.stringify(require(${JSON.stringify(configPath)}).main()));`;
    }

    /**
     * Get paths for a specific iteration.
     *
     * @param iteration Iteration number; rendered zero-padded to three digits.
     * @returns The iteration directory plus its config, manifest and progress paths.
     */
    getIterationPaths(iteration) {
        const iterationDir = path.join(this.evaluationDir, "iterations", iteration.toString().padStart(3, "0"));
        return {
            iterationDir,
            configPath: path.join(iterationDir, "config.ts"),
            manifestPath: path.join(iterationDir, "manifest.json"),
            progressPath: path.join(iterationDir, "progress.json"),
        };
    }

    /** Path of the master `config.ts` for this evaluation. */
    get configPath() {
        return path.join(this.evaluationDir, "config.ts");
    }

    /**
     * Get the current iteration directory path.
     *
     * @throws Error when no iteration has been selected or created yet.
     */
    getCurrentIterationDir() {
        if (this.currentIteration === undefined) {
            throw new Error("No iteration set");
        }
        return this.getIterationPaths(this.currentIteration).iterationDir;
    }

    /**
     * Check if the evaluation directory exists.
     *
     * @returns `true` when the directory is accessible, `false` otherwise.
     */
    async exists() {
        try {
            await fs.access(this.evaluationDir);
            return true;
        }
        catch {
            return false;
        }
    }

    /**
     * Check if the master `config.ts` exists.
     *
     * @returns `true` when the file is accessible, `false` otherwise.
     */
    async configExists() {
        try {
            await fs.access(this.configPath);
            return true;
        }
        catch {
            return false;
        }
    }

    /**
     * Load evaluation metadata.
     *
     * @returns The parsed metadata, or `null` on any failure (missing file,
     *          invalid JSON, or schema validation error).
     */
    async loadMetadata() {
        try {
            const content = await fs.readFile(this.metadataPath, "utf-8");
            return EvaluationMetadataSchema.parse(JSON.parse(content));
        }
        catch {
            return null;
        }
    }

    /**
     * Save evaluation metadata as pretty-printed JSON.
     *
     * @param metadata Metadata object to persist.
     */
    async saveMetadata(metadata) {
        await fs.writeFile(this.metadataPath, JSON.stringify(metadata, null, 2));
    }

    /**
     * Get the current iteration number.
     *
     * @returns The stored `currentIteration`, or -1 when no metadata could be loaded.
     */
    async getCurrentIterationNumber() {
        const metadata = await this.loadMetadata();
        return metadata?.currentIteration ?? -1;
    }

    /**
     * Create a new iteration: make its directory, snapshot the master config
     * into it, generate and store the manifest, then record the new iteration
     * number in the metadata file.
     *
     * @returns The number of the iteration that was created.
     */
    async createNewIteration() {
        const newIteration = (await this.getCurrentIterationNumber()) + 1;
        const paths = this.getIterationPaths(newIteration);
        await fs.mkdir(paths.iterationDir, { recursive: true });
        // Snapshot the master config so later edits do not affect this iteration.
        await fs.copyFile(this.configPath, paths.configPath);
        // The manifest is generated from the snapshot, not from the master config.
        const manifest = await this.generateManifest(paths.configPath);
        await fs.writeFile(paths.manifestPath, JSON.stringify(manifest, null, 2));
        // Metadata is updated last, once the iteration files are in place.
        await this.saveMetadata({ currentIteration: newIteration });
        this.currentIteration = newIteration;
        return newIteration;
    }

    /**
     * Get or create an iteration.
     *
     * @param createNew If true, always create a new iteration. Otherwise reuse
     *                  the current iteration, creating the first one if none exists.
     * @returns The selected iteration number.
     */
    async getOrCreateIteration(createNew) {
        if (createNew) {
            return await this.createNewIteration();
        }
        const currentIteration = await this.getCurrentIterationNumber();
        if (currentIteration < 0) {
            return await this.createNewIteration();
        }
        this.currentIteration = currentIteration;
        return currentIteration;
    }

    /**
     * Load and validate the manifest of an iteration.
     *
     * @param iteration Iteration number.
     * @returns The manifest validated by `ManifestSchema`.
     * @throws When the file cannot be read, is not valid JSON, or fails validation.
     */
    async loadManifest(iteration) {
        const { manifestPath } = this.getIterationPaths(iteration);
        const content = await fs.readFile(manifestPath, "utf-8");
        return ManifestSchema.parse(JSON.parse(content));
    }

    /**
     * Load progress of an iteration.
     *
     * @param iteration Iteration number.
     * @returns The validated progress, or `null` on any failure (missing file,
     *          invalid JSON, or schema validation error).
     */
    async loadProgress(iteration) {
        try {
            const { progressPath } = this.getIterationPaths(iteration);
            const content = await fs.readFile(progressPath, "utf-8");
            return ProgressSchema.parse(JSON.parse(content));
        }
        catch {
            return null;
        }
    }

    /**
     * Save the runner's progress entries to the current iteration.
     * Does nothing when there is no runner or no current iteration.
     */
    async saveProgress() {
        if (!this.runner || this.currentIteration === undefined) {
            return;
        }
        const { progressPath } = this.getIterationPaths(this.currentIteration);
        const progress = {
            entries: this.runner.getProgressEntries(),
            lastUpdated: Date.now(),
        };
        await fs.writeFile(progressPath, JSON.stringify(progress, null, 2));
    }

    /**
     * Execute a config file and validate its output as a manifest.
     *
     * @param configPath Path of the config file to execute.
     * @returns The manifest validated by `ManifestSchema`.
     */
    async generateManifest(configPath) {
        const manifest = await this.executeConfigFile(configPath);
        return ManifestSchema.parse(manifest);
    }

    /**
     * Execute a config file and return whatever its `main()` produced.
     *
     * The file is run through `executeNodeCommand` rather than imported
     * directly: an imported module would be cached and the import cache
     * cannot be invalidated, so edits to the config would not be picked up.
     *
     * @param configPath Path of the config file to execute.
     * @returns The value parsed from the command's JSON output.
     */
    async executeConfigFile(configPath) {
        const stdout = await executeNodeCommand([
            "--experimental-transform-types",
            "--no-warnings",
            "-e",
            Evaluator.buildConfigScript(configPath),
        ], this.config);
        return JSON.parse(stdout.trim());
    }

    /**
     * Run the evaluation.
     *
     * @param options `new` forces a fresh iteration, `maxConcurrent` is passed
     *                to the runner, and `openMetabase` (default true) controls
     *                whether the analysis dashboard is opened afterwards.
     * @throws Error joining the error messages of all runs that ended in the
     *         "error" status.
     */
    async run(options) {
        const iteration = await this.getOrCreateIteration(options.new);
        const manifest = await this.loadManifest(iteration);
        // Two mandatory tags: "eval-<name>" and "eval-<name>-iteration-<iteration>",
        // followed by any tags declared in the manifest.
        const evaluationTag = `eval-${this.name}`;
        const iterationTag = `${evaluationTag}-iteration-${iteration}`;
        const tags = [evaluationTag, iterationTag, ...(manifest.tags ?? [])];
        this.runner = new Runner(manifest.runs, this.api, this.config.WEB_URL, {
            maxConcurrent: options.maxConcurrent,
            tags: tags,
            onStateChange: () => this.onRunStateChange(),
        });
        // Restore progress saved by a previous invocation of this iteration, if any.
        const progress = await this.loadProgress(iteration);
        if (progress) {
            this.runner.restoreProgress(progress.entries);
        }
        this.tui = new TUI({
            evaluationName: `${this.name} (iteration ${iteration})`,
            onQuit: () => this.handleQuit(),
            onOpenRun: (index) => this.openRun(index),
        });
        // Push the initial state before the TUI starts.
        this.tui.updateStates(this.runner.getAllStates());
        this.tui.updateStatusCounts(this.runner.getStatusCounts());
        this.tui.start();
        try {
            await this.runner.execute();
            // Final save once every run has finished.
            await this.saveProgress();
        }
        finally {
            // Always stop the TUI, even when execution throws.
            this.tui.stop();
            console.log("");
        }
        if (options.openMetabase ?? true) {
            // Encode the tag so special characters in the evaluation name cannot
            // break or alter the query string.
            const runTags = encodeURIComponent(iterationTag);
            openInBrowser(`https://daunt-fair.metabaseapp.com/dashboard/10-runs-analysis?run_tags=${runTags}`);
        }
        const failedRuns = this.runner.getAllStates().filter((state) => state.status === "error");
        if (failedRuns.length > 0) {
            throw new Error(failedRuns.map((state) => state.error).join("\n\n"));
        }
    }

    /**
     * Handle a state change from the runner: refresh the TUI and persist progress.
     */
    onRunStateChange() {
        if (this.tui && this.runner) {
            this.tui.updateStates(this.runner.getAllStates());
            this.tui.updateStatusCounts(this.runner.getStatusCounts());
        }
        // Best-effort save on every state change; a failed write is deliberately
        // ignored here because run() performs a final save after execution.
        void this.saveProgress().catch(() => { });
    }

    /**
     * Handle a quit request: stop the runner and the TUI, then exit the process.
     */
    handleQuit() {
        this.runner?.stop();
        this.tui?.stop();
        process.exit(0);
    }

    /**
     * Open a run in the browser.
     *
     * @param index Index of the run, as understood by the runner's `getRunUrl`.
     */
    openRun(index) {
        const url = this.runner?.getRunUrl(index);
        if (url) {
            openInBrowser(url);
        }
    }
}
|
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
/**
 * Entry point of the evaluation template.
 *
 * Produces a manifest of `NUM_RUNS` runs against `CHALLENGE_SLUG`, each with
 * `NUM_AGENTS_PER_RUN` distinct agents drawn at random from `AGENTS`.
 * Edit the constants below to configure the evaluation.
 *
 * @returns The manifest: the generated runs plus `ADDITIONAL_TAGS`.
 */
export function main(): Manifest {
  const CHALLENGE_SLUG = "[INSERT CHALLENGE SLUG HERE]";

  const AGENTS: string[] = [
    "team-kradle:claude-sonnet-4",
    "team-kradle:qwen3-coder",
    "team-kradle:deepseek-chat-v3-1",
    "team-kradle:grok-4",
    "team-kradle:grok-code-fast-1",
    "team-kradle:gpt-5",
    "team-kradle:kimi-k2",
    "team-kradle:gemini-2-5-flash",
    "team-kradle:gemini-2-5-pro",
    "team-kradle:glm-4-5-air",
    "team-kradle:gpt-5-mini",
    "team-kradle:o3-mini",
    "team-kradle:codestral-2508",
  ];

  const NUM_RUNS = 200;
  const NUM_AGENTS_PER_RUN = 4;

  const ADDITIONAL_TAGS: string[] = [];

  // One independent random draw of agents per run.
  const runs: RunConfig[] = Array.from({ length: NUM_RUNS }, () => {
    const selectedAgents = sampleWithoutReplacement(AGENTS, NUM_AGENTS_PER_RUN);
    return {
      challenge_slug: CHALLENGE_SLUG,
      participants: selectedAgents.map((agent) => ({ agent })),
    };
  });

  return { runs, tags: ADDITIONAL_TAGS };
}
|
|
38
|
-
|
|
39
|
-
/**
 * Draws `count` distinct elements from `arr` in random order.
 *
 * Each draw removes the element at a random index (via `Math.random`) from a
 * working copy, so no element is returned twice and `arr` is never mutated.
 *
 * @param arr   Elements to sample from.
 * @param count Number of elements to draw; must be an integer in `[0, arr.length]`.
 * @returns A new array holding the drawn elements in draw order.
 * @throws Error when `count` exceeds `arr.length`, or is negative or not an integer.
 */
function sampleWithoutReplacement<T>(arr: readonly T[], count: number): T[] {
  if (count > arr.length) {
    throw new Error("Sample size cannot be larger than array length.");
  }
  // A fractional count would otherwise yield ceil(count) items, and a negative
  // or NaN count would silently yield an empty sample.
  if (!Number.isInteger(count) || count < 0) {
    throw new Error("Sample size must be a non-negative integer.");
  }

  const pool = [...arr];
  const picked: T[] = [];

  while (picked.length < count) {
    const idx = Math.floor(Math.random() * pool.length);
    // splice removes the chosen element from the pool and hands it back.
    picked.push(...pool.splice(idx, 1));
  }

  return picked;
}
|
|
55
|
-
|
|
56
|
-
/** One participant of a run. */
interface Participant {
  /** Agent identifier, e.g. "team-kradle:gpt-5". */
  agent: string;
  /** Optional role for the agent in the run. */
  role?: string;
}

/** A single run: one challenge and the participants taking part in it. */
interface RunConfig {
  challenge_slug: string;
  participants: Participant[];
}

/** Value returned by `main()`: the runs to execute plus optional extra tags. */
interface Manifest {
  runs: RunConfig[];
  tags?: string[];
}
|
|
File without changes
|