@kradle/cli 0.0.17 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/README.md +93 -65
  2. package/dist/commands/agent/list.d.ts +4 -0
  3. package/dist/commands/agent/list.js +6 -4
  4. package/dist/commands/challenge/build.d.ts +9 -1
  5. package/dist/commands/challenge/build.js +40 -12
  6. package/dist/commands/challenge/create.d.ts +5 -1
  7. package/dist/commands/challenge/create.js +17 -18
  8. package/dist/commands/challenge/delete.d.ts +4 -1
  9. package/dist/commands/challenge/delete.js +5 -5
  10. package/dist/commands/challenge/list.d.ts +5 -0
  11. package/dist/commands/challenge/list.js +11 -10
  12. package/dist/commands/challenge/run.d.ts +8 -1
  13. package/dist/commands/challenge/run.js +13 -8
  14. package/dist/commands/challenge/watch.d.ts +4 -1
  15. package/dist/commands/challenge/watch.js +8 -8
  16. package/dist/commands/{evaluation → experiment}/create.d.ts +4 -0
  17. package/dist/commands/{evaluation → experiment}/create.js +22 -21
  18. package/dist/commands/{evaluation → experiment}/list.js +17 -19
  19. package/dist/commands/experiment/recordings.d.ts +19 -0
  20. package/dist/commands/experiment/recordings.js +416 -0
  21. package/dist/commands/experiment/run.d.ts +17 -0
  22. package/dist/commands/experiment/run.js +67 -0
  23. package/dist/commands/init.js +2 -2
  24. package/dist/lib/api-client.d.ts +51 -10
  25. package/dist/lib/api-client.js +108 -39
  26. package/dist/lib/arguments.d.ts +3 -2
  27. package/dist/lib/arguments.js +5 -3
  28. package/dist/lib/challenge.d.ts +13 -18
  29. package/dist/lib/challenge.js +58 -62
  30. package/dist/lib/experiment/experimenter.d.ts +92 -0
  31. package/dist/lib/experiment/experimenter.js +368 -0
  32. package/dist/lib/{evaluation → experiment}/index.d.ts +1 -1
  33. package/dist/lib/{evaluation → experiment}/index.js +1 -1
  34. package/dist/lib/{evaluation → experiment}/runner.d.ts +2 -0
  35. package/dist/lib/{evaluation → experiment}/runner.js +21 -2
  36. package/dist/lib/{evaluation → experiment}/tui.d.ts +1 -1
  37. package/dist/lib/{evaluation → experiment}/tui.js +3 -3
  38. package/dist/lib/{evaluation → experiment}/types.d.ts +10 -4
  39. package/dist/lib/{evaluation → experiment}/types.js +5 -3
  40. package/dist/lib/flags.d.ts +47 -0
  41. package/dist/lib/flags.js +63 -0
  42. package/dist/lib/schemas.d.ts +63 -2
  43. package/dist/lib/schemas.js +27 -1
  44. package/dist/lib/utils.d.ts +9 -10
  45. package/dist/lib/utils.js +12 -12
  46. package/oclif.manifest.json +423 -64
  47. package/package.json +11 -8
  48. package/static/challenge.ts +12 -13
  49. package/static/experiment_template.ts +114 -0
  50. package/static/project_template/dev.env +5 -5
  51. package/static/project_template/prod.env +4 -4
  52. package/static/project_template/tsconfig.json +1 -1
  53. package/dist/commands/challenge/multi-upload.d.ts +0 -6
  54. package/dist/commands/challenge/multi-upload.js +0 -80
  55. package/dist/commands/evaluation/run.d.ts +0 -13
  56. package/dist/commands/evaluation/run.js +0 -61
  57. package/dist/lib/config.d.ts +0 -12
  58. package/dist/lib/config.js +0 -49
  59. package/dist/lib/evaluation/evaluator.d.ts +0 -88
  60. package/dist/lib/evaluation/evaluator.js +0 -268
  61. package/static/evaluation_template.ts +0 -69
  62. /package/dist/commands/{evaluation → experiment}/list.d.ts +0 -0
@@ -1,268 +0,0 @@
1
- import fs from "node:fs/promises";
2
- import path from "node:path";
3
- import { executeNodeCommand, openInBrowser } from "../utils.js";
4
- import { Runner } from "./runner.js";
5
- import { TUI } from "./tui.js";
6
- import { EvaluationMetadataSchema, ManifestSchema, ProgressSchema } from "./types.js";
7
- export class Evaluator {
8
- name;
9
- config;
10
- api;
11
- evaluationDir;
12
- metadataPath;
13
- runner;
14
- tui;
15
- currentIteration;
16
- constructor(name, config, api) {
17
- this.name = name;
18
- this.config = config;
19
- this.api = api;
20
- this.evaluationDir = path.resolve(process.cwd(), "evaluations", name);
21
- this.metadataPath = path.join(this.evaluationDir, ".evaluation.json");
22
- }
23
- /**
24
- * Get paths for a specific iteration
25
- */
26
- getIterationPaths(iteration) {
27
- const iterationDir = path.join(this.evaluationDir, "iterations", iteration.toString().padStart(3, "0"));
28
- return {
29
- iterationDir,
30
- configPath: path.join(iterationDir, "config.ts"),
31
- manifestPath: path.join(iterationDir, "manifest.json"),
32
- progressPath: path.join(iterationDir, "progress.json"),
33
- };
34
- }
35
- get configPath() {
36
- return path.join(this.evaluationDir, "config.ts");
37
- }
38
- /**
39
- * Get the current iteration directory path
40
- */
41
- getCurrentIterationDir() {
42
- if (this.currentIteration === undefined) {
43
- throw new Error("No iteration set");
44
- }
45
- return this.getIterationPaths(this.currentIteration).iterationDir;
46
- }
47
- /**
48
- * Check if evaluation exists
49
- */
50
- async exists() {
51
- try {
52
- await fs.access(this.evaluationDir);
53
- return true;
54
- }
55
- catch {
56
- return false;
57
- }
58
- }
59
- /**
60
- * Check if config.ts exists (master config)
61
- */
62
- async configExists() {
63
- try {
64
- await fs.access(this.configPath);
65
- return true;
66
- }
67
- catch {
68
- return false;
69
- }
70
- }
71
- /**
72
- * Load evaluation metadata
73
- */
74
- async loadMetadata() {
75
- try {
76
- const content = await fs.readFile(this.metadataPath, "utf-8");
77
- const data = JSON.parse(content);
78
- return EvaluationMetadataSchema.parse(data);
79
- }
80
- catch {
81
- return null;
82
- }
83
- }
84
- /**
85
- * Save evaluation metadata
86
- */
87
- async saveMetadata(metadata) {
88
- await fs.writeFile(this.metadataPath, JSON.stringify(metadata, null, 2));
89
- }
90
- /**
91
- * Get the current iteration number, or -1 if none exists
92
- */
93
- async getCurrentIterationNumber() {
94
- const metadata = await this.loadMetadata();
95
- return metadata?.currentIteration ?? -1;
96
- }
97
- /**
98
- * Create a new iteration
99
- */
100
- async createNewIteration() {
101
- const currentIteration = await this.getCurrentIterationNumber();
102
- const newIteration = currentIteration + 1;
103
- const paths = this.getIterationPaths(newIteration);
104
- // Create iteration directory
105
- await fs.mkdir(paths.iterationDir, { recursive: true });
106
- // Copy master config to iteration
107
- const masterConfigPath = path.join(this.evaluationDir, "config.ts");
108
- await fs.copyFile(masterConfigPath, paths.configPath);
109
- // Generate manifest from config
110
- const manifest = await this.generateManifest(paths.configPath);
111
- await fs.writeFile(paths.manifestPath, JSON.stringify(manifest, null, 2));
112
- // Update metadata
113
- await this.saveMetadata({ currentIteration: newIteration });
114
- this.currentIteration = newIteration;
115
- return newIteration;
116
- }
117
- /**
118
- * Get or create an iteration
119
- * @param createNew - If true, always create a new iteration. Otherwise, use current iteration or create first one if none exists.
120
- */
121
- async getOrCreateIteration(createNew) {
122
- if (createNew) {
123
- return await this.createNewIteration();
124
- }
125
- const currentIteration = await this.getCurrentIterationNumber();
126
- if (currentIteration < 0) {
127
- return await this.createNewIteration();
128
- }
129
- this.currentIteration = currentIteration;
130
- return currentIteration;
131
- }
132
- /**
133
- * Load manifest from iteration
134
- */
135
- async loadManifest(iteration) {
136
- const paths = this.getIterationPaths(iteration);
137
- const content = await fs.readFile(paths.manifestPath, "utf-8");
138
- const data = JSON.parse(content);
139
- return ManifestSchema.parse(data);
140
- }
141
- /**
142
- * Load progress from iteration
143
- */
144
- async loadProgress(iteration) {
145
- try {
146
- const paths = this.getIterationPaths(iteration);
147
- const content = await fs.readFile(paths.progressPath, "utf-8");
148
- const data = JSON.parse(content);
149
- return ProgressSchema.parse(data);
150
- }
151
- catch {
152
- return null;
153
- }
154
- }
155
- /**
156
- * Save progress to current iteration
157
- */
158
- async saveProgress() {
159
- if (!this.runner || this.currentIteration === undefined)
160
- return;
161
- const paths = this.getIterationPaths(this.currentIteration);
162
- const progress = {
163
- entries: this.runner.getProgressEntries(),
164
- lastUpdated: Date.now(),
165
- };
166
- await fs.writeFile(paths.progressPath, JSON.stringify(progress, null, 2));
167
- }
168
- /**
169
- * Execute config.ts to generate manifest
170
- */
171
- async generateManifest(configPath) {
172
- const manifest = await this.executeConfigFile(configPath);
173
- return ManifestSchema.parse(manifest);
174
- }
175
- /**
176
- * Execute config.ts file and return the manifest
177
- */
178
- async executeConfigFile(configPath) {
179
- // We spawn a new NodeJS process to execute & log the config file.
180
- // We can't directly import the file because it would be cached, and import cache can't be invalidated.
181
- const stdout = await executeNodeCommand([
182
- "--experimental-transform-types",
183
- "--no-warnings",
184
- "-e",
185
- `console.log(JSON.stringify(require("${configPath}").main()));`,
186
- ], this.config);
187
- return JSON.parse(stdout.trim());
188
- }
189
- /**
190
- * Run the evaluation
191
- */
192
- async run(options) {
193
- const iteration = await this.getOrCreateIteration(options.new);
194
- // Load manifest
195
- const manifest = await this.loadManifest(iteration);
196
- // We have 2 mandatory tags: "eval-<evaluation-name>" and "eval-<evaluation-name>-iteration-<iteration>"
197
- const evaluationTag = `eval-${this.name}`;
198
- const iterationTag = `${evaluationTag}-iteration-${iteration}`;
199
- const tags = [evaluationTag, iterationTag, ...(manifest.tags ?? [])];
200
- // Create runner
201
- this.runner = new Runner(manifest.runs, this.api, this.config.WEB_URL, {
202
- maxConcurrent: options.maxConcurrent,
203
- tags: tags,
204
- onStateChange: () => this.onRunStateChange(),
205
- });
206
- // Restore progress if applicable
207
- const progress = await this.loadProgress(iteration);
208
- if (progress) {
209
- this.runner.restoreProgress(progress.entries);
210
- }
211
- // Create TUI
212
- this.tui = new TUI({
213
- evaluationName: `${this.name} (iteration ${iteration})`,
214
- onQuit: () => this.handleQuit(),
215
- onOpenRun: (index) => this.openRun(index),
216
- });
217
- // Initial state update
218
- this.tui.updateStates(this.runner.getAllStates());
219
- this.tui.updateStatusCounts(this.runner.getStatusCounts());
220
- // Start TUI
221
- this.tui.start();
222
- try {
223
- // Execute runs
224
- await this.runner.execute();
225
- // Final save
226
- await this.saveProgress();
227
- }
228
- finally {
229
- this.tui.stop();
230
- console.log("");
231
- }
232
- if (options.openMetabase ?? true) {
233
- openInBrowser(`https://daunt-fair.metabaseapp.com/dashboard/10-runs-analysis?run_tags=${iterationTag}`);
234
- }
235
- const errors = this.runner?.getAllStates().filter((state) => state.status === "error");
236
- if (errors?.length > 0) {
237
- throw new Error(`${errors.map((error) => error.error).join("\n\n")}`);
238
- }
239
- }
240
- /**
241
- * Handle state change from runner
242
- */
243
- onRunStateChange() {
244
- if (this.tui && this.runner) {
245
- this.tui.updateStates(this.runner.getAllStates());
246
- this.tui.updateStatusCounts(this.runner.getStatusCounts());
247
- }
248
- // Periodically save progress
249
- this.saveProgress().catch(() => { });
250
- }
251
- /**
252
- * Handle quit request
253
- */
254
- handleQuit() {
255
- this.runner?.stop();
256
- this.tui?.stop();
257
- process.exit(0);
258
- }
259
- /**
260
- * Open run in browser
261
- */
262
- openRun(index) {
263
- const url = this.runner?.getRunUrl(index);
264
- if (url) {
265
- openInBrowser(url);
266
- }
267
- }
268
- }
@@ -1,69 +0,0 @@
1
- export function main(): Manifest {
2
- const CHALLENGE_SLUG = "[INSERT CHALLENGE SLUG HERE]";
3
-
4
- const AGENTS: string[] = [
5
- "team-kradle:claude-sonnet-4",
6
- "team-kradle:qwen3-coder",
7
- "team-kradle:deepseek-chat-v3-1",
8
- "team-kradle:grok-4",
9
- "team-kradle:grok-code-fast-1",
10
- "team-kradle:gpt-5",
11
- "team-kradle:kimi-k2",
12
- "team-kradle:gemini-2-5-flash",
13
- "team-kradle:gemini-2-5-pro",
14
- "team-kradle:glm-4-5-air",
15
- "team-kradle:gpt-5-mini",
16
- "team-kradle:o3-mini",
17
- "team-kradle:codestral-2508",
18
- ];
19
-
20
- const NUM_RUNS = 200;
21
- const NUM_AGENTS_PER_RUN = 4;
22
-
23
- const ADDITIONAL_TAGS: string[] = [];
24
-
25
- const runs: RunConfig[] = [];
26
-
27
- for (let i = 0; i < NUM_RUNS; i++) {
28
- const selectedAgents = sampleWithoutReplacement(AGENTS, NUM_AGENTS_PER_RUN);
29
-
30
- runs.push({
31
- challenge_slug: CHALLENGE_SLUG,
32
- participants: selectedAgents.map((agent) => ({ agent })),
33
- });
34
- }
35
-
36
- return { runs, tags: ADDITIONAL_TAGS };
37
- }
38
-
39
- function sampleWithoutReplacement<T>(arr: T[], count: number): T[] {
40
- if (count > arr.length) {
41
- throw new Error("Sample size cannot be larger than array length.");
42
- }
43
-
44
- const copy = [...arr];
45
- const result: T[] = [];
46
-
47
- for (let i = 0; i < count; i++) {
48
- const idx = Math.floor(Math.random() * copy.length);
49
- result.push(copy[idx]);
50
- copy.splice(idx, 1);
51
- }
52
-
53
- return result;
54
- }
55
-
56
- type Participant = {
57
- agent: string;
58
- role?: string;
59
- };
60
-
61
- type RunConfig = {
62
- challenge_slug: string;
63
- participants: Participant[];
64
- };
65
-
66
- type Manifest = {
67
- runs: RunConfig[];
68
- tags?: string[];
69
- };