@kradle/cli 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -45,7 +45,6 @@ WEB_URL=https://kradle.ai
45
45
  STUDIO_API_URL=http://localhost:8080
46
46
  STUDIO_URL=kradle-studio://
47
47
  KRADLE_API_KEY=your-api-key
48
- GCS_BUCKET=your-gcs-bucket
49
48
  KRADLE_CHALLENGES_PATH=~/Documents/kradle-studio/challenges
50
49
  ```
51
50
 
@@ -126,6 +125,29 @@ kradle challenge multi-upload
126
125
 
127
126
  Provides an interactive UI to select multiple challenges and uploads them in parallel.
128
127
 
128
+ ### Evaluations (beta)
129
+
130
+ Plan and execute batches of runs across challenges/agents, with resumable iterations and a TUI.
131
+
132
+ - **Init**: scaffold an evaluation config `evaluations/<name>/config.ts`
133
+ ```bash
134
+ kradle evaluation init <name>
135
+ ```
136
+ - **List**: list local evaluations
137
+ ```bash
138
+ kradle evaluation list
139
+ ```
140
+ - **Run**: execute or resume an evaluation (iterations stored under `evaluations/<name>/iterations/`)
141
+ ```bash
142
+ kradle evaluation run <name> [--new] [--max-concurrent N]
143
+ ```
144
+
145
+ Features:
146
+ - Iterations: `--new` starts a new iteration; otherwise resumes the latest.
147
+ - Resumable state: progress is persisted per iteration; in-flight runs are re-polled on resume, completed runs stay completed.
148
+ - Ink TUI: live status counts, elapsed times, scrollable run list; keys `q/Ctrl+C` quit, `↑/↓/j/k` move, `o` open run URL.
149
+ - Per-iteration manifest: generated from the evaluation `config.ts` into `manifest.json` before runs start.
150
+
129
151
  ## Development
130
152
 
131
153
  ### Setup
@@ -90,7 +90,8 @@ export default class Watch extends Command {
90
90
  this.log(pc.blue(`\nStarting watch mode for ${pc.bold(challenge.shortSlug)}\n`));
91
91
  this.log(pc.dim("Watching for changes... (Ctrl+C to stop)\n"));
92
92
  const watcher = chokidar.watch([challenge.challengeDir], {
93
- ignored: /(^|[/\\])\../, // ignore dotfiles
93
+ // ⚠️ WE IGNORE THE DATAPACK FOLDER FOR NOW, BUT IT'S A SHORT TERM FIX.
94
+ ignored: [/(^|[/\\])\../, (p) => p.includes("/datapack")], // ignore dotfiles and datapack folder
94
95
  persistent: true,
95
96
  ignoreInitial: true,
96
97
  });
@@ -0,0 +1,9 @@
1
+ import { Command } from "@oclif/core";
2
+ export default class Init extends Command {
3
+ static description: string;
4
+ static examples: string[];
5
+ static args: {
6
+ name: import("@oclif/core/interfaces").Arg<string, Record<string, unknown>>;
7
+ };
8
+ run(): Promise<void>;
9
+ }
@@ -0,0 +1,58 @@
1
+ import { exec } from "node:child_process";
2
+ import fs from "node:fs/promises";
3
+ import path from "node:path";
4
+ import { Args, Command } from "@oclif/core";
5
+ import pc from "picocolors";
6
+ import { loadConfig } from "../../lib/config.js";
7
+ import { getStaticResourcePath } from "../../lib/utils.js";
8
+ export default class Init extends Command {
9
+ static description = "Initialize a new evaluation";
10
+ static examples = ["<%= config.bin %> <%= command.id %> my-evaluation"];
11
+ static args = {
12
+ name: Args.string({
13
+ description: "Name of the evaluation",
14
+ required: true,
15
+ }),
16
+ };
17
+ async run() {
18
+ const { args } = await this.parse(Init);
19
+ loadConfig(); // Validate config is available
20
+ const evaluationDir = path.resolve(process.cwd(), "evaluations", args.name);
21
+ const configPath = path.join(evaluationDir, "config.ts");
22
+ // Check if evaluation already exists
23
+ try {
24
+ await fs.access(evaluationDir);
25
+ this.error(pc.red(`Evaluation '${args.name}' already exists at ${evaluationDir}`));
26
+ }
27
+ catch {
28
+ // Directory doesn't exist, which is what we want
29
+ }
30
+ // Create evaluation directory
31
+ await fs.mkdir(evaluationDir, { recursive: true });
32
+ // Copy template
33
+ const templatePath = getStaticResourcePath("evaluation_template.ts");
34
+ await fs.copyFile(templatePath, configPath);
35
+ this.log(pc.green(`✓ Created evaluation '${args.name}'`));
36
+ this.log(pc.dim(` Config: ${configPath}`));
37
+ // Offer to open in editor on macOS
38
+ if (process.platform === "darwin") {
39
+ this.log("");
40
+ this.log(pc.blue(">> Opening config.ts in your editor..."));
41
+ // Try Cursor first, then VS Code, then fallback to default
42
+ exec(`cursor "${configPath}" || code "${configPath}" || open "${configPath}"`, (error) => {
43
+ if (error) {
44
+ this.log(pc.dim(` Could not open editor automatically. Please open: ${configPath}`));
45
+ }
46
+ });
47
+ }
48
+ else {
49
+ this.log("");
50
+ this.log(pc.blue(`>> Edit the config file to define your runs:`));
51
+ this.log(pc.dim(` ${configPath}`));
52
+ }
53
+ this.log("");
54
+ this.log(pc.blue(">> Next steps:"));
55
+ this.log(pc.dim(` 1. Edit ${path.basename(configPath)} to define your evaluation runs`));
56
+ this.log(pc.dim(` 2. Run: kradle evaluation run ${args.name}`));
57
+ }
58
+ }
@@ -0,0 +1,7 @@
1
+ import { Command } from "@oclif/core";
2
+ export default class List extends Command {
3
+ static description: string;
4
+ static examples: string[];
5
+ run(): Promise<void>;
6
+ private fileExists;
7
+ }
@@ -0,0 +1,55 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { Command } from "@oclif/core";
4
+ import pc from "picocolors";
5
+ import { loadConfig } from "../../lib/config.js";
6
+ export default class List extends Command {
7
+ static description = "List all evaluations";
8
+ static examples = ["<%= config.bin %> <%= command.id %>"];
9
+ async run() {
10
+ this.parse(List);
11
+ loadConfig(); // Validate config is available
12
+ const evaluationsDir = path.resolve(process.cwd(), "evaluations");
13
+ try {
14
+ const entries = await fs.readdir(evaluationsDir, { withFileTypes: true });
15
+ const evaluations = entries.filter((e) => e.isDirectory());
16
+ if (evaluations.length === 0) {
17
+ this.log(pc.yellow("No evaluations found."));
18
+ this.log(pc.dim(` Run 'kradle evaluation init <name>' to create one.`));
19
+ return;
20
+ }
21
+ this.log(pc.blue(">> Evaluations:"));
22
+ this.log("");
23
+ for (const evaluation of evaluations) {
24
+ const evalDir = path.join(evaluationsDir, evaluation.name);
25
+ const hasConfig = await this.fileExists(path.join(evalDir, "config.ts"));
26
+ const hasManifest = await this.fileExists(path.join(evalDir, "manifest.json"));
27
+ const hasProgress = await this.fileExists(path.join(evalDir, "progress.json"));
28
+ let status = "";
29
+ if (hasProgress) {
30
+ status = pc.yellow(" (in progress)");
31
+ }
32
+ else if (hasManifest) {
33
+ status = pc.green(" (ready)");
34
+ }
35
+ else if (hasConfig) {
36
+ status = pc.dim(" (config only)");
37
+ }
38
+ this.log(` ${pc.bold(evaluation.name)}${status}`);
39
+ }
40
+ }
41
+ catch {
42
+ this.log(pc.yellow("No evaluations directory found."));
43
+ this.log(pc.dim(` Run 'kradle evaluation init <name>' to create your first evaluation.`));
44
+ }
45
+ }
46
+ async fileExists(filePath) {
47
+ try {
48
+ await fs.access(filePath);
49
+ return true;
50
+ }
51
+ catch {
52
+ return false;
53
+ }
54
+ }
55
+ }
@@ -0,0 +1,13 @@
1
+ import { Command } from "@oclif/core";
2
+ export default class Run extends Command {
3
+ static description: string;
4
+ static examples: string[];
5
+ static args: {
6
+ name: import("@oclif/core/interfaces").Arg<string, Record<string, unknown>>;
7
+ };
8
+ static flags: {
9
+ new: import("@oclif/core/interfaces").BooleanFlag<boolean>;
10
+ "max-concurrent": import("@oclif/core/interfaces").OptionFlag<number, import("@oclif/core/interfaces").CustomOptions>;
11
+ };
12
+ run(): Promise<void>;
13
+ }
@@ -0,0 +1,60 @@
1
+ import { Args, Command, Flags } from "@oclif/core";
2
+ import pc from "picocolors";
3
+ import { ApiClient } from "../../lib/api-client.js";
4
+ import { loadConfig } from "../../lib/config.js";
5
+ import { Evaluator } from "../../lib/evaluation/evaluator.js";
6
+ const DEFAULT_MAX_CONCURRENT = 5;
7
+ export default class Run extends Command {
8
+ static description = "Run an evaluation. If the evaluation had an ongoing iteration, it will resume from the last state.";
9
+ static examples = [
10
+ "<%= config.bin %> <%= command.id %> my-evaluation",
11
+ "<%= config.bin %> <%= command.id %> my-evaluation --new",
12
+ "<%= config.bin %> <%= command.id %> my-evaluation --max-concurrent 10",
13
+ ];
14
+ static args = {
15
+ name: Args.string({
16
+ description: "Name of the evaluation to run",
17
+ required: true,
18
+ }),
19
+ };
20
+ static flags = {
21
+ new: Flags.boolean({
22
+ char: "n",
23
+ description: "Start a new iteration of the evaluation",
24
+ default: false,
25
+ }),
26
+ "max-concurrent": Flags.integer({
27
+ char: "m",
28
+ description: "Maximum concurrent runs",
29
+ default: DEFAULT_MAX_CONCURRENT,
30
+ }),
31
+ };
32
+ async run() {
33
+ const { args, flags } = await this.parse(Run);
34
+ const config = loadConfig();
35
+ const api = new ApiClient(config);
36
+ const evaluator = new Evaluator(args.name, config, api);
37
+ // Check if evaluation exists
38
+ if (!(await evaluator.exists())) {
39
+ this.error(pc.red(`Evaluation '${args.name}' does not exist. Run 'kradle evaluation init ${args.name}' first.`));
40
+ }
41
+ // Check if config.ts exists
42
+ if (!(await evaluator.configExists())) {
43
+ this.error(pc.red(`Config file not found at ${evaluator.configPath}`));
44
+ }
45
+ this.log(pc.blue(`>> Starting evaluation: ${args.name}`));
46
+ if (flags.new) {
47
+ this.log(pc.yellow(" --new: Starting a new iteration of the evaluation"));
48
+ }
49
+ try {
50
+ await evaluator.run({
51
+ new: flags.new,
52
+ maxConcurrent: flags["max-concurrent"],
53
+ });
54
+ this.log(pc.green("\n✓ Evaluation complete!"));
55
+ }
56
+ catch (error) {
57
+ this.error(pc.red(`Evaluation failed: ${error instanceof Error ? error.message : String(error)}`));
58
+ }
59
+ }
60
+ }
@@ -1,7 +1,7 @@
1
1
  import type z from "zod";
2
2
  import type { Challenge } from "./challenge.js";
3
3
  import type { Config } from "./config.js";
4
- import { type AgentSchemaType, type ChallengeSchemaType, HumanSchema } from "./schemas.js";
4
+ import { type AgentSchemaType, type ChallengeSchemaType, HumanSchema, type RunStatusSchemaType } from "./schemas.js";
5
5
  export declare class ApiClient {
6
6
  private config;
7
7
  constructor(config: Config);
@@ -52,4 +52,17 @@ export declare class ApiClient {
52
52
  runIds?: string[] | undefined;
53
53
  }>;
54
54
  deleteChallenge(challengeId: string): Promise<void>;
55
+ /**
56
+ * Get the status of a run.
57
+ * @param runId - The ID of the run.
58
+ * @returns The run status.
59
+ */
60
+ getRunStatus(runId: string): Promise<RunStatusSchemaType>;
61
+ /**
62
+ * Add a tag to a run.
63
+ * @param runId - The ID of the run to tag.
64
+ * @param tag - The tag string to add.
65
+ * @throws an error if the tag fails to be added.
66
+ */
67
+ tagRun(runId: string, tag: string): Promise<void>;
55
68
  }
@@ -1,4 +1,4 @@
1
- import { AgentsResponseSchema, ChallengeSchema, ChallengesResponseSchema, HumanSchema, RunResponseSchema, UploadUrlResponseSchema, } from "./schemas.js";
1
+ import { AgentsResponseSchema, ChallengeSchema, ChallengesResponseSchema, HumanSchema, RunResponseSchema, RunStatusSchema, UploadUrlResponseSchema, } from "./schemas.js";
2
2
  const DEFAULT_PAGE_SIZE = 30;
3
3
  const DEFAULT_CHALLENGE_SCHEMA = {
4
4
  slug: "",
@@ -49,17 +49,21 @@ export class ApiClient {
49
49
  method: "POST",
50
50
  ...options,
51
51
  });
52
- const data = await response.json();
52
+ const text = await response.text();
53
+ if (!text) {
54
+ return undefined;
55
+ }
56
+ const data = JSON.parse(text);
53
57
  return schema ? schema.parse(data) : data;
54
58
  }
55
59
  async put(target, url, options = {}) {
56
- await this.request(target, url, {
60
+ return await this.request(target, url, {
57
61
  method: "PUT",
58
62
  ...options,
59
63
  });
60
64
  }
61
65
  async delete(target, url, options = {}) {
62
- await this.request(target, url, {
66
+ return await this.request(target, url, {
63
67
  method: "DELETE",
64
68
  ...options,
65
69
  });
@@ -140,8 +144,9 @@ export class ApiClient {
140
144
  */
141
145
  async updateChallenge(challenge, challengeConfig) {
142
146
  const url = `challenges/${challenge.shortSlug}`;
147
+ console.log(url);
143
148
  const config = challengeConfig ?? (await challenge.loadConfig());
144
- return this.put("web", url, {
149
+ await this.put("web", url, {
145
150
  body: JSON.stringify(config),
146
151
  });
147
152
  }
@@ -159,4 +164,25 @@ export class ApiClient {
159
164
  const url = `challenges/${challengeId}`;
160
165
  await this.delete("web", url);
161
166
  }
167
+ /**
168
+ * Get the status of a run.
169
+ * @param runId - The ID of the run.
170
+ * @returns The run status.
171
+ */
172
+ async getRunStatus(runId) {
173
+ const url = `runs/${runId}`;
174
+ return this.get("web", url, {}, RunStatusSchema);
175
+ }
176
+ /**
177
+ * Add a tag to a run.
178
+ * @param runId - The ID of the run to tag.
179
+ * @param tag - The tag string to add.
180
+ * @throws an error if the tag fails to be added.
181
+ */
182
+ async tagRun(runId, tag) {
183
+ const url = `runs/${runId}/tag`;
184
+ await this.post("web", url, {
185
+ body: JSON.stringify({ tag }),
186
+ });
187
+ }
162
188
  }
@@ -98,6 +98,11 @@ export class Challenge {
98
98
  catch (error) {
99
99
  throw new Error(`Failed to build datapack: ${error instanceof Error ? error.message : error}`);
100
100
  }
101
+ // @TODO - re-enable once we have a proper build pipeline
102
+ // Recursively copy the challenge dir to target directory, under src/
103
+ // await fs.cp(this.challengeDir, path.join(this.config.KRADLE_CHALLENGES_PATH, this.shortSlug, "src"), {
104
+ // recursive: true,
105
+ // });
101
106
  }
102
107
  /**
103
108
  * Load the challenge configuration from config.ts
@@ -5,7 +5,6 @@ export declare const ConfigSchema: z.ZodObject<{
5
5
  STUDIO_API_URL: z.ZodString;
6
6
  STUDIO_URL: z.ZodString;
7
7
  KRADLE_API_KEY: z.ZodString;
8
- GCS_BUCKET: z.ZodString;
9
8
  KRADLE_CHALLENGES_PATH: z.ZodDefault<z.ZodString>;
10
9
  NAMESPACE: z.ZodDefault<z.ZodString>;
11
10
  }, z.core.$strip>;
@@ -8,7 +8,6 @@ export const ConfigSchema = z.object({
8
8
  STUDIO_API_URL: z.string().url(),
9
9
  STUDIO_URL: z.string(),
10
10
  KRADLE_API_KEY: z.string(),
11
- GCS_BUCKET: z.string(),
12
11
  /**
13
12
  * Absolute path to the challenges directory. Defaults to ~/Documents/kradle-studio/challenges.
14
13
  */
@@ -30,7 +29,6 @@ export function loadConfig() {
30
29
  STUDIO_API_URL: process.env.STUDIO_API_URL,
31
30
  STUDIO_URL: process.env.STUDIO_URL,
32
31
  KRADLE_API_KEY: process.env.KRADLE_API_KEY,
33
- GCS_BUCKET: process.env.GCS_BUCKET,
34
32
  KRADLE_CHALLENGES_PATH: challengesPath,
35
33
  NAMESPACE: process.env.NAMESPACE,
36
34
  });
@@ -0,0 +1,88 @@
1
+ import type { ApiClient } from "../api-client.js";
2
+ import type { Config } from "../config.js";
3
+ import type { EvaluationMetadata, EvaluationOptions, Manifest, Progress } from "./types.js";
4
+ export declare class Evaluator {
5
+ private name;
6
+ private config;
7
+ private api;
8
+ evaluationDir: string;
9
+ metadataPath: string;
10
+ private runner?;
11
+ private tui?;
12
+ private currentIteration?;
13
+ constructor(name: string, config: Config, api: ApiClient);
14
+ /**
15
+ * Get paths for a specific iteration
16
+ */
17
+ private getIterationPaths;
18
+ get configPath(): string;
19
+ /**
20
+ * Get the current iteration directory path
21
+ */
22
+ getCurrentIterationDir(): string;
23
+ /**
24
+ * Check if evaluation exists
25
+ */
26
+ exists(): Promise<boolean>;
27
+ /**
28
+ * Check if config.ts exists (master config)
29
+ */
30
+ configExists(): Promise<boolean>;
31
+ /**
32
+ * Load evaluation metadata
33
+ */
34
+ loadMetadata(): Promise<EvaluationMetadata | null>;
35
+ /**
36
+ * Save evaluation metadata
37
+ */
38
+ saveMetadata(metadata: EvaluationMetadata): Promise<void>;
39
+ /**
40
+ * Get the current iteration number, or -1 if none exists
41
+ */
42
+ getCurrentIterationNumber(): Promise<number>;
43
+ /**
44
+ * Create a new iteration
45
+ */
46
+ createNewIteration(): Promise<number>;
47
+ /**
48
+ * Get or create an iteration
49
+ * @param createNew - If true, always create a new iteration. Otherwise, use current iteration or create first one if none exists.
50
+ */
51
+ getOrCreateIteration(createNew: boolean): Promise<number>;
52
+ /**
53
+ * Load manifest from iteration
54
+ */
55
+ loadManifest(iteration: number): Promise<Manifest>;
56
+ /**
57
+ * Load progress from iteration
58
+ */
59
+ loadProgress(iteration: number): Promise<Progress | null>;
60
+ /**
61
+ * Save progress to current iteration
62
+ */
63
+ saveProgress(): Promise<void>;
64
+ /**
65
+ * Execute config.ts to generate manifest
66
+ */
67
+ generateManifest(configPath: string): Promise<Manifest>;
68
+ /**
69
+ * Execute config.ts file and return the manifest
70
+ */
71
+ private executeConfigFile;
72
+ /**
73
+ * Run the evaluation
74
+ */
75
+ run(options: EvaluationOptions): Promise<void>;
76
+ /**
77
+ * Handle state change from runner
78
+ */
79
+ private onRunStateChange;
80
+ /**
81
+ * Handle quit request
82
+ */
83
+ private handleQuit;
84
+ /**
85
+ * Open run in browser
86
+ */
87
+ private openRun;
88
+ }