@kradle/cli 0.0.16 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/README.md +62 -65
  2. package/dist/commands/agent/list.d.ts +4 -0
  3. package/dist/commands/agent/list.js +6 -4
  4. package/dist/commands/challenge/build.d.ts +9 -1
  5. package/dist/commands/challenge/build.js +40 -12
  6. package/dist/commands/challenge/create.d.ts +5 -1
  7. package/dist/commands/challenge/create.js +17 -18
  8. package/dist/commands/challenge/delete.d.ts +4 -1
  9. package/dist/commands/challenge/delete.js +5 -5
  10. package/dist/commands/challenge/list.d.ts +5 -0
  11. package/dist/commands/challenge/list.js +9 -10
  12. package/dist/commands/challenge/run.d.ts +8 -1
  13. package/dist/commands/challenge/run.js +13 -8
  14. package/dist/commands/challenge/watch.d.ts +4 -1
  15. package/dist/commands/challenge/watch.js +8 -8
  16. package/dist/commands/{evaluation → experiment}/create.d.ts +4 -0
  17. package/dist/commands/{evaluation → experiment}/create.js +22 -21
  18. package/dist/commands/{evaluation → experiment}/list.js +17 -19
  19. package/dist/commands/{evaluation → experiment}/run.d.ts +4 -1
  20. package/dist/commands/experiment/run.js +61 -0
  21. package/dist/commands/init.js +2 -2
  22. package/dist/lib/api-client.d.ts +29 -10
  23. package/dist/lib/api-client.js +81 -37
  24. package/dist/lib/arguments.d.ts +3 -2
  25. package/dist/lib/arguments.js +5 -3
  26. package/dist/lib/challenge.d.ts +13 -18
  27. package/dist/lib/challenge.js +60 -61
  28. package/dist/lib/experiment/experimenter.d.ts +87 -0
  29. package/dist/lib/{evaluation/evaluator.js → experiment/experimenter.js} +74 -72
  30. package/dist/lib/{evaluation → experiment}/index.d.ts +1 -1
  31. package/dist/lib/{evaluation → experiment}/index.js +1 -1
  32. package/dist/lib/{evaluation → experiment}/runner.js +2 -1
  33. package/dist/lib/{evaluation → experiment}/tui.d.ts +1 -1
  34. package/dist/lib/{evaluation → experiment}/tui.js +3 -3
  35. package/dist/lib/{evaluation → experiment}/types.d.ts +6 -4
  36. package/dist/lib/{evaluation → experiment}/types.js +4 -3
  37. package/dist/lib/flags.d.ts +47 -0
  38. package/dist/lib/flags.js +63 -0
  39. package/dist/lib/schemas.d.ts +32 -0
  40. package/dist/lib/schemas.js +8 -0
  41. package/dist/lib/utils.d.ts +9 -10
  42. package/dist/lib/utils.js +12 -12
  43. package/oclif.manifest.json +342 -64
  44. package/package.json +5 -6
  45. package/static/challenge.ts +12 -13
  46. package/static/experiment_template.ts +114 -0
  47. package/static/project_template/dev.env +5 -5
  48. package/static/project_template/prod.env +4 -4
  49. package/static/project_template/tsconfig.json +1 -1
  50. package/dist/commands/challenge/multi-upload.d.ts +0 -6
  51. package/dist/commands/challenge/multi-upload.js +0 -80
  52. package/dist/commands/evaluation/run.js +0 -61
  53. package/dist/lib/config.d.ts +0 -12
  54. package/dist/lib/config.js +0 -49
  55. package/dist/lib/evaluation/evaluator.d.ts +0 -88
  56. package/static/evaluation_template.ts +0 -69
  57. /package/dist/commands/{evaluation → experiment}/list.d.ts +0 -0
  58. /package/dist/lib/{evaluation → experiment}/runner.d.ts +0 -0
@@ -0,0 +1,114 @@
1
+ /**
2
+ * Experiment template for running challenge benchmarks.
3
+ *
4
+ * This file defines an experiment manifest that specifies which agents to test
5
+ * against a challenge, how many runs to perform, and how many agents participate
6
+ * in each run.
7
+ *
8
+ * Usage:
9
+ * 1. Set CHALLENGE_SLUG to the challenge you want to run the experiment on
10
+ * 2. Modify AGENTS array to include the agents you want to test
11
+ * 3. Adjust NUM_RUNS and NUM_AGENTS_PER_RUN as needed
12
+ * 4. Optionally add tags in ADDITIONAL_TAGS for filtering results later
13
+ *
14
+ * You can also entirely change how runs are generated by modifying the `main` function.
15
+ *
16
+ * @returns A Manifest object containing all run configurations
17
+ */
18
+ export function main(): Manifest {
19
+ // The challenge for this experiment (format: "username:challenge-name")
20
+ const CHALLENGE_SLUG = "[INSERT CHALLENGE SLUG HERE]";
21
+
22
+ // Pool of agents to sample from for each run
23
+ const AGENTS: string[] = [
24
+ "team-kradle:claude-sonnet-4",
25
+ "team-kradle:qwen3-coder",
26
+ "team-kradle:deepseek-chat-v3-1",
27
+ "team-kradle:grok-4",
28
+ "team-kradle:grok-code-fast-1",
29
+ "team-kradle:gpt-5",
30
+ "team-kradle:kimi-k2",
31
+ "team-kradle:gemini-2-5-flash",
32
+ "team-kradle:gemini-2-5-pro",
33
+ "team-kradle:glm-4-5-air",
34
+ "team-kradle:gpt-5-mini",
35
+ "team-kradle:o3-mini",
36
+ "team-kradle:codestral-2508",
37
+ ];
38
+
39
+ // Total number of game runs to execute
40
+ const NUM_RUNS = 200;
41
+
42
+ // Number of agents randomly selected for each run
43
+ const NUM_AGENTS_PER_RUN = 4;
44
+
45
+ // Optional tags for categorizing/filtering experiment results
46
+ const ADDITIONAL_TAGS: string[] = [];
47
+
48
+ const runs: RunConfig[] = [];
49
+
50
+ for (let i = 0; i < NUM_RUNS; i++) {
51
+ // Randomly sample agents for the run
52
+ const selectedAgents = sampleWithoutReplacement(AGENTS, NUM_AGENTS_PER_RUN);
53
+
54
+ runs.push({
55
+ challenge_slug: CHALLENGE_SLUG,
56
+ participants: selectedAgents.map((agent) => ({ agent })),
57
+ });
58
+ }
59
+
60
+ return { runs, tags: ADDITIONAL_TAGS };
61
+ }
62
+
63
+ // ------------------------------------------------------------------------------------------------
64
+ // Utility functions
65
+ // ------------------------------------------------------------------------------------------------
66
+
67
+ /**
68
+ * Randomly samples elements from an array without replacement.
69
+ *
70
+ * @param arr - The source array to sample from
71
+ * @param count - Number of elements to sample
72
+ * @returns Array of randomly selected elements
73
+ * @throws Error if count exceeds array length
74
+ */
75
+ function sampleWithoutReplacement<T>(arr: T[], count: number): T[] {
76
+ if (count > arr.length) {
77
+ throw new Error("Sample size cannot be larger than array length.");
78
+ }
79
+
80
+ const copy = [...arr];
81
+ const result: T[] = [];
82
+
83
+ for (let i = 0; i < count; i++) {
84
+ const idx = Math.floor(Math.random() * copy.length);
85
+ result.push(copy[idx]);
86
+ copy.splice(idx, 1);
87
+ }
88
+
89
+ return result;
90
+ }
91
+
92
+ /** A participant in a challenge run */
93
+ type Participant = {
94
+ /** Agent slug (format: "username:agent-name") */
95
+ agent: string;
96
+ /** Optional role assignment for role-based challenges */
97
+ role?: string;
98
+ };
99
+
100
+ /** Configuration for a single challenge run */
101
+ type RunConfig = {
102
+ /** The challenge to run (format: "username:challenge-name") */
103
+ challenge_slug: string;
104
+ /** List of participants for this run */
105
+ participants: Participant[];
106
+ };
107
+
108
+ /** The experiment manifest returned by main() */
109
+ type Manifest = {
110
+ /** Array of run configurations to execute */
111
+ runs: RunConfig[];
112
+ /** Optional tags for categorizing experiment results */
113
+ tags?: string[];
114
+ };
@@ -1,5 +1,5 @@
1
- WEB_API_URL=https://dev-api.kradle.ai/v0 #https://api.kradle.ai/v0
2
- WEB_URL=https://dev.kradle.ai #https:/.kradle.ai/workbench
3
- STUDIO_API_URL=http://localhost:2999/api/v0
4
- STUDIO_URL=kradle-dev://open #kradle://://open
5
- KRADLE_CHALLENGES_PATH=~/Documents/kradle-studio/challenges
1
+ KRADLE_API_URL=https://dev-api.kradle.ai/v0 #https://api.kradle.ai/v0
2
+ KRADLE_WEB_URL=https://dev.kradle.ai #https://kradle.ai
3
+ KRADLE_STUDIO_API_URL=http://localhost:2999/api/v0
4
+ KRADLE_STUDIO_URL=kradle-dev://open #kradle://open
5
+ KRADLE_CHALLENGES_PATH=~/Documents/kradle-studio-dev/challenges
@@ -1,5 +1,5 @@
1
- WEB_API_URL=https://api.kradle.ai/v0 #https://dev-api.kradle.ai/v0
2
- WEB_URL=https://kradle.ai #https://dev.kradle.ai
3
- STUDIO_API_URL=http://localhost:2999/api/v0
4
- STUDIO_URL=kradle://open #kradle-dev://://open
1
+ KRADLE_API_URL=https://api.kradle.ai/v0 #https://dev-api.kradle.ai/v0
2
+ KRADLE_WEB_URL=https://kradle.ai #https://dev.kradle.ai
3
+ KRADLE_STUDIO_API_URL=http://localhost:2999/api/v0
4
+ KRADLE_STUDIO_URL=kradle://open #kradle-dev://open
5
5
  KRADLE_CHALLENGES_PATH=~/Documents/kradle-studio/challenges
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "compilerOptions": {
3
3
  "target": "ES2020",
4
- "module": "CommonJS",
4
+ "module": "ES2020",
5
5
  "outDir": "build",
6
6
  "rootDir": ".",
7
7
  "moduleResolution": "node",
@@ -1,6 +0,0 @@
1
- import { Command } from "@oclif/core";
2
- export default class MultiUpload extends Command {
3
- static description: string;
4
- static examples: string[];
5
- run(): Promise<void>;
6
- }
@@ -1,80 +0,0 @@
1
- import { Command } from "@oclif/core";
2
- import enquirer from "enquirer";
3
- import { Listr } from "listr2";
4
- import pc from "picocolors";
5
- import { ApiClient } from "../../lib/api-client.js";
6
- import { Challenge } from "../../lib/challenge.js";
7
- import { loadConfig } from "../../lib/config.js";
8
- export default class MultiUpload extends Command {
9
- static description = "Interactively select and upload multiple challenges";
10
- static examples = ["<%= config.bin %> <%= command.id %>"];
11
- async run() {
12
- // Not necessary since we don't have any args, but oclif will raise a warning if we don't parse the args
13
- this.parse(MultiUpload);
14
- const config = loadConfig();
15
- const api = new ApiClient(config);
16
- this.log(pc.blue(">> Loading challenges..."));
17
- const [cloudChallenges, localChallenges, human] = await Promise.all([
18
- api.listChallenges(),
19
- Challenge.getLocalChallenges(),
20
- api.getHuman(),
21
- ]);
22
- // Get local challenges that exist
23
- const localChallengeIds = Object.keys(localChallenges);
24
- if (localChallengeIds.length === 0) {
25
- this.log(pc.yellow("No local challenges found"));
26
- return;
27
- }
28
- // Create choices with status information
29
- const cloudMap = new Map(cloudChallenges.map((c) => [c.slug, c]));
30
- const choices = localChallengeIds.map((id) => {
31
- const fullSlug = `${human.username}:${id}`;
32
- const inCloud = cloudMap.has(fullSlug);
33
- const status = inCloud ? pc.green("☁️ ") : pc.blue("💻");
34
- return {
35
- name: id,
36
- message: `${status} ${id}`,
37
- };
38
- });
39
- // Prompt user to select challenges
40
- let response;
41
- try {
42
- response = await enquirer.prompt({
43
- type: "multiselect",
44
- name: "challenges",
45
- message: "Select challenges to upload. ☁️ = exists in cloud, 📁 = exists locally only",
46
- choices: choices.map((c) => c.message),
47
- });
48
- }
49
- catch (error) {
50
- this.log(pc.yellow(">> No challenges selected"));
51
- return;
52
- }
53
- // Map back to challenge IDs
54
- const selectedChallenges = response.challenges.map((selected) => {
55
- const choice = choices.find((c) => c.message === selected);
56
- return choice?.name || selected.replace(/^[✓⊡]\s+/, "");
57
- });
58
- // Create tasks for each challenge
59
- const tasks = new Listr(selectedChallenges.map((challengeId) => ({
60
- title: challengeId,
61
- task: async () => {
62
- const challenge = new Challenge(challengeId, config);
63
- await challenge.build();
64
- await challenge.upload(api);
65
- },
66
- })), {
67
- concurrent: false,
68
- exitOnError: false,
69
- });
70
- try {
71
- await tasks.run();
72
- this.log(pc.green(`\n✓ Uploaded ${selectedChallenges.length} challenges`));
73
- }
74
- catch (error) {
75
- this.error(pc.red(`Some uploads failed: ${error instanceof Error ? error.message : String(error)}`), {
76
- exit: false,
77
- });
78
- }
79
- }
80
- }
@@ -1,61 +0,0 @@
1
- import { Args, Command, Flags } from "@oclif/core";
2
- import pc from "picocolors";
3
- import { ApiClient } from "../../lib/api-client.js";
4
- import { loadConfig } from "../../lib/config.js";
5
- import { Evaluator } from "../../lib/evaluation/evaluator.js";
6
- const DEFAULT_MAX_CONCURRENT = 5;
7
- export default class Run extends Command {
8
- static description = "Run an evaluation. If the evaluation had an ongoing iteration, it will resume from the last state.";
9
- static examples = [
10
- "<%= config.bin %> <%= command.id %> my-evaluation",
11
- "<%= config.bin %> <%= command.id %> my-evaluation --new",
12
- "<%= config.bin %> <%= command.id %> my-evaluation --max-concurrent 10",
13
- ];
14
- static args = {
15
- name: Args.string({
16
- description: "Name of the evaluation to run",
17
- required: true,
18
- }),
19
- };
20
- static flags = {
21
- new: Flags.boolean({
22
- char: "n",
23
- description: "Start a new iteration of the evaluation",
24
- default: false,
25
- }),
26
- "max-concurrent": Flags.integer({
27
- char: "m",
28
- description: "Maximum concurrent runs",
29
- default: DEFAULT_MAX_CONCURRENT,
30
- }),
31
- };
32
- async run() {
33
- const { args, flags } = await this.parse(Run);
34
- const config = loadConfig();
35
- const api = new ApiClient(config);
36
- const evaluator = new Evaluator(args.name, config, api);
37
- // Check if evaluation exists
38
- if (!(await evaluator.exists())) {
39
- this.error(pc.red(`Evaluation '${args.name}' does not exist. Run 'kradle evaluation init ${args.name}' first.`));
40
- }
41
- // Check if config.ts exists
42
- if (!(await evaluator.configExists())) {
43
- this.error(pc.red(`Config file not found at ${evaluator.configPath}`));
44
- }
45
- this.log(pc.blue(`>> Starting evaluation: ${args.name}`));
46
- if (flags.new) {
47
- this.log(pc.yellow(" --new: Starting a new iteration of the evaluation"));
48
- }
49
- try {
50
- await evaluator.run({
51
- new: flags.new,
52
- maxConcurrent: flags["max-concurrent"],
53
- openMetabase: true,
54
- });
55
- this.log(pc.green("\n✓ Evaluation complete!"));
56
- }
57
- catch (error) {
58
- this.error(pc.red(`Evaluation failed: ${error instanceof Error ? error.message : String(error)}`));
59
- }
60
- }
61
- }
@@ -1,12 +0,0 @@
1
- import { z } from "zod";
2
- export declare const ConfigSchema: z.ZodObject<{
3
- WEB_API_URL: z.ZodString;
4
- WEB_URL: z.ZodString;
5
- STUDIO_API_URL: z.ZodString;
6
- STUDIO_URL: z.ZodString;
7
- KRADLE_API_KEY: z.ZodString;
8
- KRADLE_CHALLENGES_PATH: z.ZodDefault<z.ZodString>;
9
- NAMESPACE: z.ZodDefault<z.ZodString>;
10
- }, z.core.$strip>;
11
- export type Config = z.infer<typeof ConfigSchema>;
12
- export declare function loadConfig(): Config;
@@ -1,49 +0,0 @@
1
- import os from "node:os";
2
- import path from "node:path";
3
- import { z } from "zod";
4
- import { untildify } from "./utils.js";
5
- export const ConfigSchema = z.object({
6
- WEB_API_URL: z.string().url(),
7
- WEB_URL: z.string().url(),
8
- STUDIO_API_URL: z.string().url(),
9
- STUDIO_URL: z.string(),
10
- KRADLE_API_KEY: z.string(),
11
- /**
12
- * Absolute path to the challenges directory. Defaults to ~/Documents/kradle-studio/challenges.
13
- */
14
- KRADLE_CHALLENGES_PATH: z.string().default(path.join(os.homedir(), "Documents", "kradle-studio", "challenges")),
15
- NAMESPACE: z.string().default("kradle"),
16
- });
17
- export function loadConfig() {
18
- try {
19
- /**
20
- * First, resolve tildes in the KRADLE_CHALLENGES_PATH
21
- */
22
- let challengesPath;
23
- if (process.env.KRADLE_CHALLENGES_PATH) {
24
- challengesPath = untildify(process.env.KRADLE_CHALLENGES_PATH);
25
- }
26
- const config = ConfigSchema.parse({
27
- WEB_API_URL: process.env.WEB_API_URL,
28
- WEB_URL: process.env.WEB_URL,
29
- STUDIO_API_URL: process.env.STUDIO_API_URL,
30
- STUDIO_URL: process.env.STUDIO_URL,
31
- KRADLE_API_KEY: process.env.KRADLE_API_KEY,
32
- KRADLE_CHALLENGES_PATH: challengesPath,
33
- NAMESPACE: process.env.NAMESPACE,
34
- });
35
- for (const field of Object.keys(config)) {
36
- if (!config[field]) {
37
- throw new Error(`Missing required config field: "${field}". Please check your .env file.`);
38
- }
39
- }
40
- return config;
41
- }
42
- catch (error) {
43
- if (error instanceof z.ZodError) {
44
- const missingFields = error.issues.map((e) => e.path.join(".")).join(", ");
45
- throw new Error(`Missing or invalid environment variables: ${missingFields}. Please check your .env file.`);
46
- }
47
- throw error;
48
- }
49
- }
@@ -1,88 +0,0 @@
1
- import type { ApiClient } from "../api-client.js";
2
- import type { Config } from "../config.js";
3
- import type { EvaluationMetadata, EvaluationOptions, Manifest, Progress } from "./types.js";
4
- export declare class Evaluator {
5
- private name;
6
- private config;
7
- private api;
8
- evaluationDir: string;
9
- metadataPath: string;
10
- private runner?;
11
- private tui?;
12
- private currentIteration?;
13
- constructor(name: string, config: Config, api: ApiClient);
14
- /**
15
- * Get paths for a specific iteration
16
- */
17
- private getIterationPaths;
18
- get configPath(): string;
19
- /**
20
- * Get the current iteration directory path
21
- */
22
- getCurrentIterationDir(): string;
23
- /**
24
- * Check if evaluation exists
25
- */
26
- exists(): Promise<boolean>;
27
- /**
28
- * Check if config.ts exists (master config)
29
- */
30
- configExists(): Promise<boolean>;
31
- /**
32
- * Load evaluation metadata
33
- */
34
- loadMetadata(): Promise<EvaluationMetadata | null>;
35
- /**
36
- * Save evaluation metadata
37
- */
38
- saveMetadata(metadata: EvaluationMetadata): Promise<void>;
39
- /**
40
- * Get the current iteration number, or -1 if none exists
41
- */
42
- getCurrentIterationNumber(): Promise<number>;
43
- /**
44
- * Create a new iteration
45
- */
46
- createNewIteration(): Promise<number>;
47
- /**
48
- * Get or create an iteration
49
- * @param createNew - If true, always create a new iteration. Otherwise, use current iteration or create first one if none exists.
50
- */
51
- getOrCreateIteration(createNew: boolean): Promise<number>;
52
- /**
53
- * Load manifest from iteration
54
- */
55
- loadManifest(iteration: number): Promise<Manifest>;
56
- /**
57
- * Load progress from iteration
58
- */
59
- loadProgress(iteration: number): Promise<Progress | null>;
60
- /**
61
- * Save progress to current iteration
62
- */
63
- saveProgress(): Promise<void>;
64
- /**
65
- * Execute config.ts to generate manifest
66
- */
67
- generateManifest(configPath: string): Promise<Manifest>;
68
- /**
69
- * Execute config.ts file and return the manifest
70
- */
71
- private executeConfigFile;
72
- /**
73
- * Run the evaluation
74
- */
75
- run(options: EvaluationOptions): Promise<void>;
76
- /**
77
- * Handle state change from runner
78
- */
79
- private onRunStateChange;
80
- /**
81
- * Handle quit request
82
- */
83
- private handleQuit;
84
- /**
85
- * Open run in browser
86
- */
87
- private openRun;
88
- }
@@ -1,69 +0,0 @@
1
- export function main(): Manifest {
2
- const CHALLENGE_SLUG = "[INSERT CHALLENGE SLUG HERE]";
3
-
4
- const AGENTS: string[] = [
5
- "team-kradle:claude-sonnet-4",
6
- "team-kradle:qwen3-coder",
7
- "team-kradle:deepseek-chat-v3-1",
8
- "team-kradle:grok-4",
9
- "team-kradle:grok-code-fast-1",
10
- "team-kradle:gpt-5",
11
- "team-kradle:kimi-k2",
12
- "team-kradle:gemini-2-5-flash",
13
- "team-kradle:gemini-2-5-pro",
14
- "team-kradle:glm-4-5-air",
15
- "team-kradle:gpt-5-mini",
16
- "team-kradle:o3-mini",
17
- "team-kradle:codestral-2508",
18
- ];
19
-
20
- const NUM_RUNS = 200;
21
- const NUM_AGENTS_PER_RUN = 4;
22
-
23
- const ADDITIONAL_TAGS: string[] = [];
24
-
25
- const runs: RunConfig[] = [];
26
-
27
- for (let i = 0; i < NUM_RUNS; i++) {
28
- const selectedAgents = sampleWithoutReplacement(AGENTS, NUM_AGENTS_PER_RUN);
29
-
30
- runs.push({
31
- challenge_slug: CHALLENGE_SLUG,
32
- participants: selectedAgents.map((agent) => ({ agent })),
33
- });
34
- }
35
-
36
- return { runs, tags: ADDITIONAL_TAGS };
37
- }
38
-
39
- function sampleWithoutReplacement<T>(arr: T[], count: number): T[] {
40
- if (count > arr.length) {
41
- throw new Error("Sample size cannot be larger than array length.");
42
- }
43
-
44
- const copy = [...arr];
45
- const result: T[] = [];
46
-
47
- for (let i = 0; i < count; i++) {
48
- const idx = Math.floor(Math.random() * copy.length);
49
- result.push(copy[idx]);
50
- copy.splice(idx, 1);
51
- }
52
-
53
- return result;
54
- }
55
-
56
- type Participant = {
57
- agent: string;
58
- role?: string;
59
- };
60
-
61
- type RunConfig = {
62
- challenge_slug: string;
63
- participants: Participant[];
64
- };
65
-
66
- type Manifest = {
67
- runs: RunConfig[];
68
- tags?: string[];
69
- };
File without changes