even-pf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ /node_modules
2
+ /build
3
+
4
+ /outputs
package/README.md ADDED
@@ -0,0 +1,20 @@
1
+ # Project Even-Playfield (E-PF)
2
+ An AI-assisted, responsible grading tool for programming assignments: a command-line GPT wrapper for CS graders.
3
+
4
+ To install dependencies:
5
+ ```bash
6
+ bun install
7
+ ```
8
+
9
+ To install as a tool globally:
10
+ ```bash
11
+ bun link
12
+ ```
13
+
14
+ To run:
15
+ ```bash
16
+ bunx e-pf
17
+ ```
18
+ Make sure you have an `epf.toml` config file in your current or home directory. Alternatively, pass the config path or URL as the first command-line argument, or set the `EPF_CONFIG_URL` environment variable.
19
+
20
+ This project was created using `bun init` in bun v1.3.2. [Bun](https://bun.com) is a fast all-in-one JavaScript runtime.
package/bun.lock ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "lockfileVersion": 1,
3
+ "configVersion": 1,
4
+ "workspaces": {
5
+ "": {
6
+ "name": "tools",
7
+ "dependencies": {
8
+ "@openrouter/sdk": "^0.5.1",
9
+ "chalk": "^5.6.2",
10
+ "smol-toml": "^1.5.2",
11
+ "zod-defaults": "^0.2.3",
12
+ },
13
+ "devDependencies": {
14
+ "@types/bun": "latest",
15
+ },
16
+ "peerDependencies": {
17
+ "typescript": "^5",
18
+ },
19
+ },
20
+ },
21
+ "packages": {
22
+ "@openrouter/sdk": ["@openrouter/sdk@0.5.1", "", { "dependencies": { "zod": "^3.25.0 || ^4.0.0" } }, "sha512-Kl0N1jIj7A3lnkM5dO3SGP8JP3jAozzs6JWcHVuZUBt5DsGKxFGNH1Y15bCfsJiLNA2ylAQpCN3aNcgEYkkL5Q=="],
23
+
24
+ "@types/bun": ["@types/bun@1.3.2", "", { "dependencies": { "bun-types": "1.3.2" } }, "sha512-t15P7k5UIgHKkxwnMNkJbWlh/617rkDGEdSsDbu+qNHTaz9SKf7aC8fiIlUdD5RPpH6GEkP0cK7WlvmrEBRtWg=="],
25
+
26
+ "@types/node": ["@types/node@24.10.1", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-GNWcUTRBgIRJD5zj+Tq0fKOJ5XZajIiBroOF0yvj2bSU1WvNdYS/dn9UxwsujGW4JX06dnHyjV2y9rRaybH0iQ=="],
27
+
28
+ "@types/react": ["@types/react@19.2.6", "", { "dependencies": { "csstype": "^3.2.2" } }, "sha512-p/jUvulfgU7oKtj6Xpk8cA2Y1xKTtICGpJYeJXz2YVO2UcvjQgeRMLDGfDeqeRW2Ta+0QNFwcc8X3GH8SxZz6w=="],
29
+
30
+ "bun-types": ["bun-types@1.3.2", "", { "dependencies": { "@types/node": "*" }, "peerDependencies": { "@types/react": "^19" } }, "sha512-i/Gln4tbzKNuxP70OWhJRZz1MRfvqExowP7U6JKoI8cntFrtxg7RJK3jvz7wQW54UuvNC8tbKHHri5fy74FVqg=="],
31
+
32
+ "chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="],
33
+
34
+ "csstype": ["csstype@3.2.3", "", {}, "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ=="],
35
+
36
+ "smol-toml": ["smol-toml@1.5.2", "", {}, "sha512-QlaZEqcAH3/RtNyet1IPIYPsEWAaYyXXv1Krsi+1L/QHppjX4Ifm8MQsBISz9vE8cHicIq3clogsheili5vhaQ=="],
37
+
38
+ "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
39
+
40
+ "undici-types": ["undici-types@7.16.0", "", {}, "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw=="],
41
+
42
+ "zod": ["zod@4.1.12", "", {}, "sha512-JInaHOamG8pt5+Ey8kGmdcAcg3OL9reK8ltczgHTAwNhMys/6ThXHityHxVV2p3fkw/c+MAvBHFVYHFZDmjMCQ=="],
43
+
44
+ "zod-defaults": ["zod-defaults@0.2.3", "", { "peerDependencies": { "zod": "^4.1.12" } }, "sha512-7pYkOH1/c+Ril5AZUYtsbhMkehkI8CMqzFZ7YZXfC9SMLRvZuLyonQE7BAIVSNxeTpqTZmW5BLxGSzWMnKNdIw=="],
45
+ }
46
+ }
package/bunfig.toml ADDED
@@ -0,0 +1,2 @@
1
+ [console]
2
+ depth = 6
@@ -0,0 +1,42 @@
1
+ [openrouter]
2
+ api_key = ""
3
+ model = ""
4
+
5
+ [hyperparameters]
6
+ max_completion_tokens = 20000
7
+ temperature = 0.9
8
+ top_p = 1
9
+ frequency_penalty = 0
10
+ presence_penalty = 0
11
+ reasoning_effort = "high"
12
+
13
+ [[analysis_workflows]]
14
+ slug = ""
15
+ runs = 1
16
+ output_filename = ""
17
+ prompt = ""
18
+
19
+ [[analysis_workflows.input_files_searches]]
20
+ file_glob = ""
21
+ search_directory = "."
22
+ excluded_files = []
23
+
24
+ [[testing_workflows]]
25
+ slug = ""
26
+ runs = 1
27
+ output_filename = ""
28
+ setup_commands = []
29
+ cleanup_commands = []
30
+
31
+ [[testing_workflows.test_cases]]
32
+ name = ""
33
+ work_directory = "."
34
+ single_run_command = ""
35
+ interactive_steps = []
36
+
37
+ [testing_workflows.test_cases.single_run_expected_output]
38
+ prefix_strip_string = ""
39
+ postfix_strip_string = ""
40
+ substring = ""
41
+ llm_judge_input_mode = "NONE"
42
+ llm_judge_prompt = ""
package/package.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "name": "even-pf",
3
+ "version": "0.1.0",
4
+ "module": "src/index.ts",
5
+ "type": "module",
6
+ "license": "UNLICENSED",
7
+ "scripts": {
8
+ "start": "bun run src/index.ts",
9
+ "build": "bun build src/index.ts --compile --outfile build/epf",
10
+ "config-gen": "bun run --console-depth 6 src/generate-config.ts"
11
+ },
12
+ "bin": {
13
+ "e-pf": "src/index.ts"
14
+ },
15
+ "devDependencies": {
16
+ "@types/bun": "latest"
17
+ },
18
+ "peerDependencies": {
19
+ "typescript": "^5"
20
+ },
21
+ "dependencies": {
22
+ "@openrouter/sdk": "^0.5.1",
23
+ "chalk": "^5.6.2",
24
+ "smol-toml": "^1.5.2",
25
+ "zod-defaults": "^0.2.3"
26
+ },
27
+ "files": [
28
+ ".gitignore",
29
+ "src",
30
+ "bun.lock",
31
+ "bunfig.toml",
32
+ "epf.example.toml",
33
+ "package.json",
34
+ "README.md",
35
+ "tsconfig.json"
36
+ ]
37
+ }
@@ -0,0 +1,30 @@
1
+ import {stringify} from "smol-toml";
2
+ import {getDefaultsForSchema} from "zod-defaults";
3
+
4
+ import {
5
+ ConfigSchema,
6
+ AnalysisWorkflowEntrySchema,
7
+ FileSearchEntrySchema,
8
+ TestingWorkflowEntrySchema, TestCaseSchema
9
+ } from "./util/config-schema.ts";
10
+
11
+
// Entry script: builds a fully populated default configuration from the Zod
// schemas and writes it to the example TOML file.
console.log("generate-config.ts");

// Assemble the nested defaults bottom-up so that every array in the example
// contains exactly one representative entry.
const exampleFileSearch = getDefaultsForSchema(FileSearchEntrySchema);
const exampleAnalysisWorkflow = getDefaultsForSchema(AnalysisWorkflowEntrySchema);
exampleAnalysisWorkflow.input_files_searches = [exampleFileSearch];

const exampleTestCase = getDefaultsForSchema(TestCaseSchema);
const exampleTestingWorkflow = getDefaultsForSchema(TestingWorkflowEntrySchema);
exampleTestingWorkflow.test_cases = [exampleTestCase];

const exampleConfig = getDefaultsForSchema(ConfigSchema);
exampleConfig.analysis_workflows = [exampleAnalysisWorkflow];
exampleConfig.testing_workflows = [exampleTestingWorkflow];

console.log(exampleConfig);

const outputFilename = "epf.example.toml";
await Bun.file(outputFilename).write(stringify(exampleConfig));
console.log(`Default config written to ${outputFilename}`);

console.log("generate-config.ts done");
package/src/index.ts ADDED
@@ -0,0 +1,54 @@
1
+ #!/usr/bin/env bun
2
+
3
+ import {OpenRouter} from "@openrouter/sdk";
4
+
5
+ import {CONFIG} from "./util/config.ts";
6
+ import {executeTestingWorkflow} from "./workflow/testing-workflow.ts";
7
+ import {executeAnalysisWorkflow} from "./workflow/analysis-workflow.ts";
8
+ import type {WorkflowDependencies} from "./workflow";
9
+
10
+
// Entry script: starts every configured workflow run concurrently and prints
// a success/failure summary once all of them have settled.
console.log("EPF index.ts");

const workflowDependencies: WorkflowDependencies = {
    openRouter: new OpenRouter({
        apiKey: CONFIG.openrouter.api_key,
    }),
};

const analysisWorkflows = CONFIG.analysis_workflows;
const testingWorkflows = CONFIG.testing_workflows;
console.log(`Starting execution of ${analysisWorkflows.length} analysis and ${testingWorkflows.length} testing workflows...`);
console.log([...analysisWorkflows, ...testingWorkflows].map((w) => w.slug));

// Each workflow may contribute several runs, so the position of a promise in
// `workflowRuns` does not correspond to a position in either workflow list.
// A parallel array of labels keeps the failure report accurate.
const workflowRuns: Promise<void>[] = [];
const workflowRunLabels: string[] = [];
for (const workflow of analysisWorkflows) {
    for (let i = 0; i < workflow.runs; i++) {
        workflowRunLabels.push(`${workflow.slug} (analysis, run ${i + 1})`);
        workflowRuns.push(executeAnalysisWorkflow(workflow, i + 1, workflowDependencies));
    }
}
for (const workflow of testingWorkflows) {
    for (let i = 0; i < workflow.runs; i++) {
        workflowRunLabels.push(`${workflow.slug} (testing, run ${i + 1})`);
        workflowRuns.push(executeTestingWorkflow(workflow, i + 1, workflowDependencies));
    }
}

// allSettled (rather than all) so that one failing run never hides the others.
const workflowsResults = await Promise.allSettled(workflowRuns);

let succeededCount = 0;
const failures: { label: string; reason: unknown }[] = [];
workflowsResults.forEach((result, i) => {
    if (result.status === "rejected") {
        failures.push({ label: workflowRunLabels[i] ?? `#${i + 1}`, reason: result.reason });
    } else {
        succeededCount++;
    }
});

console.log(`Workflows completed. Succeeded: ${succeededCount}; Failed: ${failures.length}`);
for (const { label, reason } of failures) {
    console.warn(`Workflow '${label}' failed:`, reason);
}

console.log("index.ts done");
@@ -0,0 +1,69 @@
1
+ import {z} from "zod";
2
+
/** One glob-based file lookup: the pattern to match, the directory to scan, and matches to leave out. */
export const FileSearchEntrySchema = z.object({
    file_glob: z.string().min(1),
    search_directory: z.string().default("."),
    excluded_files: z.array(z.string()).default([]),
});

/** Fields shared by every workflow kind. */
export const BaseWorkflowEntrySchema = z.object({
    slug: z.string(),
    // `runs` is used as a loop bound by the entry script, so it has to be a
    // whole number; a fractional value would silently be rounded up.
    runs: z.number().int().min(1).default(1),
    input_files_searches: z.array(FileSearchEntrySchema).default([]),
    output_filename: z.string().min(1),
});

/** An analysis workflow: the shared fields plus the prompt sent as the system message. */
export const AnalysisWorkflowEntrySchema = BaseWorkflowEntrySchema.extend({
    prompt: z.string(),
});
19
+
/**
 * Selects how a test case's output is evaluated.
 * The testing workflow treats `None` as a plain substring check and `Full` as
 * an LLM judgement of the whole sanitized output; `Diff` is declared here but
 * is reported as unsupported by the workflow.
 */
export enum LLMJudgeInputModeEnum {
    None = "NONE",
    Diff = "DIFF",
    Full = "FULL",
}
const LLMJudgeInputModeSchema = z.enum(LLMJudgeInputModeEnum);

/** Describes how command output is trimmed and what it is expected to contain. */
const ExpectedOutputSchema = z.object({
    prefix_strip_string: z.string().min(0),
    postfix_strip_string: z.string().min(0),
    substring: z.string().min(0),
    llm_judge_input_mode: LLMJudgeInputModeSchema.default(LLMJudgeInputModeEnum.None),
    llm_judge_prompt: z.string().min(0),
});

/** A single test: the command to run, where to run it, and the expected output. */
export const TestCaseSchema = z.object({
    name: z.string(),
    work_directory: z.string().default("."),
    single_run_command: z.string(),
    single_run_expected_output: ExpectedOutputSchema,
    // Defaults to an empty list so configs need not spell out a field whose
    // feature the testing workflow does not support yet.
    interactive_steps: z.array(z.object({
        input: z.string(),
        expected_output: ExpectedOutputSchema,
    })).default([]),
});

/** A testing workflow: the shared fields (minus file searches) plus setup, test cases and cleanup. */
export const TestingWorkflowEntrySchema = BaseWorkflowEntrySchema.extend({
    setup_commands: z.array(z.string()).default([]),
    test_cases: z.array(TestCaseSchema).default([]),
    cleanup_commands: z.array(z.string()).default([]),
}).omit({
    input_files_searches: true,
});
53
+
/** Root schema of the `epf.toml` configuration file. */
export const ConfigSchema = z.object({
    openrouter: z.object({
        api_key: z.string(),
        model: z.string(),
    }),
    hyperparameters: z.object({
        // Token limits are counts, so fractional values are rejected.
        max_completion_tokens: z.number().int().min(1).default(20000),
        // OpenRouter documents sampling temperature on a 0-2 scale.
        temperature: z.number().min(0).max(2).default(0.9),
        top_p: z.number().min(0).max(1).default(1),
        frequency_penalty: z.number().min(-2).max(2).default(0),
        presence_penalty: z.number().min(-2).max(2).default(0),
        reasoning_effort: z.enum(["low", "medium", "high"]).default("high"),
    }),
    // Either list may be omitted, so a config that only uses one workflow
    // kind does not have to declare an empty array for the other.
    analysis_workflows: z.array(AnalysisWorkflowEntrySchema).default([]),
    testing_workflows: z.array(TestingWorkflowEntrySchema).default([]),
});
@@ -0,0 +1,64 @@
1
+ import os from "node:os";
2
+ import { readFileSync, existsSync } from "node:fs";
3
+
4
+ import {z} from "zod";
5
+
6
+ import {ConfigSchema} from "./config-schema.ts";
7
+
8
+
const homeDir: string = os.homedir();
const defaultConfigFileName = "epf.toml";
const configURLEnvVar = "EPF_CONFIG_URL";

type Config = z.infer<typeof ConfigSchema>;

/**
 * Picks the config location. Precedence: first command-line argument, then
 * the EPF_CONFIG_URL environment variable, then `epf.toml` in the current
 * directory, then `epf.toml` in the home directory.
 * @throws Error when none of the sources yields a location
 */
function locateConfig(): string {
    const [, , cliArgument] = process.argv;
    if (cliArgument !== undefined) {
        console.log(`Found config from command line argument: ${cliArgument}`);
        return cliArgument;
    }

    const fromEnvironment = process.env[configURLEnvVar];
    if (fromEnvironment) {
        console.log(`Found config from environment variable ${configURLEnvVar}`);
        return fromEnvironment;
    }

    if (existsSync(defaultConfigFileName)) {
        console.log(`Found config from current directory`);
        return defaultConfigFileName;
    }

    const homeCandidate = `${homeDir}/${defaultConfigFileName}`;
    if (existsSync(homeCandidate)) {
        console.log(`Found config from home directory`);
        return homeCandidate;
    }

    throw new Error(`Config file ${defaultConfigFileName} not found`);
}

/**
 * Returns the raw config text, fetched over HTTP(S) when the location looks
 * like a URL and read from disk otherwise.
 * @throws Error when the HTTP response status is not OK
 */
async function loadConfigText(location: string): Promise<string> {
    const isRemote = /^https?:\/\//.test(location);
    if (!isRemote) {
        console.log(`Loading config from file: ${location}`);
        return readFileSync(location).toString();
    }

    console.log(`Fetching config from URL: ${location}`);
    const response = await fetch(location);
    if (!response.ok) {
        throw new Error(`Failed to fetch config from URL: ${location}, status: ${response.status}`);
    }
    return await response.text();
}

/**
 * Locates, loads, parses (TOML) and validates the configuration.
 * @returns the validated configuration object
 * @throws Error when no config is found, the fetch fails, or validation fails
 */
async function readConfig() {
    console.log(`Loading config`);

    const configFilePath = locateConfig();
    const configFileContents = await loadConfigText(configFilePath);

    const parsedConfig = ConfigSchema.safeParse(Bun.TOML.parse(configFileContents));
    if (!parsedConfig.success) {
        console.error("Config file is invalid:", parsedConfig.error.format());
        throw new Error("Config file is invalid");
    }
    console.log(`Config loaded from ${configFilePath}`);
    return parsedConfig.data as Config;
}

// Loaded once at module evaluation; importers receive the validated config.
export const CONFIG = await readConfig();
@@ -0,0 +1,87 @@
/**
 * Helper class for generating file content payloads with language-specific formatting
 */
export class FilePayloadGenerator {
    // A Map (rather than a plain object) so that file names such as
    // "constructor" can never resolve to inherited Object.prototype members.
    private static readonly LANGUAGE_MAP: Map<string, string> = new Map([
        ['.cs', 'csharp'],
        ['.cpp', 'cpp'],
        ['.cc', 'cpp'],
        ['.cxx', 'cpp'],
        ['.c', 'c'],
        ['.h', 'cpp'],
        ['.hpp', 'cpp'],
        ['.hxx', 'cpp'],
        ['.java', 'java'],
        ['.js', 'javascript'],
        ['.ts', 'typescript'],
        ['.py', 'python'],
        ['.rb', 'ruby'],
        ['.go', 'go'],
        ['.rs', 'rust'],
        ['.swift', 'swift'],
        ['.kt', 'kotlin'],
        ['.php', 'php'],
        ['.scala', 'scala'],
        ['.sh', 'bash'],
        ['.bash', 'bash'],
        ['.zsh', 'zsh'],
        ['.ps1', 'powershell'],
        ['.md', 'markdown'],
        ['.json', 'json'],
        ['.xml', 'xml'],
        ['.yaml', 'yaml'],
        ['.yml', 'yaml'],
        ['.toml', 'toml'],
        ['.html', 'html'],
        ['.css', 'css'],
        ['.scss', 'scss'],
        ['.sql', 'sql'],
    ]);

    /**
     * Get the language identifier for a file based on its extension
     * @param filePath The path to the file
     * @returns The language identifier (e.g., 'csharp', 'cpp'), or 'text' when
     *          the file name has no extension or the extension is unknown
     */
    private static getLanguageFromPath(filePath: string): string {
        // Only the final path segment is inspected so a dot in a directory
        // name is never mistaken for the start of an extension.
        const lastSeparator = Math.max(filePath.lastIndexOf('/'), filePath.lastIndexOf('\\'));
        const baseName = filePath.substring(lastSeparator + 1);
        const dotIndex = baseName.lastIndexOf('.');
        if (dotIndex === -1) {
            return 'text';
        }
        return this.LANGUAGE_MAP.get(baseName.substring(dotIndex).toLowerCase()) || 'text';
    }

    /**
     * Generate a formatted payload for a single file
     * @param filePath The path to the file
     * @param content The content of the file
     * @returns A formatted string with file path and content in a code block
     */
    static formatFileContent(filePath: string, content: string): string {
        const language = this.getLanguageFromPath(filePath);
        // The fence must be longer than any backtick run inside the content;
        // otherwise a file that itself contains ``` would end the block early.
        const backtickRuns = content.match(/`+/g) ?? [];
        const longestRun = backtickRuns.reduce((longest, run) => Math.max(longest, run.length), 0);
        const fence = '`'.repeat(Math.max(3, longestRun + 1));
        return `${filePath}\n${fence}${language}\n${content}\n${fence}`;
    }

    /**
     * Generate payloads for multiple files
     * @param files Array of file paths
     * @returns Array of formatted file content strings, in the same order as `files`
     */
    static async generatePayloads(files: string[]): Promise<string[]> {
        // Reads are independent, so they are issued together; Promise.all
        // preserves the input order in its result.
        return Promise.all(
            files.map(async (file) => this.formatFileContent(file, await Bun.file(file).text())),
        );
    }

    /**
     * Add a custom language mapping
     * @param extension The file extension (e.g., '.custom'); a missing leading dot is added
     * @param language The language identifier to use
     */
    static addLanguageMapping(extension: string, language: string): void {
        const normalized = extension.toLowerCase();
        // Lookups always include the dot, so a key without one could never match.
        this.LANGUAGE_MAP.set(normalized.startsWith('.') ? normalized : `.${normalized}`, language);
    }
}
87
+
@@ -0,0 +1,88 @@
1
+ import {Glob} from "bun";
2
+
3
+ import {CONFIG} from "../util/config.ts";
4
+ import {FilePayloadGenerator} from "../util/file-payload.ts";
5
+ import type {WorkflowDependencies} from "./index.ts";
6
+ import chalk from "chalk";
7
+
8
+
/**
 * Runs one analysis workflow: collects the input files described by the
 * workflow's file searches, sends them to the configured model together with
 * the workflow prompt, and writes the completion text to the output file.
 *
 * The output file name may contain the placeholders `[seed]`, `[slug]`,
 * `[model]` and `[run]`.
 *
 * @param workflow The analysis workflow entry from the config
 * @param runNum 1-based run number of this workflow (used for `[run]` and the seed)
 * @param deps Shared dependencies; `deps.openRouter` is used for the request
 * @returns Resolves when the output file is written, or early when no input files were found
 */
export async function executeAnalysisWorkflow(workflow: typeof CONFIG.analysis_workflows[number], runNum: number, deps: WorkflowDependencies) {
    console.log(`Executing analysis workflow: ${workflow.slug}`);
    const log = (...args: Parameters<typeof console.log>) => {
        console.log(chalk.cyan(`[${workflow.slug}]`), ...args);
    };
    const warn = (...args: Parameters<typeof console.warn>) => {
        console.warn(chalk.red(`[${workflow.slug}]`), ...args);
    };

    const allFiles = (
        await Promise.all(
            workflow.input_files_searches.map(async (fileSearch) => {
                const fileExclusionsSet = new Set(fileSearch.excluded_files);
                const glob = new Glob(fileSearch.file_glob);
                const searchDirectory = fileSearch.search_directory;
                const scansWorkingDirectory = searchDirectory === "." || searchDirectory === "";
                const matches: string[] = [];
                for await (const file of glob.scan(searchDirectory)) {
                    // Exclusions are compared against the path as reported by
                    // the scan, i.e. relative to the search directory.
                    if (fileExclusionsSet.has(file)) {
                        log(`Excluding file: ${file}`);
                        continue;
                    }
                    // Scan results are relative to the scanned directory, but
                    // the files are later opened relative to the process
                    // working directory, so the search directory is prepended.
                    matches.push(
                        scansWorkingDirectory
                            ? file
                            : `${searchDirectory.replace(/[\\/]+$/, "")}/${file}`,
                    );
                }
                log(`Found ${matches.length} files for search: ${fileSearch.file_glob} in ${fileSearch.search_directory}`, matches);
                return matches;
            })
        )
    ).flat();

    if (allFiles.length === 0) {
        warn(`No files found for workflow, skipping...`);
        return;
    }
    log(`Found ${allFiles.length} files for workflow`);
    const fileContentsPayload = await FilePayloadGenerator.generatePayloads(allFiles);

    log("Sending chat completion request...");
    const startTime = Date.now();
    // All runs of a workflow are started concurrently, usually within the same
    // second; offsetting by the run number keeps their seeds distinct.
    const seed = Math.floor(Date.now() / 1000) + (runNum - 1);
    // NOTE(review): hyperparameters.top_p is validated in the config schema
    // but is not forwarded in this request — confirm whether that is intended.
    const completion = await deps.openRouter.chat.send({
        model: CONFIG.openrouter.model,
        maxCompletionTokens: CONFIG.hyperparameters.max_completion_tokens,
        messages: [
            {
                role: "system",
                content: workflow.prompt,
            },
            {
                role: "user",
                content: fileContentsPayload.map((file) => {
                    return {
                        type: "text",
                        text: file,
                    };
                }),
            },
        ],
        stream: false,
        seed: seed,
        frequencyPenalty: CONFIG.hyperparameters.frequency_penalty,
        presencePenalty: CONFIG.hyperparameters.presence_penalty,
        temperature: CONFIG.hyperparameters.temperature,
        reasoning: {
            effort: CONFIG.hyperparameters.reasoning_effort,
        },
    });
    log(`Completion response generated in ${(Date.now() - startTime) / 1000} seconds`);
    if (completion.choices.length < 1) {
        warn("No choices returned from completion");
        console.log(completion);
    }
    // An empty completion still produces an (empty) output file.
    const completionText = completion.choices[0]?.message.content?.toString() ?? "";
    // TODO: Add more template variables
    const outputFileName = workflow.output_filename
        .replaceAll("[seed]", seed.toString())
        .replaceAll("[slug]", workflow.slug)
        // Slashes in model ids would otherwise be treated as path separators.
        .replaceAll("[model]", `(${completion.model.replaceAll("/", "--")})`)
        .replaceAll("[run]", runNum.toString());
    await Bun.write(outputFileName, completionText);
    log(`Completion written to ${outputFileName}`);
}
@@ -0,0 +1,5 @@
1
+ import type {OpenRouter} from "@openrouter/sdk";
2
+
/** Shared services handed to every workflow executor. */
export type WorkflowDependencies = {
    /** OpenRouter SDK client the workflows use to send chat requests. */
    openRouter: OpenRouter;
};
@@ -0,0 +1,152 @@
1
+ import {$} from "bun";
2
+
3
+ import {CONFIG} from "../util/config.ts";
4
+ import chalk from "chalk";
5
+ import {LLMJudgeInputModeEnum} from "../util/config-schema.ts";
6
+ import type {WorkflowDependencies} from "./index.ts";
7
+
/**
 * Trims command output for evaluation: drops everything up to and including
 * the first occurrence of `prefix`, then everything from the last occurrence
 * of `postfix` onward. Empty or absent markers leave the output unchanged.
 */
function stripOutputMarkers(output: string, prefix: string, postfix: string): string {
    let stripped = output;
    if (prefix.length > 0) {
        const prefixIndex = stripped.indexOf(prefix);
        if (prefixIndex !== -1) {
            stripped = stripped.substring(prefixIndex + prefix.length);
        }
    }
    if (postfix.length > 0) {
        const postfixIndex = stripped.lastIndexOf(postfix);
        if (postfixIndex !== -1) {
            stripped = stripped.substring(0, postfixIndex);
        }
    }
    return stripped;
}

/**
 * Runs one testing workflow: executes the setup commands, evaluates every test
 * case (by substring match or by LLM judge, depending on the case's
 * `llm_judge_input_mode`), executes the cleanup commands, and prints a result
 * table. Errors inside a single test case are logged and recorded as a failure
 * for that case; they do not abort the workflow.
 *
 * @param workflow The testing workflow entry from the config
 * @param runNum 1-based run number of this workflow (currently unused here)
 * @param deps Shared dependencies; `deps.openRouter` is used for LLM judging
 */
export async function executeTestingWorkflow(workflow: typeof CONFIG.testing_workflows[number], runNum: number, deps: WorkflowDependencies) {
    console.log(`Executing testing workflow: ${workflow.slug}`);
    const log = (...args: Parameters<typeof console.log>) => {
        console.log(chalk.cyan(`[${workflow.slug}]`), ...args);
    };
    const warn = (...args: Parameters<typeof console.warn>) => {
        console.warn(chalk.red(`[${workflow.slug}]`), ...args);
    };
    const debug = (...args: Parameters<typeof console.debug>) => {
        console.debug(chalk.gray(`[${workflow.slug}]`), ...args.map(e => typeof e === "string" ? chalk.gray(e) : e));
    };

    log(`$PATH: ${process.env.PATH}`);

    for (const command of workflow.setup_commands) {
        log(`Running setup command: ${command}`);
        // Commands come verbatim from the config file and are run unescaped.
        await $`${{raw: command}}`.nothrow();
    }

    // Pre-filled so that skipped cases show up in the summary as explicit
    // failures instead of leaving holes in the arrays.
    const testCasesResults: boolean[] = Array.from({length: workflow.test_cases.length}, () => false);
    const testCasesResultsExplanation: string[] = Array.from({length: workflow.test_cases.length}, () => "");
    for (let i = 0; i < workflow.test_cases.length; i++) {
        try {
            const testCase = workflow.test_cases[i]!;
            const expected = testCase.single_run_expected_output;
            log(`Running test case ${i + 1}/${workflow.test_cases.length}: ${testCase.name}`);

            if (testCase.interactive_steps.length > 0) {
                warn("Interactive steps are not supported in this version. Skipping interactive steps.");
                testCasesResultsExplanation[i] = "Skipped: interactive steps are not supported";
                continue;
            }

            const {stdout, stderr, exitCode} = await $`${{raw: testCase.single_run_command}}`.cwd(testCase.work_directory).nothrow().quiet();
            console.log(); // Blank line for readability
            debug(`Test case stdout (${stdout.length}):\n${stdout}`);
            debug(`Test case stderr (${stderr.length}):\n${stderr}`);
            debug(`Exit code: ${exitCode}`);

            const commandOutput = stripOutputMarkers(
                stdout.toString(),
                expected.prefix_strip_string,
                expected.postfix_strip_string,
            );
            debug("Sanitized command output for evaluation:\n", commandOutput);

            switch (expected.llm_judge_input_mode) {
                case LLMJudgeInputModeEnum.None: {
                    // Evaluate the sanitized output so that the configured
                    // prefix/postfix stripping actually affects the verdict.
                    if (commandOutput.includes(expected.substring)) {
                        log(`Test case '${testCase.name}' passed: expected substring found in output.`);
                        testCasesResults[i] = true;
                    }
                    else {
                        warn(`Test case '${testCase.name}' failed: expected substring NOT found in output.`);
                        testCasesResults[i] = false;
                    }
                    break;
                }
                case LLMJudgeInputModeEnum.Full: {
                    log("Evaluating full output with LLM judge...");
                    const seed = Math.floor(Date.now() / 1000);
                    const completion = await deps.openRouter.chat.send({
                        model: CONFIG.openrouter.model,
                        maxCompletionTokens: CONFIG.hyperparameters.max_completion_tokens,
                        messages: [
                            {
                                role: "system",
                                content: expected.llm_judge_prompt,
                            },
                            {
                                role: "user",
                                content: JSON.stringify({
                                    "expected_output_substring": expected.substring,
                                    "actual_output": commandOutput,
                                }),
                            },
                        ],
                        stream: false,
                        seed: seed,
                        frequencyPenalty: CONFIG.hyperparameters.frequency_penalty,
                        presencePenalty: CONFIG.hyperparameters.presence_penalty,
                        // Judging ignores the configured temperature and always uses 0.
                        temperature: 0,
                        reasoning: {
                            effort: CONFIG.hyperparameters.reasoning_effort,
                        },
                    });
                    if (completion.choices.length < 1) {
                        warn("No choices returned from completion");
                        console.log(completion);
                    }
                    const completionText = completion.choices[0]?.message.content?.toString() ?? "";
                    log(`LLM judge completion:\n${completionText}`);
                    const llmJudgeResult = completionText.toLowerCase().includes("pass"); // TODO: More robust parsing
                    if (llmJudgeResult) {
                        log(chalk.green(`Test case '${testCase.name}' passed according to LLM judge.`));
                        testCasesResults[i] = true;
                    }
                    else {
                        warn(chalk.yellowBright(`Test case '${testCase.name}' failed according to LLM judge.`));
                        testCasesResults[i] = false;
                    }

                    // The explanation is optional: a non-JSON reply only
                    // costs the summary text, not the verdict above.
                    try {
                        const judgeResultObject = JSON.parse(completionText);

                        if ("summary" in judgeResultObject) {
                            testCasesResultsExplanation[i] = judgeResultObject.summary;
                        }
                    } catch (e) {
                        warn("Failed to parse LLM judge output as JSON. Make sure the LLM prompt requests JSON output.");
                    }
                    break;
                }
                default:
                    warn(`LLM judge input mode '${expected.llm_judge_input_mode}' is not supported in this version. Skipping LLM judging.`);
                    testCasesResultsExplanation[i] = "Skipped: unsupported LLM judge input mode";
                    break;
            }
        } catch (e) {
            warn(`Error occurred while executing test case ${i + 1}:`, e);
            testCasesResults[i] = false;
            testCasesResultsExplanation[i] = `Error: ${e instanceof Error ? e.message : String(e)}`;
        }
    }
    for (const command of workflow.cleanup_commands) {
        log(`Running cleanup command: ${command}`);
        await $`${{raw: command}}`.nothrow();
    }

    const passedCount = testCasesResults.filter((r) => r).length;
    log(`Testing workflow completed. Passed ${passedCount}/${workflow.test_cases.length} test cases.`);
    console.table(testCasesResults.map((entry, idx) => {
        return [
            workflow.test_cases[idx]?.name,
            entry ? chalk.green("PASS") : chalk.red("FAIL"),
            testCasesResultsExplanation[idx] || "",
        ];
    }));

    log(`Finished testing workflow: ${workflow.slug}`);
}
package/tsconfig.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "compilerOptions": {
3
+ // Environment setup & latest features
4
+ "lib": ["ESNext"],
5
+ "target": "ESNext",
6
+ "module": "Preserve",
7
+ "moduleDetection": "force",
8
+ "jsx": "react-jsx",
9
+ "allowJs": true,
10
+
11
+ // Bundler mode
12
+ "moduleResolution": "bundler",
13
+ "allowImportingTsExtensions": true,
14
+ "verbatimModuleSyntax": true,
15
+ "noEmit": true,
16
+
17
+ // Best practices
18
+ "strict": true,
19
+ "skipLibCheck": true,
20
+ "noFallthroughCasesInSwitch": true,
21
+ "noUncheckedIndexedAccess": true,
22
+ "noImplicitOverride": true,
23
+
24
+ // Some stricter flags (disabled by default)
25
+ "noUnusedLocals": false,
26
+ "noUnusedParameters": false,
27
+ "noPropertyAccessFromIndexSignature": false
28
+ }
29
+ }