even-pf 0.2.4 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/even-pf.js +47 -0
- package/package.json +19 -12
- package/.gitignore +0 -4
- package/bun.lock +0 -42
- package/bunfig.toml +0 -2
- package/epf.example.toml +0 -58
- package/src/generate-config.ts +0 -40
- package/src/index.ts +0 -59
- package/src/util/args.ts +0 -37
- package/src/util/config-schema.ts +0 -80
- package/src/util/config.ts +0 -69
- package/src/util/eval-harness.ts +0 -18
- package/src/util/file-payload.ts +0 -87
- package/src/util/llm.ts +0 -80
- package/src/util/output-viewer.ts +0 -54
- package/src/workflow/analysis-workflow.ts +0 -60
- package/src/workflow/index.ts +0 -9
- package/src/workflow/testing-workflow.ts +0 -127
- package/tsconfig.json +0 -29
package/bin/even-pf.js
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
#!/usr/bin/env node
"use strict";

// even-pf launcher shim.
//
// Resolves the precompiled platform-specific binary shipped in one of the
// optionalDependencies sub-packages and re-executes it with the caller's
// arguments, forwarding stdio and the child's exit status.
//
// NOTE(review): shebang changed from `bun` to `node` — this wrapper is pure
// CommonJS (require / spawnSync only), and an npm-installed bin script must
// run for consumers who do not have bun on PATH. Confirm bun-only features
// are never needed here.

const { spawnSync } = require("child_process");

// Map `${process.platform}-${process.arch}` to the optional sub-package that
// ships the compiled binary, and the binary's filename inside that package.
const PLATFORM_MAP = {
  "linux-x64": { pkg: "even-pf-linux-x64", bin: "even-pf" },
  "linux-arm64": { pkg: "even-pf-linux-arm64", bin: "even-pf" },
  "win32-x64": { pkg: "even-pf-windows-x64", bin: "even-pf.exe" },
  "darwin-x64": { pkg: "even-pf-darwin-x64", bin: "even-pf" },
  "darwin-arm64": { pkg: "even-pf-darwin-arm64", bin: "even-pf" },
};

const key = `${process.platform}-${process.arch}`;
const entry = PLATFORM_MAP[key];

if (!entry) {
  console.error(
    `even-pf: Unsupported platform/architecture: ${key}\n` +
      `Supported: ${Object.keys(PLATFORM_MAP).join(", ")}`
  );
  process.exit(1);
}

let binaryPath;
try {
  // Resolve the binary inside the optionally-installed platform sub-package.
  binaryPath = require.resolve(`${entry.pkg}/bin/${entry.bin}`);
} catch {
  console.error(
    `even-pf: Could not find the platform binary package "${entry.pkg}".\n` +
      `Try reinstalling even-pf, or install the package manually:\n` +
      `  npm install ${entry.pkg}`
  );
  process.exit(1);
}

const result = spawnSync(binaryPath, process.argv.slice(2), { stdio: "inherit" });

if (result.error) {
  console.error(`even-pf: Failed to launch binary: ${result.error.message}`);
  process.exit(1);
}

// If the child was terminated by a signal, spawnSync reports status: null and
// sets result.signal. Re-raise the same signal on this process so callers see
// the true termination reason instead of a spurious exit code 0.
if (result.signal) {
  process.kill(process.pid, result.signal);
}

process.exit(result.status ?? 0);
|
package/package.json
CHANGED
|
@@ -1,16 +1,21 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "even-pf",
|
|
3
|
-
"version": "0.2.
|
|
4
|
-
"
|
|
3
|
+
"version": "0.2.6",
|
|
4
|
+
"description": "AI-assisted responsible grading tool for programming assignments",
|
|
5
|
+
"module": "src/cli.ts",
|
|
5
6
|
"type": "module",
|
|
6
7
|
"license": "UNLICENSED",
|
|
7
8
|
"scripts": {
|
|
8
|
-
"start": "bun run src/
|
|
9
|
-
"build": "bun build src/
|
|
9
|
+
"start": "bun run src/cli.ts",
|
|
10
|
+
"build-dev": "bun build src/cli.ts --compile --outfile build/epf",
|
|
11
|
+
"build:all": "bun scripts/build-all.ts",
|
|
12
|
+
"publish:all": "bun scripts/publish-all.ts",
|
|
13
|
+
"publish:dry": "bun scripts/publish-all.ts --dry-run",
|
|
14
|
+
"bump": "bun scripts/bump-version.ts",
|
|
10
15
|
"config-gen": "bun run --console-depth 6 src/generate-config.ts"
|
|
11
16
|
},
|
|
12
17
|
"bin": {
|
|
13
|
-
"even-pf": "
|
|
18
|
+
"even-pf": "bin/even-pf.js"
|
|
14
19
|
},
|
|
15
20
|
"devDependencies": {
|
|
16
21
|
"@types/bun": "latest"
|
|
@@ -24,14 +29,16 @@
|
|
|
24
29
|
"smol-toml": "^1.6.0",
|
|
25
30
|
"zod-defaults": "^0.2.3"
|
|
26
31
|
},
|
|
32
|
+
"optionalDependencies": {
|
|
33
|
+
"even-pf-linux-x64": "0.2.6",
|
|
34
|
+
"even-pf-linux-arm64": "0.2.6",
|
|
35
|
+
"even-pf-windows-x64": "0.2.6",
|
|
36
|
+
"even-pf-darwin-x64": "0.2.6",
|
|
37
|
+
"even-pf-darwin-arm64": "0.2.6"
|
|
38
|
+
},
|
|
27
39
|
"files": [
|
|
28
|
-
".
|
|
29
|
-
"src",
|
|
30
|
-
"bun.lock",
|
|
31
|
-
"bunfig.toml",
|
|
32
|
-
"epf.example.toml",
|
|
40
|
+
"bin/even-pf.js",
|
|
33
41
|
"package.json",
|
|
34
|
-
"README.md"
|
|
35
|
-
"tsconfig.json"
|
|
42
|
+
"README.md"
|
|
36
43
|
]
|
|
37
44
|
}
|
package/.gitignore
DELETED
package/bun.lock
DELETED
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"lockfileVersion": 1,
|
|
3
|
-
"configVersion": 1,
|
|
4
|
-
"workspaces": {
|
|
5
|
-
"": {
|
|
6
|
-
"name": "tools",
|
|
7
|
-
"dependencies": {
|
|
8
|
-
"@openrouter/sdk": "^0.5.1",
|
|
9
|
-
"chalk": "^5.6.2",
|
|
10
|
-
"smol-toml": "^1.6.0",
|
|
11
|
-
"zod-defaults": "^0.2.3",
|
|
12
|
-
},
|
|
13
|
-
"devDependencies": {
|
|
14
|
-
"@types/bun": "latest",
|
|
15
|
-
},
|
|
16
|
-
"peerDependencies": {
|
|
17
|
-
"typescript": "^5.9.3",
|
|
18
|
-
},
|
|
19
|
-
},
|
|
20
|
-
},
|
|
21
|
-
"packages": {
|
|
22
|
-
"@openrouter/sdk": ["@openrouter/sdk@0.5.1", "", { "dependencies": { "zod": "^3.25.0 || ^4.0.0" } }, "sha512-Kl0N1jIj7A3lnkM5dO3SGP8JP3jAozzs6JWcHVuZUBt5DsGKxFGNH1Y15bCfsJiLNA2ylAQpCN3aNcgEYkkL5Q=="],
|
|
23
|
-
|
|
24
|
-
"@types/bun": ["@types/bun@1.3.9", "", { "dependencies": { "bun-types": "1.3.9" } }, "sha512-KQ571yULOdWJiMH+RIWIOZ7B2RXQGpL1YQrBtLIV3FqDcCu6FsbFUBwhdKUlCKUpS3PJDsHlJ1QKlpxoVR+xtw=="],
|
|
25
|
-
|
|
26
|
-
"@types/node": ["@types/node@24.10.1", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-GNWcUTRBgIRJD5zj+Tq0fKOJ5XZajIiBroOF0yvj2bSU1WvNdYS/dn9UxwsujGW4JX06dnHyjV2y9rRaybH0iQ=="],
|
|
27
|
-
|
|
28
|
-
"bun-types": ["bun-types@1.3.9", "", { "dependencies": { "@types/node": "*" } }, "sha512-+UBWWOakIP4Tswh0Bt0QD0alpTY8cb5hvgiYeWCMet9YukHbzuruIEeXC2D7nMJPB12kbh8C7XJykSexEqGKJg=="],
|
|
29
|
-
|
|
30
|
-
"chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="],
|
|
31
|
-
|
|
32
|
-
"smol-toml": ["smol-toml@1.6.0", "", {}, "sha512-4zemZi0HvTnYwLfrpk/CF9LOd9Lt87kAt50GnqhMpyF9U3poDAP2+iukq2bZsO/ufegbYehBkqINbsWxj4l4cw=="],
|
|
33
|
-
|
|
34
|
-
"typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
|
|
35
|
-
|
|
36
|
-
"undici-types": ["undici-types@7.16.0", "", {}, "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw=="],
|
|
37
|
-
|
|
38
|
-
"zod": ["zod@4.1.12", "", {}, "sha512-JInaHOamG8pt5+Ey8kGmdcAcg3OL9reK8ltczgHTAwNhMys/6ThXHityHxVV2p3fkw/c+MAvBHFVYHFZDmjMCQ=="],
|
|
39
|
-
|
|
40
|
-
"zod-defaults": ["zod-defaults@0.2.3", "", { "peerDependencies": { "zod": "^4.1.12" } }, "sha512-7pYkOH1/c+Ril5AZUYtsbhMkehkI8CMqzFZ7YZXfC9SMLRvZuLyonQE7BAIVSNxeTpqTZmW5BLxGSzWMnKNdIw=="],
|
|
41
|
-
}
|
|
42
|
-
}
|
package/bunfig.toml
DELETED
package/epf.example.toml
DELETED
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
[llm.models.general_analysis]
|
|
2
|
-
sdk = "openrouter"
|
|
3
|
-
model_name = ""
|
|
4
|
-
max_completion_tokens = 20000
|
|
5
|
-
temperature = 0.9
|
|
6
|
-
top_p = 1
|
|
7
|
-
frequency_penalty = 0
|
|
8
|
-
presence_penalty = 0
|
|
9
|
-
reasoning_effort = "high"
|
|
10
|
-
|
|
11
|
-
[llm.models.output_comparison]
|
|
12
|
-
sdk = "openrouter"
|
|
13
|
-
model_name = ""
|
|
14
|
-
max_completion_tokens = 20000
|
|
15
|
-
temperature = 0
|
|
16
|
-
top_p = 1
|
|
17
|
-
frequency_penalty = 0
|
|
18
|
-
presence_penalty = 0
|
|
19
|
-
reasoning_effort = "high"
|
|
20
|
-
|
|
21
|
-
[llm.prompt_replacement]
|
|
22
|
-
role = "role_placeholder"
|
|
23
|
-
|
|
24
|
-
[vendors.openrouter]
|
|
25
|
-
api_key = ""
|
|
26
|
-
|
|
27
|
-
[[analysis_workflows]]
|
|
28
|
-
slug = ""
|
|
29
|
-
model = "general_analysis"
|
|
30
|
-
runs = 1
|
|
31
|
-
output_filename = ""
|
|
32
|
-
prompt = ""
|
|
33
|
-
|
|
34
|
-
[[analysis_workflows.input_files_searches]]
|
|
35
|
-
file_glob = ""
|
|
36
|
-
search_directory = "."
|
|
37
|
-
excluded_files = []
|
|
38
|
-
|
|
39
|
-
[[testing_workflows]]
|
|
40
|
-
slug = ""
|
|
41
|
-
model = "general_analysis"
|
|
42
|
-
runs = 1
|
|
43
|
-
output_filename = ""
|
|
44
|
-
setup_commands = []
|
|
45
|
-
cleanup_commands = []
|
|
46
|
-
|
|
47
|
-
[[testing_workflows.test_cases]]
|
|
48
|
-
name = ""
|
|
49
|
-
work_directory = "."
|
|
50
|
-
single_run_command = ""
|
|
51
|
-
interactive_steps = []
|
|
52
|
-
|
|
53
|
-
[testing_workflows.test_cases.single_run_expected_output]
|
|
54
|
-
prefix_strip_string = ""
|
|
55
|
-
postfix_strip_string = ""
|
|
56
|
-
substring = ""
|
|
57
|
-
llm_judge_input_mode = "NONE"
|
|
58
|
-
llm_judge_prompt = ""
|
package/src/generate-config.ts
DELETED
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
import {stringify} from "smol-toml";
|
|
2
|
-
import {getDefaultsForSchema} from "zod-defaults";
|
|
3
|
-
|
|
4
|
-
import {
|
|
5
|
-
ConfigSchema,
|
|
6
|
-
AnalysisWorkflowEntrySchema,
|
|
7
|
-
FileSearchEntrySchema,
|
|
8
|
-
TestingWorkflowEntrySchema, TestCaseSchema, LLMConfigSchema, ModelConfigSchema
|
|
9
|
-
} from "./util/config-schema.ts";
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
console.log("generate-config.ts");
|
|
13
|
-
|
|
14
|
-
let defaultConfig = getDefaultsForSchema(ConfigSchema);
|
|
15
|
-
|
|
16
|
-
let defaultLLMConfig = getDefaultsForSchema(LLMConfigSchema);
|
|
17
|
-
defaultLLMConfig.prompt_replacement["role"] = "role_placeholder";
|
|
18
|
-
let defaultModelConfig = getDefaultsForSchema(ModelConfigSchema);
|
|
19
|
-
defaultLLMConfig.models["general_analysis"] = defaultModelConfig;
|
|
20
|
-
defaultLLMConfig.models["output_comparison"] = structuredClone(defaultModelConfig);
|
|
21
|
-
defaultLLMConfig.models["output_comparison"].temperature = 0;
|
|
22
|
-
defaultConfig.llm = defaultLLMConfig;
|
|
23
|
-
|
|
24
|
-
let defaultAnalysisWorkflowEntry = getDefaultsForSchema(AnalysisWorkflowEntrySchema);
|
|
25
|
-
let defaultFileSearchEntry = getDefaultsForSchema(FileSearchEntrySchema);
|
|
26
|
-
defaultAnalysisWorkflowEntry.input_files_searches = [defaultFileSearchEntry];
|
|
27
|
-
defaultConfig.analysis_workflows = [defaultAnalysisWorkflowEntry];
|
|
28
|
-
|
|
29
|
-
let defaultTestingWorkflowEntry = getDefaultsForSchema(TestingWorkflowEntrySchema);
|
|
30
|
-
defaultTestingWorkflowEntry.test_cases = [getDefaultsForSchema(TestCaseSchema)];
|
|
31
|
-
defaultConfig.testing_workflows = [defaultTestingWorkflowEntry];
|
|
32
|
-
|
|
33
|
-
console.log(defaultConfig);
|
|
34
|
-
|
|
35
|
-
const tomlString = stringify(defaultConfig);
|
|
36
|
-
const outputFilename = "epf.example.toml";
|
|
37
|
-
await Bun.file(outputFilename).write(tomlString);
|
|
38
|
-
console.log(`Default config written to ${outputFilename}`);
|
|
39
|
-
|
|
40
|
-
console.log("generate-config.ts done");
|
package/src/index.ts
DELETED
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bun
|
|
2
|
-
|
|
3
|
-
import {OpenRouter} from "@openrouter/sdk";
|
|
4
|
-
|
|
5
|
-
import {CONFIG} from "./util/config.ts";
|
|
6
|
-
import {executeTestingWorkflow} from "./workflow/testing-workflow.ts";
|
|
7
|
-
import {executeAnalysisWorkflow} from "./workflow/analysis-workflow.ts";
|
|
8
|
-
import type {WorkflowDependencies} from "./workflow";
|
|
9
|
-
import {OutputViewer} from "./util/output-viewer.ts";
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
console.log("EPF index.ts");
|
|
13
|
-
|
|
14
|
-
const workflowDependencies: WorkflowDependencies = {
|
|
15
|
-
seed: Math.floor(Date.now() / 1000),
|
|
16
|
-
openRouter: new OpenRouter({
|
|
17
|
-
apiKey: CONFIG.vendors.openrouter.api_key,
|
|
18
|
-
}),
|
|
19
|
-
outputViewer: new OutputViewer(),
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
// Parallelize workflows with Promise.allSettled
|
|
23
|
-
const analysisWorkflows = CONFIG.analysis_workflows;
|
|
24
|
-
const testingWorkflows = CONFIG.testing_workflows;
|
|
25
|
-
console.log(`Starting execution of ${analysisWorkflows.length} workflows...`);
|
|
26
|
-
console.log(analysisWorkflows.map((w) => w.slug));
|
|
27
|
-
let workflowRuns: Promise<void>[] = [];
|
|
28
|
-
analysisWorkflows.forEach((workflow) => {
|
|
29
|
-
for (let i = 0; i < workflow.runs; i++) {
|
|
30
|
-
workflowRuns.push(executeAnalysisWorkflow(workflow, i+1, workflowDependencies));
|
|
31
|
-
}
|
|
32
|
-
});
|
|
33
|
-
testingWorkflows.forEach((workflow) => {
|
|
34
|
-
for (let i = 0; i < workflow.runs; i++) {
|
|
35
|
-
workflowRuns.push(executeTestingWorkflow(workflow, i+1, workflowDependencies));
|
|
36
|
-
}
|
|
37
|
-
});
|
|
38
|
-
|
|
39
|
-
const workflowsResults = await Promise.allSettled(workflowRuns);
|
|
40
|
-
// Summarize with indices to include slugs in failure logs
|
|
41
|
-
const failedIndices: number[] = [];
|
|
42
|
-
const succeededIndices: number[] = [];
|
|
43
|
-
workflowsResults.forEach((r, i) => {
|
|
44
|
-
if (r.status === "rejected") failedIndices.push(i);
|
|
45
|
-
else succeededIndices.push(i);
|
|
46
|
-
});
|
|
47
|
-
|
|
48
|
-
console.log(`Workflows completed. Succeeded: ${succeededIndices.length}; Failed: ${failedIndices.length}`);
|
|
49
|
-
if (failedIndices.length > 0) {
|
|
50
|
-
failedIndices.forEach((i) => {
|
|
51
|
-
const r = workflowsResults[i] as PromiseRejectedResult;
|
|
52
|
-
const slug = analysisWorkflows[i]?.slug ?? `#${i + 1}`;
|
|
53
|
-
console.warn(`Workflow '${slug}' failed:`, r.reason);
|
|
54
|
-
});
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
workflowDependencies.outputViewer.display();
|
|
58
|
-
|
|
59
|
-
console.log("index.ts done");
|
package/src/util/args.ts
DELETED
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
import {parseArgs} from "util";
|
|
2
|
-
|
|
3
|
-
// console.log(Bun.argv);
|
|
4
|
-
export const ARGS = parseArgs({
|
|
5
|
-
args: Bun.argv,
|
|
6
|
-
options: {
|
|
7
|
-
// mode: {
|
|
8
|
-
// type: "string",
|
|
9
|
-
// short: "M",
|
|
10
|
-
// default: "run",
|
|
11
|
-
// },
|
|
12
|
-
config: {
|
|
13
|
-
type: "string",
|
|
14
|
-
short: "C",
|
|
15
|
-
},
|
|
16
|
-
dir: {
|
|
17
|
-
type: "string",
|
|
18
|
-
short: "D",
|
|
19
|
-
default: ".",
|
|
20
|
-
},
|
|
21
|
-
skip_workflow: {
|
|
22
|
-
type: "string",
|
|
23
|
-
short: "S",
|
|
24
|
-
multiple: true,
|
|
25
|
-
},
|
|
26
|
-
completion_inputs_destination: {
|
|
27
|
-
type: "string",
|
|
28
|
-
},
|
|
29
|
-
},
|
|
30
|
-
strict: true,
|
|
31
|
-
allowPositionals: true,
|
|
32
|
-
});
|
|
33
|
-
|
|
34
|
-
// export enum RunMode {
|
|
35
|
-
// Run = "run",
|
|
36
|
-
// Eval = "eval",
|
|
37
|
-
// }
|
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
import {z} from "zod";
|
|
2
|
-
|
|
3
|
-
export const ModelConfigSchema = z.object({
|
|
4
|
-
sdk: z.enum(["openrouter"]).default("openrouter"),
|
|
5
|
-
model_name: z.string().default(""),
|
|
6
|
-
max_completion_tokens: z.number().min(1).default(20000),
|
|
7
|
-
temperature: z.number().min(0).max(1).default(0.9),
|
|
8
|
-
top_p: z.number().min(0).max(1).default(1),
|
|
9
|
-
frequency_penalty: z.number().min(-2).max(2).default(0),
|
|
10
|
-
presence_penalty: z.number().min(-2).max(2).default(0),
|
|
11
|
-
reasoning_effort: z.enum(["low", "medium", "high"]).default("high"),
|
|
12
|
-
});
|
|
13
|
-
|
|
14
|
-
export const LLMConfigSchema = z.object({
|
|
15
|
-
models: z.record(z.string(), ModelConfigSchema),
|
|
16
|
-
prompt_replacement: z.record(z.string(), z.string()),
|
|
17
|
-
});
|
|
18
|
-
|
|
19
|
-
export const FileSearchEntrySchema = z.object({
|
|
20
|
-
file_glob: z.string().min(1),
|
|
21
|
-
search_directory: z.string().default("."),
|
|
22
|
-
excluded_files: z.array(z.string()).default([]),
|
|
23
|
-
});
|
|
24
|
-
|
|
25
|
-
export const BaseWorkflowEntrySchema = z.object({
|
|
26
|
-
slug: z.string(),
|
|
27
|
-
model: z.string().default("general_analysis"),
|
|
28
|
-
runs: z.number().min(1).default(1),
|
|
29
|
-
input_files_searches: z.array(FileSearchEntrySchema).default([]),
|
|
30
|
-
output_filename: z.string().min(1),
|
|
31
|
-
});
|
|
32
|
-
|
|
33
|
-
export const AnalysisWorkflowEntrySchema = BaseWorkflowEntrySchema.extend({
|
|
34
|
-
prompt: z.string(),
|
|
35
|
-
})
|
|
36
|
-
|
|
37
|
-
export enum LLMJudgeInputModeEnum{
|
|
38
|
-
None = "NONE",
|
|
39
|
-
Diff = "DIFF",
|
|
40
|
-
Full = "FULL",
|
|
41
|
-
}
|
|
42
|
-
const LLMJudgeInputModeSchema = z.enum(LLMJudgeInputModeEnum);
|
|
43
|
-
|
|
44
|
-
const ExpectedOutputSchema = z.object({
|
|
45
|
-
prefix_strip_string: z.string().min(0),
|
|
46
|
-
postfix_strip_string: z.string().min(0),
|
|
47
|
-
substring: z.string().min(0),
|
|
48
|
-
llm_judge_input_mode: LLMJudgeInputModeSchema.default(LLMJudgeInputModeEnum.None),
|
|
49
|
-
llm_judge_prompt: z.string().min(0),
|
|
50
|
-
});
|
|
51
|
-
|
|
52
|
-
export const TestCaseSchema = z.object({
|
|
53
|
-
name: z.string(),
|
|
54
|
-
work_directory: z.string().default("."),
|
|
55
|
-
single_run_command: z.string(),
|
|
56
|
-
single_run_expected_output: ExpectedOutputSchema,
|
|
57
|
-
interactive_steps: z.array(z.object({
|
|
58
|
-
input: z.string(),
|
|
59
|
-
expected_output: ExpectedOutputSchema,
|
|
60
|
-
})),
|
|
61
|
-
});
|
|
62
|
-
|
|
63
|
-
export const TestingWorkflowEntrySchema = BaseWorkflowEntrySchema.extend({
|
|
64
|
-
setup_commands: z.array(z.string()).default([]),
|
|
65
|
-
test_cases: z.array(TestCaseSchema).default([]),
|
|
66
|
-
cleanup_commands: z.array(z.string()).default([]),
|
|
67
|
-
}).omit({
|
|
68
|
-
input_files_searches: true,
|
|
69
|
-
});
|
|
70
|
-
|
|
71
|
-
export const ConfigSchema = z.object({
|
|
72
|
-
llm: LLMConfigSchema,
|
|
73
|
-
vendors: z.object({
|
|
74
|
-
openrouter: z.object({
|
|
75
|
-
api_key: z.string(),
|
|
76
|
-
}),
|
|
77
|
-
}),
|
|
78
|
-
analysis_workflows: z.array(AnalysisWorkflowEntrySchema),
|
|
79
|
-
testing_workflows: z.array(TestingWorkflowEntrySchema),
|
|
80
|
-
});
|
package/src/util/config.ts
DELETED
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
import os from "node:os";
|
|
2
|
-
import { readFileSync, existsSync } from "node:fs";
|
|
3
|
-
|
|
4
|
-
import {z} from "zod";
|
|
5
|
-
|
|
6
|
-
import {ARGS} from "./args.ts";
|
|
7
|
-
import {ConfigSchema} from "./config-schema.ts";
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
const homeDir: string = os.homedir();
|
|
11
|
-
const defaultConfigFileName = "epf.toml";
|
|
12
|
-
const configURLEnvVar = "EPF_CONFIG_URL";
|
|
13
|
-
|
|
14
|
-
type Config = z.infer<typeof ConfigSchema>;
|
|
15
|
-
|
|
16
|
-
async function readConfig() {
|
|
17
|
-
console.log(`Loading config`);
|
|
18
|
-
|
|
19
|
-
let configFilePath: string;
|
|
20
|
-
if (ARGS.values.config && ARGS.values.config.trim().length > 0) {
|
|
21
|
-
configFilePath = ARGS.values.config.trim();
|
|
22
|
-
console.log(`Found config from command line argument: ${configFilePath}`);
|
|
23
|
-
}
|
|
24
|
-
else if (process.env[configURLEnvVar]) {
|
|
25
|
-
configFilePath = process.env[configURLEnvVar]!;
|
|
26
|
-
console.log(`Found config from environment variable ${configURLEnvVar}`);
|
|
27
|
-
}
|
|
28
|
-
else {
|
|
29
|
-
if (existsSync(defaultConfigFileName)) {
|
|
30
|
-
configFilePath = defaultConfigFileName;
|
|
31
|
-
console.log(`Found config from current directory`);
|
|
32
|
-
}
|
|
33
|
-
else if (existsSync(`${homeDir}/${defaultConfigFileName}`)) {
|
|
34
|
-
configFilePath = `${homeDir}/${defaultConfigFileName}`;
|
|
35
|
-
console.log(`Found config from home directory`);
|
|
36
|
-
}
|
|
37
|
-
else {
|
|
38
|
-
throw new Error(`Config file ${defaultConfigFileName} not found`);
|
|
39
|
-
}
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
let configFileContents: string;
|
|
43
|
-
if (/^https?:\/\//.test(configFilePath)) {
|
|
44
|
-
console.log(`Fetching config from URL: ${configFilePath}`);
|
|
45
|
-
const configResponse = await fetch(configFilePath);
|
|
46
|
-
if (!configResponse.ok) {
|
|
47
|
-
throw new Error(`Failed to fetch config from URL: ${configFilePath}, status: ${configResponse.status}`);
|
|
48
|
-
}
|
|
49
|
-
configFileContents = await configResponse.text();
|
|
50
|
-
} else {
|
|
51
|
-
console.log(`Loading config from local file: ${configFilePath}`);
|
|
52
|
-
configFileContents = readFileSync(configFilePath).toString();
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
console.assert(configFileContents.trim().length > 0, "Config file is empty");
|
|
56
|
-
|
|
57
|
-
let obj = Bun.TOML.parse(configFileContents);
|
|
58
|
-
const parsedConfig = ConfigSchema.safeParse(obj);
|
|
59
|
-
if (!parsedConfig.success) {
|
|
60
|
-
console.error("Config file is invalid:", parsedConfig.error.format());
|
|
61
|
-
console.log(`Config file contents:\n${configFileContents}`);
|
|
62
|
-
console.log(parsedConfig);
|
|
63
|
-
throw new Error("Config file is invalid");
|
|
64
|
-
}
|
|
65
|
-
console.log(`Config loaded from ${configFilePath}`);
|
|
66
|
-
return parsedConfig.data as Config;
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
export const CONFIG = await readConfig();
|
package/src/util/eval-harness.ts
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
import { appendFile } from "node:fs/promises";
|
|
2
|
-
|
|
3
|
-
import {ARGS} from "./args.ts";
|
|
4
|
-
|
|
5
|
-
export async function recordCompletionInput(inputs: {role: string, content: any}[]) {
|
|
6
|
-
let completionInputsDestination = ARGS.values.completion_inputs_destination;
|
|
7
|
-
if (!completionInputsDestination) {
|
|
8
|
-
return;
|
|
9
|
-
}
|
|
10
|
-
|
|
11
|
-
if (!(await Bun.file(completionInputsDestination).exists())){
|
|
12
|
-
console.warn(`Completion inputs destination file ${completionInputsDestination} does not exist`);
|
|
13
|
-
return;
|
|
14
|
-
}
|
|
15
|
-
|
|
16
|
-
await appendFile(completionInputsDestination, JSON.stringify(completionInputsDestination)+"\n");
|
|
17
|
-
console.log(`Recorded completion inputs to ${completionInputsDestination}`);
|
|
18
|
-
}
|
package/src/util/file-payload.ts
DELETED
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Helper class for generating file content payloads with language-specific formatting
|
|
3
|
-
*/
|
|
4
|
-
export class FilePayloadGenerator {
|
|
5
|
-
private static readonly LANGUAGE_MAP: Record<string, string> = {
|
|
6
|
-
'.cs': 'csharp',
|
|
7
|
-
'.cpp': 'cpp',
|
|
8
|
-
'.cc': 'cpp',
|
|
9
|
-
'.cxx': 'cpp',
|
|
10
|
-
'.c': 'c',
|
|
11
|
-
'.h': 'cpp',
|
|
12
|
-
'.hpp': 'cpp',
|
|
13
|
-
'.hxx': 'cpp',
|
|
14
|
-
'.java': 'java',
|
|
15
|
-
'.js': 'javascript',
|
|
16
|
-
'.ts': 'typescript',
|
|
17
|
-
'.py': 'python',
|
|
18
|
-
'.rb': 'ruby',
|
|
19
|
-
'.go': 'go',
|
|
20
|
-
'.rs': 'rust',
|
|
21
|
-
'.swift': 'swift',
|
|
22
|
-
'.kt': 'kotlin',
|
|
23
|
-
'.php': 'php',
|
|
24
|
-
'.scala': 'scala',
|
|
25
|
-
'.sh': 'bash',
|
|
26
|
-
'.bash': 'bash',
|
|
27
|
-
'.zsh': 'zsh',
|
|
28
|
-
'.ps1': 'powershell',
|
|
29
|
-
'.md': 'markdown',
|
|
30
|
-
'.json': 'json',
|
|
31
|
-
'.xml': 'xml',
|
|
32
|
-
'.yaml': 'yaml',
|
|
33
|
-
'.yml': 'yaml',
|
|
34
|
-
'.toml': 'toml',
|
|
35
|
-
'.html': 'html',
|
|
36
|
-
'.css': 'css',
|
|
37
|
-
'.scss': 'scss',
|
|
38
|
-
'.sql': 'sql',
|
|
39
|
-
};
|
|
40
|
-
|
|
41
|
-
/**
|
|
42
|
-
* Get the language identifier for a file based on its extension
|
|
43
|
-
* @param filePath The path to the file
|
|
44
|
-
* @returns The language identifier (e.g., 'csharp', 'cpp')
|
|
45
|
-
*/
|
|
46
|
-
private static getLanguageFromPath(filePath: string): string {
|
|
47
|
-
const ext = filePath.substring(filePath.lastIndexOf('.')).toLowerCase();
|
|
48
|
-
return this.LANGUAGE_MAP[ext] || 'text';
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
/**
|
|
52
|
-
* Generate a formatted payload for a single file
|
|
53
|
-
* @param filePath The path to the file
|
|
54
|
-
* @param content The content of the file
|
|
55
|
-
* @returns A formatted string with file path and content in a code block
|
|
56
|
-
*/
|
|
57
|
-
static formatFileContent(filePath: string, content: string): string {
|
|
58
|
-
const language = this.getLanguageFromPath(filePath);
|
|
59
|
-
return `${filePath}\n\`\`\`${language}\n${content}\n\`\`\``;
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
/**
|
|
63
|
-
* Generate payloads for multiple files
|
|
64
|
-
* @param files Array of file paths
|
|
65
|
-
* @returns Array of formatted file content strings
|
|
66
|
-
*/
|
|
67
|
-
static async generatePayloads(files: string[]): Promise<string[]> {
|
|
68
|
-
const payloads: string[] = [];
|
|
69
|
-
|
|
70
|
-
for (const file of files) {
|
|
71
|
-
const content = await Bun.file(file).text();
|
|
72
|
-
payloads.push(this.formatFileContent(file, content));
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
return payloads;
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
/**
|
|
79
|
-
* Add a custom language mapping
|
|
80
|
-
* @param extension The file extension (including the dot, e.g., '.custom')
|
|
81
|
-
* @param language The language identifier to use
|
|
82
|
-
*/
|
|
83
|
-
static addLanguageMapping(extension: string, language: string): void {
|
|
84
|
-
this.LANGUAGE_MAP[extension.toLowerCase()] = language;
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
|
package/src/util/llm.ts
DELETED
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
import type {SystemMessage, UserMessage} from "@openrouter/sdk/models";
|
|
2
|
-
|
|
3
|
-
import {CONFIG} from "./config.ts";
|
|
4
|
-
import type {WorkflowDependencies} from "../workflow";
|
|
5
|
-
import {recordCompletionInput} from "./eval-harness.ts";
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
export async function generateCompletion(deps: WorkflowDependencies,
|
|
9
|
-
log: (..._: any[])=>void,
|
|
10
|
-
warn: (..._: any[])=>void,
|
|
11
|
-
model: string,
|
|
12
|
-
systemPrompt: string,
|
|
13
|
-
content: UserMessage["content"]) {
|
|
14
|
-
let modelSettings = CONFIG.llm.models[model];
|
|
15
|
-
if (!modelSettings) {
|
|
16
|
-
throw new Error(`No model settings found for model "${model}"`);
|
|
17
|
-
}
|
|
18
|
-
|
|
19
|
-
let replacedCount = 0;
|
|
20
|
-
for (const [replacementKey, replacementValue] of Object.entries(CONFIG.llm.prompt_replacement)) {
|
|
21
|
-
if (systemPrompt.includes(replacementKey)) {replacedCount++}
|
|
22
|
-
systemPrompt = systemPrompt.replaceAll(`{{${replacementKey}}}`, replacementValue);
|
|
23
|
-
if (typeof content === "string") {
|
|
24
|
-
if (content.includes(replacementKey)) {replacedCount++}
|
|
25
|
-
content = content.replaceAll(`{{${replacementKey}}}`, replacementValue);
|
|
26
|
-
}
|
|
27
|
-
else {
|
|
28
|
-
for (let i = 0; i < content.length; i++) {
|
|
29
|
-
const element = content[i];
|
|
30
|
-
if (element && "type" in element && element.type === "text" && typeof element.text === "string") {
|
|
31
|
-
if (element.text.includes(replacementKey)) {replacedCount++}
|
|
32
|
-
content[i] = {
|
|
33
|
-
...element,
|
|
34
|
-
text: element.text.replaceAll(`{{${replacementKey}}}`, replacementValue),
|
|
35
|
-
}
|
|
36
|
-
}
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
}
|
|
40
|
-
}
|
|
41
|
-
log(`Replaced ${replacedCount} instances of prompt variables in system prompt and content`);
|
|
42
|
-
|
|
43
|
-
let messages: (SystemMessage | UserMessage)[] = [
|
|
44
|
-
{
|
|
45
|
-
role: "system",
|
|
46
|
-
content: systemPrompt,
|
|
47
|
-
},
|
|
48
|
-
{
|
|
49
|
-
role: "user",
|
|
50
|
-
content: content,
|
|
51
|
-
}
|
|
52
|
-
];
|
|
53
|
-
setTimeout(async ()=> await recordCompletionInput(messages), 5);
|
|
54
|
-
|
|
55
|
-
log("Sending chat completion request...");
|
|
56
|
-
let startTime = Date.now();
|
|
57
|
-
let completion = await deps.openRouter.chat.send({
|
|
58
|
-
model: modelSettings.model_name,
|
|
59
|
-
maxCompletionTokens: modelSettings.max_completion_tokens,
|
|
60
|
-
messages: messages,
|
|
61
|
-
stream: false,
|
|
62
|
-
seed: deps.seed,
|
|
63
|
-
frequencyPenalty: modelSettings.frequency_penalty,
|
|
64
|
-
presencePenalty: modelSettings.presence_penalty,
|
|
65
|
-
temperature: modelSettings.temperature,
|
|
66
|
-
reasoning: {
|
|
67
|
-
effort: modelSettings.reasoning_effort,
|
|
68
|
-
},
|
|
69
|
-
});
|
|
70
|
-
log(`Completion response generated in ${(Date.now() - startTime) / 1000} seconds`);
|
|
71
|
-
if (completion.choices.length < 1){
|
|
72
|
-
warn("No choices returned from completion");
|
|
73
|
-
console.log(completion);
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
return {
|
|
77
|
-
text: completion.choices[0]?.message.content?.toString() ?? "",
|
|
78
|
-
model: completion.model,
|
|
79
|
-
};
|
|
80
|
-
}
|
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
import chalk from "chalk";
|
|
2
|
-
|
|
3
|
-
type FileRecord = {
|
|
4
|
-
type: "markdown" | "text";
|
|
5
|
-
content: string;
|
|
6
|
-
}
|
|
7
|
-
|
|
8
|
-
export class OutputViewer {
|
|
9
|
-
filesRecords: Record<string, FileRecord> = {};
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
addFile(filename: string, _: FileRecord): void {
|
|
13
|
-
this.filesRecords[filename] = _;
|
|
14
|
-
}
|
|
15
|
-
|
|
16
|
-
private serve(): void {
|
|
17
|
-
let files = Object.entries(this.filesRecords).sort((a, b) => a[0].localeCompare(b[0]));
|
|
18
|
-
|
|
19
|
-
let server = Bun.serve({
|
|
20
|
-
port: 0,
|
|
21
|
-
routes: {
|
|
22
|
-
"/:slug": (req) => {
|
|
23
|
-
let slug = req.params.slug;
|
|
24
|
-
console.log(`Request for slug: "${slug}"`);
|
|
25
|
-
return new Response(this.filesRecords[slug]?.content ?? "Not Found");
|
|
26
|
-
}
|
|
27
|
-
},
|
|
28
|
-
fetch(req) {
|
|
29
|
-
return new Response("Not Found (fallback)", { status: 404 });
|
|
30
|
-
},
|
|
31
|
-
});
|
|
32
|
-
console.log(server.url);
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
display() {
|
|
36
|
-
if (Object.keys(this.filesRecords).length === 0) {
|
|
37
|
-
console.warn("No files to display");
|
|
38
|
-
return;
|
|
39
|
-
}
|
|
40
|
-
|
|
41
|
-
console.log("Click the following links to view the outputs in your browser:");
|
|
42
|
-
|
|
43
|
-
const FRONTEND_URL = "https://ta-tools-dashboard.vercel.app/tools/md-viewer"; //TODO: not hardcode this
|
|
44
|
-
let files = Object.entries(this.filesRecords).sort((a, b) => a[0].localeCompare(b[0]));
|
|
45
|
-
for (const [filename, fileRecord] of files) {
|
|
46
|
-
let params = new URLSearchParams();
|
|
47
|
-
params.set("name", filename);
|
|
48
|
-
params.set("comp", "gzip");
|
|
49
|
-
params.set("data", Bun.gzipSync(fileRecord.content).toBase64());
|
|
50
|
-
let url = `${FRONTEND_URL}#${params.toString()}`;
|
|
51
|
-
console.log(`${chalk.cyan(filename)}: ${url}`);
|
|
52
|
-
}
|
|
53
|
-
}
|
|
54
|
-
}
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
import {Glob} from "bun";
|
|
2
|
-
|
|
3
|
-
import chalk from "chalk";
|
|
4
|
-
|
|
5
|
-
import {CONFIG} from "../util/config.ts";
|
|
6
|
-
import {FilePayloadGenerator} from "../util/file-payload.ts";
|
|
7
|
-
import type {WorkflowDependencies} from "./index.ts";
|
|
8
|
-
import {generateCompletion} from "../util/llm.ts";
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
export async function executeAnalysisWorkflow(workflow: typeof CONFIG.analysis_workflows[number], runNum: number, deps: WorkflowDependencies) {
|
|
12
|
-
console.log(`Executing analysis workflow: ${workflow.slug}`);
|
|
13
|
-
const log = (...args: Parameters<typeof console.log>) => {
|
|
14
|
-
console.log(chalk.cyan(`[${workflow.slug}]`), ...args);
|
|
15
|
-
}
|
|
16
|
-
const warn = (...args: Parameters<typeof console.warn>) => {
|
|
17
|
-
console.warn(chalk.red(`[${workflow.slug}]`), ...args);
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
let allFiles = (
|
|
21
|
-
await Promise.all(
|
|
22
|
-
workflow.input_files_searches.map(async (fileSearch) => {
|
|
23
|
-
const fileExclusionsSet = new Set(fileSearch.excluded_files);
|
|
24
|
-
const glob = new Glob(fileSearch.file_glob);
|
|
25
|
-
const matches: string[] = [];
|
|
26
|
-
for await (const file of glob.scan(fileSearch.search_directory)) {
|
|
27
|
-
if (fileExclusionsSet.has(file)) {
|
|
28
|
-
log(`Excluding file: ${file}`);
|
|
29
|
-
continue;
|
|
30
|
-
}
|
|
31
|
-
matches.push(file);
|
|
32
|
-
}
|
|
33
|
-
log(`Found ${matches.length} files for search: ${fileSearch.file_glob} in ${fileSearch.search_directory}`, matches);
|
|
34
|
-
return matches;
|
|
35
|
-
})
|
|
36
|
-
)
|
|
37
|
-
).flat();
|
|
38
|
-
|
|
39
|
-
if (allFiles.length === 0) {
|
|
40
|
-
warn(`No files found for workflow, skipping...`);
|
|
41
|
-
return;
|
|
42
|
-
}
|
|
43
|
-
log(`Found ${allFiles.length} files for workflow`);
|
|
44
|
-
const fileContentsPayload = await FilePayloadGenerator.generatePayloads(allFiles);
|
|
45
|
-
const completion = await generateCompletion(deps, log, warn, workflow.model, workflow.prompt, fileContentsPayload.map((file) => {
|
|
46
|
-
return {
|
|
47
|
-
type: "text",
|
|
48
|
-
text: file,
|
|
49
|
-
}
|
|
50
|
-
}));
|
|
51
|
-
|
|
52
|
-
const outputFileName = workflow.output_filename
|
|
53
|
-
.replaceAll("[seed]", deps.seed.toString())
|
|
54
|
-
.replaceAll("[slug]", workflow.slug)
|
|
55
|
-
.replaceAll("[model]", `(${completion.model.replaceAll("/", "--")})`)
|
|
56
|
-
.replaceAll("[run]", runNum.toString());
|
|
57
|
-
await Bun.write(outputFileName, completion.text);
|
|
58
|
-
log(`Completion written to ${outputFileName}`);
|
|
59
|
-
deps.outputViewer.addFile(outputFileName, {type: "markdown", content: completion.text});
|
|
60
|
-
}
|
package/src/workflow/index.ts
DELETED
|
@@ -1,127 +0,0 @@
|
|
|
1
|
-
import {$} from "bun";
|
|
2
|
-
|
|
3
|
-
import {CONFIG} from "../util/config.ts";
|
|
4
|
-
import chalk from "chalk";
|
|
5
|
-
import {LLMJudgeInputModeEnum} from "../util/config-schema.ts";
|
|
6
|
-
import type {WorkflowDependencies} from "./index.ts";
|
|
7
|
-
import {generateCompletion} from "../util/llm.ts";
|
|
8
|
-
|
|
9
|
-
export async function executeTestingWorkflow(workflow: typeof CONFIG.testing_workflows[number], runNum: number, deps: WorkflowDependencies) {
|
|
10
|
-
console.log(`Executing testing workflow: ${workflow.slug}`);
|
|
11
|
-
const log = (...args: Parameters<typeof console.log>) => {
|
|
12
|
-
console.log(chalk.cyan(`[${workflow.slug}]`), ...args);
|
|
13
|
-
}
|
|
14
|
-
const warn = (...args: Parameters<typeof console.warn>) => {
|
|
15
|
-
console.warn(chalk.red(`[${workflow.slug}]`), ...args);
|
|
16
|
-
}
|
|
17
|
-
const debug = (...args: Parameters<typeof console.debug>) => {
|
|
18
|
-
console.debug(chalk.gray(`[${workflow.slug}]`), ...args.map(e => typeof e === "string" ? chalk.gray(e) : e));
|
|
19
|
-
}
|
|
20
|
-
|
|
21
|
-
log(`$PATH: ${process.env.PATH}`);
|
|
22
|
-
|
|
23
|
-
for (const command of workflow.setup_commands){
|
|
24
|
-
log(`Running setup command: ${command}`);
|
|
25
|
-
await $`${{raw: command}}`.nothrow();
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
let testCasesResults: boolean[] = new Array(workflow.test_cases.length);
|
|
29
|
-
let testCasesResultsExplanation: string[] = new Array(workflow.test_cases.length);
|
|
30
|
-
for (let i = 0; i < workflow.test_cases.length; i++) {
|
|
31
|
-
try {
|
|
32
|
-
const testCase = workflow.test_cases[i]!;
|
|
33
|
-
log(`Running test case ${i + 1}/${workflow.test_cases.length}: ${testCase.name}`);
|
|
34
|
-
|
|
35
|
-
if (testCase.interactive_steps.length > 0) {
|
|
36
|
-
warn("Interactive steps are not supported in this version. Skipping interactive steps.");
|
|
37
|
-
continue;
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
let {stdout, stderr, exitCode} = await $`${{raw: testCase.single_run_command}}`.cwd(testCase.work_directory).nothrow().quiet();
|
|
41
|
-
console.log(); // Blank line for readability
|
|
42
|
-
debug(`Test case stdout (${stdout.length}):\n${stdout}`);
|
|
43
|
-
debug(`Test case stderr (${stderr.length}):\n${stderr}`);
|
|
44
|
-
debug(`Exit code: ${exitCode}`);
|
|
45
|
-
|
|
46
|
-
let commandOutput = stdout.toString();
|
|
47
|
-
if (testCase.single_run_expected_output.prefix_strip_string.length > 0) {
|
|
48
|
-
let prefixIndex = commandOutput.indexOf(testCase.single_run_expected_output.prefix_strip_string);
|
|
49
|
-
if (prefixIndex !== -1) {
|
|
50
|
-
commandOutput = commandOutput.substring(prefixIndex + testCase.single_run_expected_output.prefix_strip_string.length);
|
|
51
|
-
}
|
|
52
|
-
}
|
|
53
|
-
if (testCase.single_run_expected_output.postfix_strip_string.length > 0) {
|
|
54
|
-
let postfixIndex = commandOutput.lastIndexOf(testCase.single_run_expected_output.postfix_strip_string);
|
|
55
|
-
if (postfixIndex !== -1) {
|
|
56
|
-
commandOutput = commandOutput.substring(0, postfixIndex);
|
|
57
|
-
}
|
|
58
|
-
}
|
|
59
|
-
debug("Sanitized command output for evaluation:\n", commandOutput);
|
|
60
|
-
|
|
61
|
-
if (testCase.single_run_expected_output.llm_judge_input_mode == LLMJudgeInputModeEnum.None){
|
|
62
|
-
if (stdout.includes(testCase.single_run_expected_output.substring)) {
|
|
63
|
-
log(`Test case '${testCase.name}' passed: expected substring found in output.`);
|
|
64
|
-
testCasesResults[i] = true;
|
|
65
|
-
}
|
|
66
|
-
else {
|
|
67
|
-
warn(`Test case '${testCase.name}' failed: expected substring NOT found in output.`);
|
|
68
|
-
testCasesResults[i] = false;
|
|
69
|
-
}
|
|
70
|
-
}
|
|
71
|
-
else {
|
|
72
|
-
switch (testCase.single_run_expected_output.llm_judge_input_mode) {
|
|
73
|
-
case LLMJudgeInputModeEnum.Full:
|
|
74
|
-
log("Evaluating full output with LLM judge...");
|
|
75
|
-
const completion = await generateCompletion(deps, log, warn, workflow.model, testCase.single_run_expected_output.llm_judge_prompt, JSON.stringify({
|
|
76
|
-
"expected_output_substring": testCase.single_run_expected_output.substring,
|
|
77
|
-
"actual_output": commandOutput,
|
|
78
|
-
}));
|
|
79
|
-
const completionText = completion.text;
|
|
80
|
-
log(`LLM judge completion:\n${completionText}`);
|
|
81
|
-
const llmJudgeResult = completionText.toLowerCase().includes("pass"); // TODO: More robust parsing
|
|
82
|
-
if (llmJudgeResult) {
|
|
83
|
-
log(chalk.green(`Test case '${testCase.name}' passed according to LLM judge.`));
|
|
84
|
-
testCasesResults[i] = true;
|
|
85
|
-
}
|
|
86
|
-
else {
|
|
87
|
-
warn(chalk.yellowBright(`Test case '${testCase.name}' failed according to LLM judge.`));
|
|
88
|
-
testCasesResults[i] = false;
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
try {
|
|
92
|
-
const judgeResultObject = JSON.parse(completionText);
|
|
93
|
-
|
|
94
|
-
if ("summary" in judgeResultObject) {
|
|
95
|
-
testCasesResultsExplanation[i] = judgeResultObject.summary;
|
|
96
|
-
}
|
|
97
|
-
} catch (e) {
|
|
98
|
-
warn("Failed to parse LLM judge output as JSON. Make sure the LLM prompt requests JSON output.");
|
|
99
|
-
}
|
|
100
|
-
break;
|
|
101
|
-
default:
|
|
102
|
-
warn(`LLM judge input mode '${testCase.single_run_expected_output.llm_judge_input_mode}' is not supported in this version. Skipping LLM judging.`);
|
|
103
|
-
break;
|
|
104
|
-
}
|
|
105
|
-
}
|
|
106
|
-
} catch (e) {
|
|
107
|
-
warn(`Error occurred while executing test case ${i + 1}:`, e);
|
|
108
|
-
testCasesResults[i] = false;
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
for (const command of workflow.cleanup_commands){
|
|
112
|
-
log(`Running cleanup command: ${command}`);
|
|
113
|
-
await $`${{raw: command}}`.nothrow();
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
const passedCount = testCasesResults.filter((r) => r).length;
|
|
117
|
-
log(`Testing workflow completed. Passed ${passedCount}/${workflow.test_cases.length} test cases.`);
|
|
118
|
-
console.table(testCasesResults.map((entry, idx) => {
|
|
119
|
-
return [
|
|
120
|
-
workflow.test_cases[idx]?.name,
|
|
121
|
-
entry ? chalk.green("PASS") : chalk.red("FAIL"),
|
|
122
|
-
testCasesResultsExplanation[idx] || "",
|
|
123
|
-
];
|
|
124
|
-
}));
|
|
125
|
-
|
|
126
|
-
log(`Finished testing workflow: ${workflow.slug}`);
|
|
127
|
-
}
|
package/tsconfig.json
DELETED
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"compilerOptions": {
|
|
3
|
-
// Environment setup & latest features
|
|
4
|
-
"lib": ["ESNext"],
|
|
5
|
-
"target": "ESNext",
|
|
6
|
-
"module": "Preserve",
|
|
7
|
-
"moduleDetection": "force",
|
|
8
|
-
"jsx": "react-jsx",
|
|
9
|
-
"allowJs": true,
|
|
10
|
-
|
|
11
|
-
// Bundler mode
|
|
12
|
-
"moduleResolution": "bundler",
|
|
13
|
-
"allowImportingTsExtensions": true,
|
|
14
|
-
"verbatimModuleSyntax": true,
|
|
15
|
-
"noEmit": true,
|
|
16
|
-
|
|
17
|
-
// Best practices
|
|
18
|
-
"strict": true,
|
|
19
|
-
"skipLibCheck": true,
|
|
20
|
-
"noFallthroughCasesInSwitch": true,
|
|
21
|
-
"noUncheckedIndexedAccess": true,
|
|
22
|
-
"noImplicitOverride": true,
|
|
23
|
-
|
|
24
|
-
// Some stricter flags (disabled by default)
|
|
25
|
-
"noUnusedLocals": false,
|
|
26
|
-
"noUnusedParameters": false,
|
|
27
|
-
"noPropertyAccessFromIndexSignature": false
|
|
28
|
-
}
|
|
29
|
-
}
|