even-pf 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.gitignore +0 -0
- package/README.md +8 -4
- package/bun.lock +5 -9
- package/bunfig.toml +0 -0
- package/epf.example.toml +21 -5
- package/package.json +4 -4
- package/src/generate-config.ts +11 -1
- package/src/index.ts +2 -1
- package/src/util/args.ts +37 -0
- package/src/util/config-schema.ts +22 -11
- package/src/util/config.ts +9 -4
- package/src/util/eval-harness.ts +18 -0
- package/src/util/file-payload.ts +0 -0
- package/src/util/llm.ts +80 -0
- package/src/util/output-viewer.ts +32 -0
- package/src/workflow/analysis-workflow.ts +12 -41
- package/src/workflow/index.ts +1 -0
- package/src/workflow/testing-workflow.ts +6 -31
- package/tsconfig.json +0 -0
package/.gitignore
CHANGED
|
File without changes
|
package/README.md
CHANGED
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
# Project Even-Playfield (E-PF)
|
|
2
2
|
AI-assisted responsible grading tool for programming assignments. A GPT-wrapper in CLI for CS graders.
|
|
3
3
|
|
|
4
|
+
## Usage
|
|
5
|
+
Make sure you have [Bun](https://bun.com) installed.
|
|
6
|
+
To run:
|
|
7
|
+
```bash
|
|
8
|
+
bunx even-pf [config]
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Development
|
|
4
12
|
To install dependencies:
|
|
5
13
|
```bash
|
|
6
14
|
bun install
|
|
@@ -11,10 +19,6 @@ To install as a tool globally:
|
|
|
11
19
|
bun link
|
|
12
20
|
```
|
|
13
21
|
|
|
14
|
-
To run:
|
|
15
|
-
```bash
|
|
16
|
-
bunx e-pf
|
|
17
|
-
```
|
|
18
22
|
Make sure you have a config file in your home or current directory. Alternatively, you can set the environment variable `EPF_CONFIG_URL`.
|
|
19
23
|
|
|
20
24
|
This project was created using `bun init` in bun v1.3.2. [Bun](https://bun.com) is a fast all-in-one JavaScript runtime.
|
package/bun.lock
CHANGED
|
@@ -7,33 +7,29 @@
|
|
|
7
7
|
"dependencies": {
|
|
8
8
|
"@openrouter/sdk": "^0.5.1",
|
|
9
9
|
"chalk": "^5.6.2",
|
|
10
|
-
"smol-toml": "^1.
|
|
10
|
+
"smol-toml": "^1.6.0",
|
|
11
11
|
"zod-defaults": "^0.2.3",
|
|
12
12
|
},
|
|
13
13
|
"devDependencies": {
|
|
14
14
|
"@types/bun": "latest",
|
|
15
15
|
},
|
|
16
16
|
"peerDependencies": {
|
|
17
|
-
"typescript": "^5",
|
|
17
|
+
"typescript": "^5.9.3",
|
|
18
18
|
},
|
|
19
19
|
},
|
|
20
20
|
},
|
|
21
21
|
"packages": {
|
|
22
22
|
"@openrouter/sdk": ["@openrouter/sdk@0.5.1", "", { "dependencies": { "zod": "^3.25.0 || ^4.0.0" } }, "sha512-Kl0N1jIj7A3lnkM5dO3SGP8JP3jAozzs6JWcHVuZUBt5DsGKxFGNH1Y15bCfsJiLNA2ylAQpCN3aNcgEYkkL5Q=="],
|
|
23
23
|
|
|
24
|
-
"@types/bun": ["@types/bun@1.3.
|
|
24
|
+
"@types/bun": ["@types/bun@1.3.9", "", { "dependencies": { "bun-types": "1.3.9" } }, "sha512-KQ571yULOdWJiMH+RIWIOZ7B2RXQGpL1YQrBtLIV3FqDcCu6FsbFUBwhdKUlCKUpS3PJDsHlJ1QKlpxoVR+xtw=="],
|
|
25
25
|
|
|
26
26
|
"@types/node": ["@types/node@24.10.1", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-GNWcUTRBgIRJD5zj+Tq0fKOJ5XZajIiBroOF0yvj2bSU1WvNdYS/dn9UxwsujGW4JX06dnHyjV2y9rRaybH0iQ=="],
|
|
27
27
|
|
|
28
|
-
"
|
|
29
|
-
|
|
30
|
-
"bun-types": ["bun-types@1.3.2", "", { "dependencies": { "@types/node": "*" }, "peerDependencies": { "@types/react": "^19" } }, "sha512-i/Gln4tbzKNuxP70OWhJRZz1MRfvqExowP7U6JKoI8cntFrtxg7RJK3jvz7wQW54UuvNC8tbKHHri5fy74FVqg=="],
|
|
28
|
+
"bun-types": ["bun-types@1.3.9", "", { "dependencies": { "@types/node": "*" } }, "sha512-+UBWWOakIP4Tswh0Bt0QD0alpTY8cb5hvgiYeWCMet9YukHbzuruIEeXC2D7nMJPB12kbh8C7XJykSexEqGKJg=="],
|
|
31
29
|
|
|
32
30
|
"chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="],
|
|
33
31
|
|
|
34
|
-
"
|
|
35
|
-
|
|
36
|
-
"smol-toml": ["smol-toml@1.5.2", "", {}, "sha512-QlaZEqcAH3/RtNyet1IPIYPsEWAaYyXXv1Krsi+1L/QHppjX4Ifm8MQsBISz9vE8cHicIq3clogsheili5vhaQ=="],
|
|
32
|
+
"smol-toml": ["smol-toml@1.6.0", "", {}, "sha512-4zemZi0HvTnYwLfrpk/CF9LOd9Lt87kAt50GnqhMpyF9U3poDAP2+iukq2bZsO/ufegbYehBkqINbsWxj4l4cw=="],
|
|
37
33
|
|
|
38
34
|
"typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
|
|
39
35
|
|
package/bunfig.toml
CHANGED
|
File without changes
|
package/epf.example.toml
CHANGED
|
@@ -1,8 +1,6 @@
|
|
|
1
|
-
[
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
[hyperparameters]
|
|
1
|
+
[llm.models.general_analysis]
|
|
2
|
+
sdk = "openrouter"
|
|
3
|
+
model_name = ""
|
|
6
4
|
max_completion_tokens = 20000
|
|
7
5
|
temperature = 0.9
|
|
8
6
|
top_p = 1
|
|
@@ -10,8 +8,25 @@ frequency_penalty = 0
|
|
|
10
8
|
presence_penalty = 0
|
|
11
9
|
reasoning_effort = "high"
|
|
12
10
|
|
|
11
|
+
[llm.models.output_comparison]
|
|
12
|
+
sdk = "openrouter"
|
|
13
|
+
model_name = ""
|
|
14
|
+
max_completion_tokens = 20000
|
|
15
|
+
temperature = 0
|
|
16
|
+
top_p = 1
|
|
17
|
+
frequency_penalty = 0
|
|
18
|
+
presence_penalty = 0
|
|
19
|
+
reasoning_effort = "high"
|
|
20
|
+
|
|
21
|
+
[llm.prompt_replacement]
|
|
22
|
+
role = "role_placeholder"
|
|
23
|
+
|
|
24
|
+
[vendors.openrouter]
|
|
25
|
+
api_key = ""
|
|
26
|
+
|
|
13
27
|
[[analysis_workflows]]
|
|
14
28
|
slug = ""
|
|
29
|
+
model = "general_analysis"
|
|
15
30
|
runs = 1
|
|
16
31
|
output_filename = ""
|
|
17
32
|
prompt = ""
|
|
@@ -23,6 +38,7 @@ excluded_files = []
|
|
|
23
38
|
|
|
24
39
|
[[testing_workflows]]
|
|
25
40
|
slug = ""
|
|
41
|
+
model = "general_analysis"
|
|
26
42
|
runs = 1
|
|
27
43
|
output_filename = ""
|
|
28
44
|
setup_commands = []
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "even-pf",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"module": "src/index.ts",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"license": "UNLICENSED",
|
|
@@ -10,18 +10,18 @@
|
|
|
10
10
|
"config-gen": "bun run --console-depth 6 src/generate-config.ts"
|
|
11
11
|
},
|
|
12
12
|
"bin": {
|
|
13
|
-
"
|
|
13
|
+
"even-pf": "src/index.ts"
|
|
14
14
|
},
|
|
15
15
|
"devDependencies": {
|
|
16
16
|
"@types/bun": "latest"
|
|
17
17
|
},
|
|
18
18
|
"peerDependencies": {
|
|
19
|
-
"typescript": "^5"
|
|
19
|
+
"typescript": "^5.9.3"
|
|
20
20
|
},
|
|
21
21
|
"dependencies": {
|
|
22
22
|
"@openrouter/sdk": "^0.5.1",
|
|
23
23
|
"chalk": "^5.6.2",
|
|
24
|
-
"smol-toml": "^1.
|
|
24
|
+
"smol-toml": "^1.6.0",
|
|
25
25
|
"zod-defaults": "^0.2.3"
|
|
26
26
|
},
|
|
27
27
|
"files": [
|
package/src/generate-config.ts
CHANGED
|
@@ -5,17 +5,27 @@ import {
|
|
|
5
5
|
ConfigSchema,
|
|
6
6
|
AnalysisWorkflowEntrySchema,
|
|
7
7
|
FileSearchEntrySchema,
|
|
8
|
-
TestingWorkflowEntrySchema, TestCaseSchema
|
|
8
|
+
TestingWorkflowEntrySchema, TestCaseSchema, LLMConfigSchema, ModelConfigSchema
|
|
9
9
|
} from "./util/config-schema.ts";
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
console.log("generate-config.ts");
|
|
13
13
|
|
|
14
14
|
let defaultConfig = getDefaultsForSchema(ConfigSchema);
|
|
15
|
+
|
|
16
|
+
let defaultLLMConfig = getDefaultsForSchema(LLMConfigSchema);
|
|
17
|
+
defaultLLMConfig.prompt_replacement["role"] = "role_placeholder";
|
|
18
|
+
let defaultModelConfig = getDefaultsForSchema(ModelConfigSchema);
|
|
19
|
+
defaultLLMConfig.models["general_analysis"] = defaultModelConfig;
|
|
20
|
+
defaultLLMConfig.models["output_comparison"] = structuredClone(defaultModelConfig);
|
|
21
|
+
defaultLLMConfig.models["output_comparison"].temperature = 0;
|
|
22
|
+
defaultConfig.llm = defaultLLMConfig;
|
|
23
|
+
|
|
15
24
|
let defaultAnalysisWorkflowEntry = getDefaultsForSchema(AnalysisWorkflowEntrySchema);
|
|
16
25
|
let defaultFileSearchEntry = getDefaultsForSchema(FileSearchEntrySchema);
|
|
17
26
|
defaultAnalysisWorkflowEntry.input_files_searches = [defaultFileSearchEntry];
|
|
18
27
|
defaultConfig.analysis_workflows = [defaultAnalysisWorkflowEntry];
|
|
28
|
+
|
|
19
29
|
let defaultTestingWorkflowEntry = getDefaultsForSchema(TestingWorkflowEntrySchema);
|
|
20
30
|
defaultTestingWorkflowEntry.test_cases = [getDefaultsForSchema(TestCaseSchema)];
|
|
21
31
|
defaultConfig.testing_workflows = [defaultTestingWorkflowEntry];
|
package/src/index.ts
CHANGED
|
@@ -11,8 +11,9 @@ import type {WorkflowDependencies} from "./workflow";
|
|
|
11
11
|
console.log("EPF index.ts");
|
|
12
12
|
|
|
13
13
|
const workflowDependencies: WorkflowDependencies = {
|
|
14
|
+
seed: Math.floor(Date.now() / 1000),
|
|
14
15
|
openRouter: new OpenRouter({
|
|
15
|
-
apiKey: CONFIG.openrouter.api_key,
|
|
16
|
+
apiKey: CONFIG.vendors.openrouter.api_key,
|
|
16
17
|
})
|
|
17
18
|
}
|
|
18
19
|
|
package/src/util/args.ts
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import {parseArgs} from "util";
|
|
2
|
+
|
|
3
|
+
// console.log(Bun.argv);
|
|
4
|
+
/**
 * Command-line arguments, parsed once at module load from `Bun.argv`.
 *
 * Parsing is strict (unknown flags throw) but positionals are allowed, because
 * `Bun.argv` is passed unsliced.
 *
 * Options:
 *  - `--config` / `-C`: string, no default.
 *  - `--dir` / `-D`: string, defaults to ".".
 *  - `--skip_workflow` / `-S`: string, may be repeated (collected into an array).
 *  - `--completion_inputs_destination`: string, no short form, no default.
 *
 * A `mode` option (`-M`, default "run") was sketched here but is currently disabled.
 */
export const ARGS = parseArgs({
    args: Bun.argv,
    strict: true,
    allowPositionals: true,
    options: {
        config: {type: "string", short: "C"},
        dir: {type: "string", short: "D", default: "."},
        skip_workflow: {type: "string", short: "S", multiple: true},
        completion_inputs_destination: {type: "string"},
    },
});
|
|
33
|
+
|
|
34
|
+
// export enum RunMode {
|
|
35
|
+
// Run = "run",
|
|
36
|
+
// Eval = "eval",
|
|
37
|
+
// }
|
|
@@ -1,5 +1,21 @@
|
|
|
1
1
|
import {z} from "zod";
|
|
2
2
|
|
|
3
|
+
/**
 * Settings for one named model entry under `[llm.models.<name>]`.
 *
 * Every field has a default, so an empty table is a valid entry. `model_name`
 * defaults to the empty string and is expected to be filled in by the user.
 */
export const ModelConfigSchema = z.object({
    // Only the OpenRouter SDK is supported at the moment.
    sdk: z.enum(["openrouter"]).default("openrouter"),
    model_name: z.string().default(""),
    max_completion_tokens: z.number().min(1).default(20000),
    // OpenRouter accepts temperatures in [0, 2]; the previous cap of 1 rejected valid configs.
    temperature: z.number().min(0).max(2).default(0.9),
    top_p: z.number().min(0).max(1).default(1),
    frequency_penalty: z.number().min(-2).max(2).default(0),
    presence_penalty: z.number().min(-2).max(2).default(0),
    reasoning_effort: z.enum(["low", "medium", "high"]).default("high"),
});
|
|
13
|
+
|
|
14
|
+
/**
 * The `[llm]` section of the config.
 *
 *  - `models`: named model entries; workflows refer to them by key.
 *  - `prompt_replacement`: maps a variable name to the text substituted for
 *    `{{name}}` in prompts. Optional; a config without this table now
 *    validates and yields an empty map instead of failing.
 */
export const LLMConfigSchema = z.object({
    models: z.record(z.string(), ModelConfigSchema),
    // Factory default so each parse gets its own object rather than a shared one.
    prompt_replacement: z.record(z.string(), z.string()).default(() => ({})),
});
|
|
18
|
+
|
|
3
19
|
export const FileSearchEntrySchema = z.object({
|
|
4
20
|
file_glob: z.string().min(1),
|
|
5
21
|
search_directory: z.string().default("."),
|
|
@@ -8,6 +24,7 @@ export const FileSearchEntrySchema = z.object({
|
|
|
8
24
|
|
|
9
25
|
export const BaseWorkflowEntrySchema = z.object({
|
|
10
26
|
slug: z.string(),
|
|
27
|
+
model: z.string().default("general_analysis"),
|
|
11
28
|
runs: z.number().min(1).default(1),
|
|
12
29
|
input_files_searches: z.array(FileSearchEntrySchema).default([]),
|
|
13
30
|
output_filename: z.string().min(1),
|
|
@@ -52,17 +69,11 @@ export const TestingWorkflowEntrySchema = BaseWorkflowEntrySchema.extend({
|
|
|
52
69
|
});
|
|
53
70
|
|
|
54
71
|
export const ConfigSchema = z.object({
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
max_completion_tokens: z.number().min(1).default(20000),
|
|
61
|
-
temperature: z.number().min(0).max(1).default(0.9),
|
|
62
|
-
top_p: z.number().min(0).max(1).default(1),
|
|
63
|
-
frequency_penalty: z.number().min(-2).max(2).default(0),
|
|
64
|
-
presence_penalty: z.number().min(-2).max(2).default(0),
|
|
65
|
-
reasoning_effort: z.enum(["low", "medium", "high"]).default("high"),
|
|
72
|
+
llm: LLMConfigSchema,
|
|
73
|
+
vendors: z.object({
|
|
74
|
+
openrouter: z.object({
|
|
75
|
+
api_key: z.string(),
|
|
76
|
+
}),
|
|
66
77
|
}),
|
|
67
78
|
analysis_workflows: z.array(AnalysisWorkflowEntrySchema),
|
|
68
79
|
testing_workflows: z.array(TestingWorkflowEntrySchema),
|
package/src/util/config.ts
CHANGED
|
@@ -3,6 +3,7 @@ import { readFileSync, existsSync } from "node:fs";
|
|
|
3
3
|
|
|
4
4
|
import {z} from "zod";
|
|
5
5
|
|
|
6
|
+
import {ARGS} from "./args.ts";
|
|
6
7
|
import {ConfigSchema} from "./config-schema.ts";
|
|
7
8
|
|
|
8
9
|
|
|
@@ -16,8 +17,8 @@ async function readConfig() {
|
|
|
16
17
|
console.log(`Loading config`);
|
|
17
18
|
|
|
18
19
|
let configFilePath: string;
|
|
19
|
-
if (
|
|
20
|
-
configFilePath =
|
|
20
|
+
if (ARGS.values.config && ARGS.values.config.trim().length > 0) {
|
|
21
|
+
configFilePath = ARGS.values.config.trim();
|
|
21
22
|
console.log(`Found config from command line argument: ${configFilePath}`);
|
|
22
23
|
}
|
|
23
24
|
else if (process.env[configURLEnvVar]) {
|
|
@@ -47,14 +48,18 @@ async function readConfig() {
|
|
|
47
48
|
}
|
|
48
49
|
configFileContents = await configResponse.text();
|
|
49
50
|
} else {
|
|
50
|
-
console.log(`Loading config from file: ${configFilePath}`);
|
|
51
|
+
console.log(`Loading config from local file: ${configFilePath}`);
|
|
51
52
|
configFileContents = readFileSync(configFilePath).toString();
|
|
52
53
|
}
|
|
53
|
-
|
|
54
|
+
|
|
55
|
+
console.assert(configFileContents.trim().length > 0, "Config file is empty");
|
|
56
|
+
|
|
54
57
|
let obj = Bun.TOML.parse(configFileContents);
|
|
55
58
|
const parsedConfig = ConfigSchema.safeParse(obj);
|
|
56
59
|
if (!parsedConfig.success) {
|
|
57
60
|
console.error("Config file is invalid:", parsedConfig.error.format());
|
|
61
|
+
console.log(`Config file contents:\n${configFileContents}`);
|
|
62
|
+
console.log(parsedConfig);
|
|
58
63
|
throw new Error("Config file is invalid");
|
|
59
64
|
}
|
|
60
65
|
console.log(`Config loaded from ${configFilePath}`);
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import { appendFile } from "node:fs/promises";
|
|
2
|
+
|
|
3
|
+
import {ARGS} from "./args.ts";
|
|
4
|
+
|
|
5
|
+
/**
 * Appends the messages of one completion request, as a single JSON line, to
 * the file named by the `--completion_inputs_destination` argument.
 *
 * Does nothing when the argument is not set. When the destination file does
 * not already exist, a warning is printed and nothing is written.
 *
 * @param inputs The chat messages (role + content) about to be sent to the model.
 */
export async function recordCompletionInput(inputs: {role: string, content: any}[]) {
    const destination = ARGS.values.completion_inputs_destination;
    if (!destination) {
        return;
    }

    const destinationExists = await Bun.file(destination).exists();
    if (!destinationExists) {
        console.warn(`Completion inputs destination file ${destination} does not exist`);
        return;
    }

    // Serialize the messages themselves; the previous code wrote the destination path instead.
    await appendFile(destination, JSON.stringify(inputs) + "\n");
    console.log(`Recorded completion inputs to ${destination}`);
}
|
package/src/util/file-payload.ts
CHANGED
|
File without changes
|
package/src/util/llm.ts
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import type {SystemMessage, UserMessage} from "@openrouter/sdk/models";
|
|
2
|
+
|
|
3
|
+
import {CONFIG} from "./config.ts";
|
|
4
|
+
import type {WorkflowDependencies} from "../workflow";
|
|
5
|
+
import {recordCompletionInput} from "./eval-harness.ts";
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
/**
 * Replaces every occurrence of `placeholder` in `text` with `value`.
 * Returns the new text and the number of occurrences replaced.
 */
function substitutePlaceholder(text: string, placeholder: string, value: string): {text: string, count: number} {
    // split/join inserts `value` literally; a string passed to replaceAll would
    // have `$&`, `$$`, ... expanded as replacement patterns.
    const pieces = text.split(placeholder);
    return {text: pieces.join(value), count: pieces.length - 1};
}

/**
 * Sends one system + user chat completion request through OpenRouter using the
 * settings of a named model entry from `CONFIG.llm.models`.
 *
 * Before sending, every `{{name}}` placeholder listed in
 * `CONFIG.llm.prompt_replacement` is substituted in the system prompt and in
 * the textual parts of `content`. The caller's `content` array is not modified.
 * The final messages are also handed to `recordCompletionInput` shortly after,
 * without blocking the request.
 *
 * @param deps Shared workflow dependencies; `openRouter` and `seed` are used.
 * @param log Logger for progress messages.
 * @param warn Logger for warnings.
 * @param model Key into `CONFIG.llm.models`.
 * @param systemPrompt System prompt; may contain `{{name}}` placeholders.
 * @param content User message content: a string or an array of content parts.
 * @returns The first choice's text (empty string when absent) and the model id reported by the API.
 * @throws Error when `model` has no entry in `CONFIG.llm.models`.
 */
export async function generateCompletion(deps: WorkflowDependencies,
                                         log: (..._: any[])=>void,
                                         warn: (..._: any[])=>void,
                                         model: string,
                                         systemPrompt: string,
                                         content: UserMessage["content"]) {
    const modelSettings = CONFIG.llm.models[model];
    if (!modelSettings) {
        throw new Error(`No model settings found for model "${model}"`);
    }

    let replacedCount = 0;
    const replacements = Object.entries(CONFIG.llm.prompt_replacement);
    // Applies every configured replacement to one piece of text, counting real
    // `{{key}}` occurrences (not mere mentions of the bare key).
    const applyReplacements = (text: string): string => {
        let current = text;
        for (const [replacementKey, replacementValue] of replacements) {
            const result = substitutePlaceholder(current, `{{${replacementKey}}}`, replacementValue);
            replacedCount += result.count;
            current = result.text;
        }
        return current;
    };

    systemPrompt = applyReplacements(systemPrompt);
    if (typeof content === "string") {
        content = applyReplacements(content);
    }
    else {
        // Build a new array so the caller's content parts are left untouched.
        content = content.map((element) => {
            if (element && "type" in element && element.type === "text" && typeof element.text === "string") {
                return {
                    ...element,
                    text: applyReplacements(element.text),
                };
            }
            return element;
        });
    }
    log(`Replaced ${replacedCount} instances of prompt variables in system prompt and content`);

    const messages: (SystemMessage | UserMessage)[] = [
        {
            role: "system",
            content: systemPrompt,
        },
        {
            role: "user",
            content: content,
        }
    ];
    // Best-effort recording off the request path; a failure is reported via
    // `warn` instead of surfacing as an unhandled rejection.
    setTimeout(() => {
        recordCompletionInput(messages).catch((err: unknown) => {
            warn("Failed to record completion inputs:", err);
        });
    }, 5);

    log("Sending chat completion request...");
    const startTime = Date.now();
    // NOTE(review): `top_p` is part of the model settings but is not forwarded here — confirm whether that is intended.
    const completion = await deps.openRouter.chat.send({
        model: modelSettings.model_name,
        maxCompletionTokens: modelSettings.max_completion_tokens,
        messages: messages,
        stream: false,
        seed: deps.seed,
        frequencyPenalty: modelSettings.frequency_penalty,
        presencePenalty: modelSettings.presence_penalty,
        temperature: modelSettings.temperature,
        reasoning: {
            effort: modelSettings.reasoning_effort,
        },
    });
    log(`Completion response generated in ${(Date.now() - startTime) / 1000} seconds`);
    if (completion.choices.length < 1){
        warn("No choices returned from completion");
        console.log(completion);
    }

    return {
        text: completion.choices[0]?.message.content?.toString() ?? "",
        model: completion.model,
    };
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/** One generated output held in memory: its format tag and its full text. */
type FileRecord = {
    type: "markdown" | "text";
    content: string;
}

/**
 * Collects generated output files in memory and serves them over a local
 * HTTP server so they can be opened in a browser.
 */
export class OutputViewer {
    /** Registered files, keyed by filename. */
    filesRecords: Record<string, FileRecord> = {};

    /** Registers (or overwrites) the record stored under `filename`. */
    addFile(filename: string, _: FileRecord): void {
        this.filesRecords[filename] = _;
    }

    /**
     * Starts a Bun HTTP server on an OS-assigned port (port 0) and prints its URL.
     * `GET /<filename>` returns the stored content; an unknown name, or any
     * other path, yields a 404.
     */
    display(): void {
        const server = Bun.serve({
            port: 0,
            routes: {
                "/:slug": (req) => {
                    const slug = req.params.slug;
                    console.log(`Request for slug: "${slug}"`);
                    // Own-property check so names like "constructor" do not resolve via the prototype chain.
                    const record = Object.prototype.hasOwnProperty.call(this.filesRecords, slug)
                        ? this.filesRecords[slug]
                        : undefined;
                    if (!record) {
                        // Previously this answered 200 with a "Not Found" body.
                        return new Response("Not Found", { status: 404 });
                    }
                    return new Response(record.content);
                }
            },
            fetch() {
                return new Response("Not Found (fallback)", { status: 404 });
            },
        });
        console.log(server.url);
    }
}
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
import {Glob} from "bun";
|
|
2
2
|
|
|
3
|
+
import chalk from "chalk";
|
|
4
|
+
|
|
3
5
|
import {CONFIG} from "../util/config.ts";
|
|
4
6
|
import {FilePayloadGenerator} from "../util/file-payload.ts";
|
|
5
7
|
import type {WorkflowDependencies} from "./index.ts";
|
|
6
|
-
import
|
|
8
|
+
import {generateCompletion} from "../util/llm.ts";
|
|
7
9
|
|
|
8
10
|
|
|
9
11
|
export async function executeAnalysisWorkflow(workflow: typeof CONFIG.analysis_workflows[number], runNum: number, deps: WorkflowDependencies) {
|
|
@@ -40,49 +42,18 @@ export async function executeAnalysisWorkflow(workflow: typeof CONFIG.analysis_w
|
|
|
40
42
|
}
|
|
41
43
|
log(`Found ${allFiles.length} files for workflow`);
|
|
42
44
|
const fileContentsPayload = await FilePayloadGenerator.generatePayloads(allFiles);
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
messages: [
|
|
51
|
-
{
|
|
52
|
-
role: "system",
|
|
53
|
-
content: workflow.prompt,
|
|
54
|
-
},
|
|
55
|
-
{
|
|
56
|
-
role: "user",
|
|
57
|
-
content: fileContentsPayload.map((file) => {
|
|
58
|
-
return {
|
|
59
|
-
type: "text",
|
|
60
|
-
text: file,
|
|
61
|
-
}
|
|
62
|
-
}),
|
|
63
|
-
}
|
|
64
|
-
],
|
|
65
|
-
stream: false,
|
|
66
|
-
seed: seed,
|
|
67
|
-
frequencyPenalty: CONFIG.hyperparameters.frequency_penalty,
|
|
68
|
-
presencePenalty: CONFIG.hyperparameters.presence_penalty,
|
|
69
|
-
temperature: CONFIG.hyperparameters.temperature,
|
|
70
|
-
reasoning: {
|
|
71
|
-
effort: CONFIG.hyperparameters.reasoning_effort,
|
|
72
|
-
},
|
|
73
|
-
});
|
|
74
|
-
log(`Completion response generated in ${(Date.now() - startTime) / 1000} seconds`);
|
|
75
|
-
if (completion.choices.length < 1){
|
|
76
|
-
warn("No choices returned from completion");
|
|
77
|
-
console.log(completion);
|
|
78
|
-
}
|
|
79
|
-
const completionText = completion.choices[0]?.message.content?.toString() ?? "";
|
|
80
|
-
// TODO: Add more template variables
|
|
45
|
+
const completion = await generateCompletion(deps, log, warn, workflow.model, workflow.prompt, fileContentsPayload.map((file) => {
|
|
46
|
+
return {
|
|
47
|
+
type: "text",
|
|
48
|
+
text: file,
|
|
49
|
+
}
|
|
50
|
+
}));
|
|
51
|
+
|
|
81
52
|
const outputFileName = workflow.output_filename
|
|
82
|
-
.replaceAll("[seed]", seed.toString())
|
|
53
|
+
.replaceAll("[seed]", deps.seed.toString())
|
|
83
54
|
.replaceAll("[slug]", workflow.slug)
|
|
84
55
|
.replaceAll("[model]", `(${completion.model.replaceAll("/", "--")})`)
|
|
85
56
|
.replaceAll("[run]", runNum.toString());
|
|
86
|
-
await Bun.write(outputFileName,
|
|
57
|
+
await Bun.write(outputFileName, completion.text);
|
|
87
58
|
log(`Completion written to ${outputFileName}`);
|
|
88
59
|
}
|
package/src/workflow/index.ts
CHANGED
|
@@ -4,6 +4,7 @@ import {CONFIG} from "../util/config.ts";
|
|
|
4
4
|
import chalk from "chalk";
|
|
5
5
|
import {LLMJudgeInputModeEnum} from "../util/config-schema.ts";
|
|
6
6
|
import type {WorkflowDependencies} from "./index.ts";
|
|
7
|
+
import {generateCompletion} from "../util/llm.ts";
|
|
7
8
|
|
|
8
9
|
export async function executeTestingWorkflow(workflow: typeof CONFIG.testing_workflows[number], runNum: number, deps: WorkflowDependencies) {
|
|
9
10
|
console.log(`Executing testing workflow: ${workflow.slug}`);
|
|
@@ -71,37 +72,11 @@ export async function executeTestingWorkflow(workflow: typeof CONFIG.testing_wor
|
|
|
71
72
|
switch (testCase.single_run_expected_output.llm_judge_input_mode) {
|
|
72
73
|
case LLMJudgeInputModeEnum.Full:
|
|
73
74
|
log("Evaluating full output with LLM judge...");
|
|
74
|
-
const
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
{
|
|
80
|
-
role: "system",
|
|
81
|
-
content: testCase.single_run_expected_output.llm_judge_prompt,
|
|
82
|
-
},
|
|
83
|
-
{
|
|
84
|
-
role: "user",
|
|
85
|
-
content: JSON.stringify({
|
|
86
|
-
"expected_output_substring": testCase.single_run_expected_output.substring,
|
|
87
|
-
"actual_output": commandOutput,
|
|
88
|
-
}),
|
|
89
|
-
}
|
|
90
|
-
],
|
|
91
|
-
stream: false,
|
|
92
|
-
seed: seed,
|
|
93
|
-
frequencyPenalty: CONFIG.hyperparameters.frequency_penalty,
|
|
94
|
-
presencePenalty: CONFIG.hyperparameters.presence_penalty,
|
|
95
|
-
temperature: 0,
|
|
96
|
-
reasoning: {
|
|
97
|
-
effort: CONFIG.hyperparameters.reasoning_effort,
|
|
98
|
-
},
|
|
99
|
-
});
|
|
100
|
-
if (completion.choices.length < 1){
|
|
101
|
-
warn("No choices returned from completion");
|
|
102
|
-
console.log(completion);
|
|
103
|
-
}
|
|
104
|
-
const completionText = completion.choices[0]?.message.content?.toString() ?? "";
|
|
75
|
+
const completion = await generateCompletion(deps, log, warn, workflow.model, testCase.single_run_expected_output.llm_judge_prompt, JSON.stringify({
|
|
76
|
+
"expected_output_substring": testCase.single_run_expected_output.substring,
|
|
77
|
+
"actual_output": commandOutput,
|
|
78
|
+
}));
|
|
79
|
+
const completionText = completion.text;
|
|
105
80
|
log(`LLM judge completion:\n${completionText}`);
|
|
106
81
|
const llmJudgeResult = completionText.toLowerCase().includes("pass"); // TODO: More robust parsing
|
|
107
82
|
if (llmJudgeResult) {
|
package/tsconfig.json
CHANGED
|
File without changes
|