even-pf 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -28,11 +28,23 @@ Make sure you have a config file in your home or current directory. Alternativel
28
28
 
29
29
  This project was created using `bun init` in bun v1.3.2. [Bun](https://bun.com) is a fast all-in-one JavaScript runtime.
30
30
 
31
+ ### Publishing
31
32
  After making changes, you might want to bump the version.
32
33
  ```bash
33
34
  bun run bump <semver>
34
35
  ```
35
36
 
37
+ Build the executable:
38
+ ```bash
39
+ bun run build:all
40
+ ```
41
+
42
+ Then publish to npm:
43
+ ```bash
44
+ bun run publish:all
45
+ ```
46
+ Do not run `bun publish` directly: the executables are distributed as separate packages and need to be published separately.
47
+
36
48
  ## Specs
37
49
  ### File-viewer Frontend
38
50
  Because the tool might be running on a remote server, we will use a simple file-viewer frontend to make the Markdown files easy to view.
package/bun.lock CHANGED
@@ -14,11 +14,11 @@
14
14
  "@types/bun": "latest",
15
15
  },
16
16
  "optionalDependencies": {
17
- "even-pf-darwin-arm64": "0.3.4",
18
- "even-pf-darwin-x64": "0.3.4",
19
- "even-pf-linux-arm64": "0.3.4",
20
- "even-pf-linux-x64": "0.3.4",
21
- "even-pf-windows-x64": "0.3.4",
17
+ "even-pf-darwin-arm64": "0.4.2",
18
+ "even-pf-darwin-x64": "0.4.2",
19
+ "even-pf-linux-arm64": "0.4.2",
20
+ "even-pf-linux-x64": "0.4.2",
21
+ "even-pf-windows-x64": "0.4.2",
22
22
  },
23
23
  "peerDependencies": {
24
24
  "typescript": "^5.9.3",
@@ -36,16 +36,6 @@
36
36
 
37
37
  "chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="],
38
38
 
39
- "even-pf-darwin-arm64": ["even-pf-darwin-arm64@0.3.4", "", { "os": "darwin", "cpu": "arm64", "bin": { "even-pf": "bin/even-pf" } }, "sha512-x2vTM0ogvlFhUiHqb13kXJTKPRPU/VdoZa1G51c3IHsZz7wdDpkD/DxcEvxAmO28MbJtfjxig8nRFMvld5J6jg=="],
40
-
41
- "even-pf-darwin-x64": ["even-pf-darwin-x64@0.3.4", "", { "os": "darwin", "cpu": "x64", "bin": { "even-pf": "bin/even-pf" } }, "sha512-L2hzBvSLFcWMB/MJQeZTQHI8mqpGMQ7T0tSPXjv4S1tFglF8ZtdxggDAhmItEyyqVfsAT6LY+HyOpJnUAga9tg=="],
42
-
43
- "even-pf-linux-arm64": ["even-pf-linux-arm64@0.3.4", "", { "os": "linux", "cpu": "arm64", "bin": { "even-pf": "bin/even-pf" } }, "sha512-/5nLtKs+8xvTHEkrVPQQ5XQBTKROmF42z6+fo4AOkOj/TbDGwCher6RYYMHQ6pD7M0jjF5AdSlj5HLEGf/N9Qg=="],
44
-
45
- "even-pf-linux-x64": ["even-pf-linux-x64@0.3.4", "", { "os": "linux", "cpu": "x64", "bin": { "even-pf": "bin/even-pf" } }, "sha512-UN0wz2svjcjckugzFyc4tHxllrTM7IScSmnLDq5z9AB5cplHZrvAg8cYcvz20YEcHsr7aUkxrhA7iDv5KKYhkA=="],
46
-
47
- "even-pf-windows-x64": ["even-pf-windows-x64@0.3.4", "", { "os": "win32", "cpu": "x64", "bin": { "even-pf": "bin/even-pf.exe" } }, "sha512-ni84uLUdo95TlACDUyz7Ia7+4wigSByvUuR+IrXbLzkN90mZTsJoZVbAoJMR8CnOlPPEClcPHqkTcYl1lbLOwA=="],
48
-
49
39
  "smol-toml": ["smol-toml@1.6.0", "", {}, "sha512-4zemZi0HvTnYwLfrpk/CF9LOd9Lt87kAt50GnqhMpyF9U3poDAP2+iukq2bZsO/ufegbYehBkqINbsWxj4l4cw=="],
50
40
 
51
41
  "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
package/epf.example.toml CHANGED
@@ -12,6 +12,8 @@ top_p = 1
12
12
  frequency_penalty = 0
13
13
  presence_penalty = 0
14
14
  reasoning_effort = "high"
15
+ max_retries = 1
16
+ retry_delay_ms = 1000
15
17
 
16
18
  [llm.models.output_comparison]
17
19
  sdk = "openrouter"
@@ -22,6 +24,8 @@ top_p = 1
22
24
  frequency_penalty = 0
23
25
  presence_penalty = 0
24
26
  reasoning_effort = "high"
27
+ max_retries = 1
28
+ retry_delay_ms = 1000
25
29
 
26
30
  [llm.prompt_replacement]
27
31
  role = "role_placeholder"
package/package.json CHANGED
@@ -1,13 +1,13 @@
1
1
  {
2
2
  "name": "even-pf",
3
- "version": "0.4.0",
3
+ "version": "0.4.2",
4
4
  "description": "AI-assisted responsible grading tool for programming assignments",
5
5
  "module": "src/cli.ts",
6
6
  "type": "module",
7
7
  "license": "UNLICENSED",
8
8
  "scripts": {
9
9
  "start": "bun run src/cli.ts",
10
- "build-dev": "bun build src/cli.ts --compile --outfile build/epf",
10
+ "build:dev": "bun build src/cli.ts --compile --outfile build/epf",
11
11
  "build:all": "bun scripts/build-all.ts",
12
12
  "publish:all": "bun scripts/publish-all.ts",
13
13
  "publish:dry": "bun scripts/publish-all.ts --dry-run",
@@ -31,11 +31,11 @@
31
31
  "zod-defaults": "^0.2.3"
32
32
  },
33
33
  "optionalDependencies": {
34
- "even-pf-linux-x64": "0.4.0",
35
- "even-pf-linux-arm64": "0.4.0",
36
- "even-pf-windows-x64": "0.4.0",
37
- "even-pf-darwin-x64": "0.4.0",
38
- "even-pf-darwin-arm64": "0.4.0"
34
+ "even-pf-linux-x64": "0.4.2",
35
+ "even-pf-linux-arm64": "0.4.2",
36
+ "even-pf-windows-x64": "0.4.2",
37
+ "even-pf-darwin-x64": "0.4.2",
38
+ "even-pf-darwin-arm64": "0.4.2"
39
39
  },
40
40
  "files": [
41
41
  "bin/even-pf.js",
package/src/cli.ts CHANGED
@@ -4,6 +4,7 @@ import "./version.ts";
4
4
 
5
5
  import {OpenRouter} from "@openrouter/sdk";
6
6
 
7
+ import {ARGS} from "./util/args.ts";
7
8
  import {CONFIG} from "./util/config.ts";
8
9
  import {executeTestingWorkflow} from "./workflow/testing-workflow.ts";
9
10
  import {executeAnalysisWorkflow} from "./workflow/analysis-workflow.ts";
@@ -20,36 +21,65 @@ const workflowDependencies: WorkflowDependencies = {
20
21
  }
21
22
 
22
23
  // Parallelize workflows with Promise.allSettled
23
- const analysisWorkflows = CONFIG.analysis_workflows;
24
- const testingWorkflows = CONFIG.testing_workflows;
25
- console.log(`Starting execution of ${analysisWorkflows.length} workflows...`);
26
- console.log(analysisWorkflows.map((w) => w.slug));
27
- let workflowRuns: Promise<void>[] = [];
24
+ const onlySlugs: string[] | undefined = ARGS.values.only_workflows;
25
+ const skipSlugs: string[] | undefined = ARGS.values.skip_workflow;
26
+
27
+ function applyWorkflowFilters<T extends { slug: string }>(workflows: T[]): T[] {
28
+ let filtered = workflows;
29
+ if (onlySlugs && onlySlugs.length > 0) {
30
+ filtered = filtered.filter((w) => {
31
+ if (onlySlugs.includes(w.slug)) {
32
+ return true;
33
+ }
34
+ console.log(`Skipping workflow '${w.slug}' (not in --only_workflows list)`);
35
+ return false;
36
+ });
37
+ }
38
+ if (skipSlugs && skipSlugs.length > 0) {
39
+ filtered = filtered.filter((w) => {
40
+ if (skipSlugs.includes(w.slug)) {
41
+ console.log(`Skipping workflow '${w.slug}' (matched --skip_workflow)`);
42
+ return false;
43
+ }
44
+ return true;
45
+ });
46
+ }
47
+ return filtered;
48
+ }
49
+
50
+ const analysisWorkflows = applyWorkflowFilters(CONFIG.analysis_workflows);
51
+ const testingWorkflows = applyWorkflowFilters(CONFIG.testing_workflows);
52
+ console.log(`Starting execution of ${analysisWorkflows.length} analysis + ${testingWorkflows.length} testing workflows...`);
53
+ console.log([...analysisWorkflows, ...testingWorkflows].map((w) => w.slug));
54
+ const workflowRuns: Promise<void>[] = [];
55
+ const workflowRunSlugs: string[] = [];
28
56
  analysisWorkflows.forEach((workflow) => {
29
57
  for (let i = 0; i < workflow.runs; i++) {
30
58
  workflowRuns.push(executeAnalysisWorkflow(workflow, i+1, workflowDependencies));
59
+ workflowRunSlugs.push(workflow.slug);
31
60
  }
32
61
  });
33
62
  testingWorkflows.forEach((workflow) => {
34
63
  for (let i = 0; i < workflow.runs; i++) {
35
64
  workflowRuns.push(executeTestingWorkflow(workflow, i+1, workflowDependencies));
65
+ workflowRunSlugs.push(workflow.slug);
36
66
  }
37
67
  });
38
- workflowDependencies.outputViewer.display(); // For start the server early.
68
+ workflowDependencies.outputViewer.display(); // Start the server early.
39
69
  const workflowsResults = await Promise.allSettled(workflowRuns);
40
70
  // Summarize with indices to include slugs in failure logs
41
71
  const failedIndices: number[] = [];
42
72
  const succeededIndices: number[] = [];
43
73
  workflowsResults.forEach((r, i) => {
44
- if (r.status === "rejected") failedIndices.push(i);
45
- else succeededIndices.push(i);
74
+ if (r.status === "rejected") { failedIndices.push(i); }
75
+ else { succeededIndices.push(i); }
46
76
  });
47
77
 
48
78
  console.log(`Workflows completed. Succeeded: ${succeededIndices.length}; Failed: ${failedIndices.length}`);
49
79
  if (failedIndices.length > 0) {
50
80
  failedIndices.forEach((i) => {
51
81
  const r = workflowsResults[i] as PromiseRejectedResult;
52
- const slug = analysisWorkflows[i]?.slug ?? `#${i + 1}`;
82
+ const slug = workflowRunSlugs[i] ?? `#${i + 1}`;
53
83
  console.warn(`Workflow '${slug}' failed:`, r.reason);
54
84
  });
55
85
  }
package/src/util/args.ts CHANGED
@@ -1,4 +1,4 @@
1
- import {parseArgs} from "util";
1
+ import { parseArgs } from "util";
2
2
 
3
3
  // console.log(Bun.argv);
4
4
  export const ARGS = parseArgs({
@@ -23,6 +23,11 @@ export const ARGS = parseArgs({
23
23
  short: "S",
24
24
  multiple: true,
25
25
  },
26
+ only_workflows: {
27
+ type: "string",
28
+ short: "O",
29
+ multiple: true,
30
+ },
26
31
  completion_inputs_destination: {
27
32
  type: "string",
28
33
  },
@@ -1,4 +1,4 @@
1
- import {z} from "zod";
1
+ import { z } from "zod";
2
2
 
3
3
  export enum OutputViewingModeEnum {
4
4
  Local = "local",
@@ -20,6 +20,8 @@ export const ModelConfigSchema = z.object({
20
20
  frequency_penalty: z.number().min(-2).max(2).default(0),
21
21
  presence_penalty: z.number().min(-2).max(2).default(0),
22
22
  reasoning_effort: z.enum(["low", "medium", "high"]).default("high"),
23
+ max_retries: z.number().min(0).default(1), // 0 for no retry
24
+ retry_delay_ms: z.number().min(0).default(1000),
23
25
  });
24
26
 
25
27
  export const LLMConfigSchema = z.object({
@@ -45,7 +47,7 @@ export const AnalysisWorkflowEntrySchema = BaseWorkflowEntrySchema.extend({
45
47
  prompt: z.string(),
46
48
  })
47
49
 
48
- export enum LLMJudgeInputModeEnum{
50
+ export enum LLMJudgeInputModeEnum {
49
51
  None = "NONE",
50
52
  Diff = "DIFF",
51
53
  Full = "FULL",
package/src/util/llm.ts CHANGED
@@ -5,6 +5,10 @@ import type {WorkflowDependencies} from "../workflow";
5
5
  import {recordCompletionInput} from "./eval-harness.ts";
6
6
 
7
7
 
8
+ async function delay(ms: number): Promise<void> {
9
+ return new Promise(resolve => setTimeout(resolve, ms));
10
+ }
11
+
8
12
  export async function generateCompletion(deps: WorkflowDependencies,
9
13
  log: (..._: any[])=>void,
10
14
  warn: (..._: any[])=>void,
@@ -15,31 +19,30 @@ export async function generateCompletion(deps: WorkflowDependencies,
15
19
  if (!modelSettings) {
16
20
  throw new Error(`No model settings found for model "${model}"`);
17
21
  }
18
-
22
+
19
23
  let replacedCount = 0;
20
24
  for (const [replacementKey, replacementValue] of Object.entries(CONFIG.llm.prompt_replacement)) {
21
- if (systemPrompt.includes(replacementKey)) {replacedCount++}
25
+ if (systemPrompt.includes(replacementKey)) {replacedCount++;}
22
26
  systemPrompt = systemPrompt.replaceAll(`{{${replacementKey}}}`, replacementValue);
23
27
  if (typeof content === "string") {
24
- if (content.includes(replacementKey)) {replacedCount++}
28
+ if (content.includes(replacementKey)) {replacedCount++;}
25
29
  content = content.replaceAll(`{{${replacementKey}}}`, replacementValue);
26
30
  }
27
31
  else {
28
32
  for (let i = 0; i < content.length; i++) {
29
33
  const element = content[i];
30
34
  if (element && "type" in element && element.type === "text" && typeof element.text === "string") {
31
- if (element.text.includes(replacementKey)) {replacedCount++}
35
+ if (element.text.includes(replacementKey)) {replacedCount++;}
32
36
  content[i] = {
33
37
  ...element,
34
38
  text: element.text.replaceAll(`{{${replacementKey}}}`, replacementValue),
35
- }
39
+ };
36
40
  }
37
41
  }
38
-
39
42
  }
40
43
  }
41
44
  log(`Replaced ${replacedCount} instances of prompt variables in system prompt and content`);
42
-
45
+
43
46
  let messages: (SystemMessage | UserMessage)[] = [
44
47
  {
45
48
  role: "system",
@@ -51,30 +54,64 @@ export async function generateCompletion(deps: WorkflowDependencies,
51
54
  }
52
55
  ];
53
56
  setTimeout(async ()=> await recordCompletionInput(messages), 5);
54
-
55
- log("Sending chat completion request...");
56
- let startTime = Date.now();
57
- let completion = await deps.openRouter.chat.send({
58
- model: modelSettings.model_name,
59
- maxCompletionTokens: modelSettings.max_completion_tokens,
60
- messages: messages,
61
- stream: false,
62
- seed: deps.seed,
63
- frequencyPenalty: modelSettings.frequency_penalty,
64
- presencePenalty: modelSettings.presence_penalty,
65
- temperature: modelSettings.temperature,
66
- reasoning: {
67
- effort: modelSettings.reasoning_effort,
68
- },
69
- });
70
- log(`Completion response generated in ${(Date.now() - startTime) / 1000} seconds`);
71
- if (completion.choices.length < 1){
72
- warn("No choices returned from completion");
73
- console.log(completion);
57
+
58
+ const maxRetries = modelSettings.max_retries;
59
+ const retryDelayMs = modelSettings.retry_delay_ms;
60
+ const totalAttempts = maxRetries + 1;
61
+
62
+ let lastError: unknown = null;
63
+
64
+ for (let attempt = 0; attempt < totalAttempts; attempt++) {
65
+ const attemptLabel = `${attempt + 1}/${totalAttempts}`;
66
+
67
+ if (attempt > 0) {
68
+ const backoffMs = retryDelayMs * (2 ** (attempt - 1)) + Math.random() * 200;
69
+ warn(`Retrying after ${Math.round(backoffMs)}ms (attempt ${attemptLabel})...`);
70
+ await delay(backoffMs);
71
+ }
72
+
73
+ log(`Sending chat completion request (attempt ${attemptLabel})...`);
74
+ let startTime = Date.now();
75
+
76
+ try {
77
+ let completion = await deps.openRouter.chat.send({
78
+ model: modelSettings.model_name,
79
+ maxCompletionTokens: modelSettings.max_completion_tokens,
80
+ messages: messages,
81
+ stream: false,
82
+ seed: deps.seed,
83
+ frequencyPenalty: modelSettings.frequency_penalty,
84
+ presencePenalty: modelSettings.presence_penalty,
85
+ temperature: modelSettings.temperature,
86
+ reasoning: {
87
+ effort: modelSettings.reasoning_effort,
88
+ },
89
+ });
90
+ log(`Completion response received in ${(Date.now() - startTime) / 1000}s (attempt ${attemptLabel})`);
91
+
92
+ const text = completion.choices[0]?.message.content?.toString() ?? "";
93
+
94
+ if (completion.choices.length < 1 || text.length === 0) {
95
+ warn(`Empty completion on attempt ${attemptLabel}`);
96
+ console.log(completion);
97
+ // Retry if attempts remain; otherwise return empty
98
+ if (attempt < maxRetries) {
99
+ continue;
100
+ }
101
+ warn("Exhausted all retries — returning empty completion");
102
+ return {text: "", model: completion.model};
103
+ }
104
+
105
+ return {text, model: completion.model};
106
+
107
+ } catch (error) {
108
+ const message = error instanceof Error ? error.message : String(error);
109
+ warn(`Chat completion error on attempt ${attemptLabel}: ${message}`);
110
+ lastError = error;
111
+ // Loop continues to next attempt (or exits if this was the last)
112
+ }
74
113
  }
75
-
76
- return {
77
- text: completion.choices[0]?.message.content?.toString() ?? "",
78
- model: completion.model,
79
- };
114
+
115
+ warn("Exhausted all retries due to errors — re-throwing last error");
116
+ throw lastError;
80
117
  }
@@ -6,6 +6,7 @@ import {OutputViewingModeEnum} from "./config-schema.ts";
6
6
  type FileRecord = {
7
7
  type: "markdown" | "text";
8
8
  content: string;
9
+ modification_time: Date
9
10
  }
10
11
 
11
12
  const CORS_HEADERS = {
@@ -25,16 +26,17 @@ function jsonResponse(data: unknown, status = 200): Response {
25
26
  }
26
27
 
27
28
  export class OutputViewer {
28
- filesRecords: Record<string, FileRecord> = {};
29
+ fileRecords: Record<string, FileRecord> = {};
29
30
  displayed: boolean = false;
30
31
 
31
- addFile(filename: string, _: FileRecord): void {
32
- this.filesRecords[filename] = _;
32
+ addFile(filename: string, fileRecord: Omit<FileRecord, "modification_time">): void {
33
+ this.fileRecords[filename] = {
34
+ ...fileRecord,
35
+ modification_time: new Date(),
36
+ };
33
37
  }
34
38
 
35
39
  serve(): string {
36
- let files = Object.entries(this.filesRecords).sort((a, b) => a[0].localeCompare(b[0]));
37
-
38
40
  let server = Bun.serve({
39
41
  port: CONFIG.output_viewing.api_port,
40
42
  routes: {
@@ -42,10 +44,12 @@ export class OutputViewer {
42
44
  if (req.method === "OPTIONS") {
43
45
  return new Response(null, { status: 204, headers: CORS_HEADERS });
44
46
  }
47
+ let files = Object.entries(this.fileRecords).sort((a, b) => a[0].localeCompare(b[0]));
45
48
  return jsonResponse({
46
49
  files: files.map(([filename, fileRecord]) => ({
47
50
  name: filename,
48
51
  type: fileRecord.type,
52
+ modification_time: fileRecord.modification_time,
49
53
  })),
50
54
  });
51
55
  },
@@ -54,7 +58,7 @@ export class OutputViewer {
54
58
  return new Response(null, { status: 204, headers: CORS_HEADERS });
55
59
  }
56
60
  let slug = req.params.slug;
57
- let record = this.filesRecords[slug];
61
+ let record = this.fileRecords[slug];
58
62
  if (!record) {
59
63
  return jsonResponse({ error: "Not Found" }, 404);
60
64
  }
@@ -72,7 +76,7 @@ export class OutputViewer {
72
76
  return jsonResponse({ error: "Not Found" }, 404);
73
77
  },
74
78
  });
75
- console.log(server.url);
79
+ console.log(server.url.toString());
76
80
  return server.url.toString();
77
81
  }
78
82
 
@@ -80,14 +84,14 @@ export class OutputViewer {
80
84
  let frontendURL = "";
81
85
  switch (CONFIG.output_viewing.mode) {
82
86
  case OutputViewingModeEnum.Local:
83
- if (Object.keys(this.filesRecords).length === 0) {
87
+ if (Object.keys(this.fileRecords).length === 0) {
84
88
  console.warn("No files to display (you can probably ignore this warning if your workflows haven't completed yet)");
85
89
  return;
86
90
  }
87
91
 
88
92
  console.log("Click the following links to view the outputs in your browser:");
89
93
 
90
- let files = Object.entries(this.filesRecords).sort((a, b) => a[0].localeCompare(b[0]));
94
+ let files = Object.entries(this.fileRecords).sort((a, b) => a[0].localeCompare(b[0]));
91
95
  for (const [filename, fileRecord] of files) {
92
96
  let params = new URLSearchParams();
93
97
  params.set("name", filename);