@syntheticlab/synbad 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +60 -23
- package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +2 -2
- package/dist/evals/reasoning/multiturn-reasoning-parsing.js +2 -2
- package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +2 -2
- package/dist/evals/reasoning/reasoning-claude-tool-call.js +1 -2
- package/dist/evals/reasoning/reasoning-parsing.d.ts +2 -2
- package/dist/evals/reasoning/reasoning-parsing.js +2 -2
- package/dist/evals/reasoning/response-in-reasoning.d.ts +45 -0
- package/dist/evals/reasoning/response-in-reasoning.js +59 -0
- package/dist/evals/tools/claude-dash.d.ts +2 -2
- package/dist/evals/tools/claude-dash.js +1 -2
- package/dist/evals/tools/crush-list-files.d.ts +2 -5
- package/dist/evals/tools/crush-list-files.js +6 -8
- package/dist/evals/tools/multi-turn-tools.d.ts +46 -0
- package/dist/evals/tools/multi-turn-tools.js +100 -0
- package/dist/evals/tools/no-fn-args.d.ts +22 -0
- package/dist/evals/tools/no-fn-args.js +31 -0
- package/dist/evals/tools/octo-list-no-optional-args.d.ts +209 -0
- package/dist/evals/tools/octo-list-no-optional-args.js +73 -0
- package/dist/evals/tools/parallel-tool.d.ts +2 -2
- package/dist/evals/tools/parallel-tool.js +1 -2
- package/dist/evals/tools/simple-tool.d.ts +2 -2
- package/dist/evals/tools/simple-tool.js +3 -2
- package/dist/evals/tools/tool-dash-underscore.d.ts +26 -0
- package/dist/evals/tools/tool-dash-underscore.js +37 -0
- package/dist/evals/tools/tool-path-corruption.d.ts +26 -0
- package/dist/evals/tools/tool-path-corruption.js +41 -0
- package/dist/source/asserts.d.ts +4 -1
- package/dist/source/asserts.js +36 -0
- package/dist/source/chat-completion.d.ts +5 -0
- package/dist/source/chat-completion.js +1 -0
- package/dist/source/evals.d.ts +9 -0
- package/dist/source/evals.js +53 -0
- package/dist/source/evals.test.d.ts +1 -0
- package/dist/source/evals.test.js +12 -0
- package/dist/source/exports.d.ts +2 -0
- package/dist/source/exports.js +1 -0
- package/dist/source/index.js +103 -43
- package/evals/reasoning/multiturn-reasoning-parsing.ts +3 -3
- package/evals/reasoning/reasoning-claude-tool-call.ts +2 -3
- package/evals/reasoning/reasoning-parsing.ts +3 -3
- package/evals/reasoning/response-in-reasoning.ts +65 -0
- package/evals/tools/claude-dash.ts +2 -3
- package/evals/tools/crush-list-files.ts +11 -13
- package/evals/tools/multi-turn-tools.ts +104 -0
- package/evals/tools/no-fn-args.ts +34 -0
- package/evals/tools/octo-list-no-optional-args.ts +81 -0
- package/evals/tools/parallel-tool.ts +2 -3
- package/evals/tools/simple-tool.ts +4 -3
- package/evals/tools/tool-dash-underscore.ts +40 -0
- package/evals/tools/tool-path-corruption.ts +46 -0
- package/package.json +10 -3
- package/source/asserts.ts +37 -1
- package/source/chat-completion.ts +6 -0
- package/source/evals.test.ts +13 -0
- package/source/evals.ts +56 -0
- package/source/exports.ts +2 -0
- package/source/index.ts +121 -44
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
import * as assert from "../../source/asserts.ts";
|
|
3
|
+
|
|
4
|
+
/**
 * Checks the reply to the `get-weather__v1` request defined by this file's `json`.
 *
 * Passes only when the message carries exactly one `function` tool call whose
 * name is returned verbatim (dash and double underscore intact) and whose
 * JSON-encoded arguments contain a `location` mentioning Paris.
 *
 * @param message - Assistant message to validate; only `content` and `tool_calls` are read.
 * @throws When any expectation fails, or when the arguments are not valid JSON.
 */
export function test({ content, tool_calls }: ChatMessage) {
  assert.isNotNullish(tool_calls);
  assert.isNotEmptyArray(tool_calls);
  assert.strictEqual(tool_calls.length, 1);
  assert.strictEqual(tool_calls[0].type, "function");
  assert.strictEqual(tool_calls[0].function.name, "get-weather__v1");
  const args = JSON.parse(tool_calls[0].function.arguments);
  assert.match(args.location.toLowerCase(), /paris/);
  // Assert the tool call didn't leak into the content. The tool is named with a
  // dash, so match both the dashed and the underscored spelling; the previous
  // /get_weather/ pattern could never match the real name.
  assert.doesNotMatch(content || "", /get[-_]weather/);
}
|
|
15
|
+
|
|
16
|
+
/**
 * Request body for the dash/underscore tool-name eval: one user question and
 * a single `get-weather__v1` function tool, with tool choice left to the model.
 */
export const json = {
  messages: [
    { role: "user", content: "What's the weather in Paris?" },
  ],
  tools: [
    {
      type: "function",
      function: {
        name: "get-weather__v1",
        description: "Get current weather for a location",
        parameters: {
          type: "object",
          properties: {
            location: { type: "string", description: "City name" },
          },
          required: ["location"],
        },
      },
    },
  ],
  tool_choice: "auto",
};
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
import * as assert from "../../source/asserts.ts";
|
|
3
|
+
|
|
4
|
+
// Path the prompt asks the model to read; the tool call must echo it unchanged.
const PATH = "/development/evals/reasoning/Scratch/reasoning-claude-tool-call.ts";

/**
 * Checks that the reply is exactly one `read` function call whose `filePath`
 * argument still contains PATH character-for-character.
 *
 * @param message - Assistant message to validate; only `tool_calls` is read.
 * @throws When any expectation fails, or when the arguments are not valid JSON.
 */
export function test({ tool_calls }: ChatMessage) {
  assert.isNotNullish(tool_calls);
  assert.isNotEmptyArray(tool_calls);
  assert.strictEqual(tool_calls.length, 1);

  const [call] = tool_calls;
  assert.strictEqual(call.type, "function");
  assert.strictEqual(call.function.name, "read");

  const { filePath } = JSON.parse(call.function.arguments);
  assert.stringContains(filePath, PATH);
}
|
|
16
|
+
|
|
17
|
+
/**
 * Request body for the path-corruption eval: the user asks for a specific file
 * to be read, and a single strict `read` function tool is offered.
 */
export const json = {
  messages: [
    {
      role: "user",
      content: "Read and summarize the file /development/evals/reasoning/Scratch/reasoning-claude-tool-call.ts",
    },
  ],
  tools: [
    {
      type: "function",
      function: {
        name: "read",
        description: "The read tool",
        parameters: {
          type: "object",
          required: ["filePath"],
          properties: {
            filePath: { description: "Path to file to read", type: "string" },
          },
        },
        strict: true,
      },
    },
  ],
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@syntheticlab/synbad",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.6",
|
|
4
4
|
"description": "LLM inference provider evals",
|
|
5
5
|
"main": "dist/source/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
"preferGlobal": true,
|
|
10
10
|
"type": "module",
|
|
11
11
|
"scripts": {
|
|
12
|
-
"test": "
|
|
12
|
+
"test": "vitest",
|
|
13
13
|
"build": "tsc",
|
|
14
14
|
"prepublishOnly": "tsc"
|
|
15
15
|
},
|
|
@@ -23,6 +23,12 @@
|
|
|
23
23
|
"package-lock.json",
|
|
24
24
|
"tsconfig.json"
|
|
25
25
|
],
|
|
26
|
+
"exports": {
|
|
27
|
+
".": {
|
|
28
|
+
"types": "./dist/source/exports.d.ts",
|
|
29
|
+
"import": "./dist/source/exports.js"
|
|
30
|
+
}
|
|
31
|
+
},
|
|
26
32
|
"dependencies": {
|
|
27
33
|
"@commander-js/extra-typings": "^14.0.0",
|
|
28
34
|
"commander": "^14.0.2",
|
|
@@ -32,6 +38,7 @@
|
|
|
32
38
|
},
|
|
33
39
|
"devDependencies": {
|
|
34
40
|
"@types/node": "^24.10.1",
|
|
35
|
-
"tsx": "^4.20.6"
|
|
41
|
+
"tsx": "^4.20.6",
|
|
42
|
+
"vitest": "^4.0.17"
|
|
36
43
|
}
|
|
37
44
|
}
|
package/source/asserts.ts
CHANGED
|
@@ -61,7 +61,13 @@ export function isEmptyArray(a: any[]) {
|
|
|
61
61
|
});
|
|
62
62
|
}
|
|
63
63
|
|
|
64
|
-
export function isNotEmptyArray(a: any[]) {
|
|
64
|
+
export function isNotEmptyArray(a: any[] | undefined) {
|
|
65
|
+
if(a == null) {
|
|
66
|
+
throw new assert.AssertionError({
|
|
67
|
+
message: "Expected a non-empty array",
|
|
68
|
+
actual: a,
|
|
69
|
+
});
|
|
70
|
+
}
|
|
65
71
|
if(a.length !== 0) return true;
|
|
66
72
|
throw new assert.AssertionError({
|
|
67
73
|
message: "Expected a non-empty array",
|
|
@@ -76,3 +82,33 @@ export function startsWith(a: string, prefix: string) {
|
|
|
76
82
|
actual: a,
|
|
77
83
|
});
|
|
78
84
|
}
|
|
85
|
+
|
|
86
|
+
/**
 * Asserts that `num` is strictly greater than `target`.
 *
 * @param num - Value under test.
 * @param target - Exclusive lower bound.
 * @returns `true` when `num > target`.
 * @throws assert.AssertionError when the comparison is false (which includes
 *   either operand being NaN). The error carries `actual`, `expected` and
 *   `operator` so failure output shows the bound as well as the value.
 */
export function gt(num: number, target: number) {
  if(num > target) return true;
  throw new assert.AssertionError({
    message: `Expected ${num} > ${target}`,
    actual: num,
    expected: target,
    operator: ">",
  });
}
|
|
93
|
+
|
|
94
|
+
/**
 * Asserts that `num` is greater than or equal to `target`.
 *
 * @param num - Value under test.
 * @param target - Inclusive lower bound.
 * @returns `true` when `num >= target`.
 * @throws assert.AssertionError when the comparison is false (which includes
 *   either operand being NaN). The error carries `actual`, `expected` and
 *   `operator` so failure output shows the bound as well as the value.
 */
export function gte(num: number, target: number) {
  if(num >= target) return true;
  throw new assert.AssertionError({
    message: `Expected ${num} >= ${target}`,
    actual: num,
    expected: target,
    operator: ">=",
  });
}
|
|
101
|
+
|
|
102
|
+
/**
 * Asserts that `str` is a string containing `expected` as a substring.
 *
 * The runtime type check is deliberate: callers pass values taken from
 * `JSON.parse` output, which are untyped at runtime.
 *
 * @param str - Value under test.
 * @param expected - Substring that must occur in `str`.
 * @returns `true` when `str` includes `expected`.
 * @throws assert.AssertionError when `str` is not a string, or when it does
 *   not contain `expected`. Both errors carry `actual` and `expected`.
 */
export function stringContains(str: string, expected: string) {
  if(typeof str !== "string") {
    throw new assert.AssertionError({
      message: "Expected input to be of type string.",
      actual: typeof str,
      expected: "string",
    });
  }
  if(str.includes(expected)) return true;
  throw new assert.AssertionError({
    message: `Expected string to contain: "${expected}"`,
    actual: str,
    expected,
  });
}
|
|
@@ -10,10 +10,15 @@ export type ChatResponse = OpenAI.ChatCompletion & {
|
|
|
10
10
|
message: {
|
|
11
11
|
reasoning_content?: string,
|
|
12
12
|
reasoning?: string,
|
|
13
|
+
tool_calls?: Array<{
|
|
14
|
+
index: number,
|
|
15
|
+
}>
|
|
13
16
|
},
|
|
14
17
|
}>
|
|
15
18
|
};
|
|
16
19
|
|
|
20
|
+
/**
 * The `message` object of a single entry in `ChatResponse["choices"]`,
 * derived by indexed access so it stays in sync with `ChatResponse`.
 */
export type ChatMessage = ChatResponse["choices"][number]["message"];
|
|
21
|
+
|
|
17
22
|
const TextContentPart = t.subtype({
|
|
18
23
|
type: t.value("text"),
|
|
19
24
|
text: t.str,
|
|
@@ -56,6 +61,7 @@ const AssistantMessageSchema = t.subtype({
|
|
|
56
61
|
name: t.str,
|
|
57
62
|
})),
|
|
58
63
|
reasoning_content: t.optional(t.str.or(t.nil)),
|
|
64
|
+
reasoning: t.optional(t.str.or(t.nil)),
|
|
59
65
|
});
|
|
60
66
|
|
|
61
67
|
const UserMessageSchema = t.subtype({
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
|
|
2
|
+
import { getEvals } from "./evals.ts";
|
|
3
|
+
|
|
4
|
+
// Smoke test: every eval returned by getEvals() exposes the expected shape.
describe("get-evals", () => {
  it("works", async () => {
    const evals = await getEvals();
    // NOTE(review): this passes vacuously when getEvals() resolves to an empty
    // list; consider asserting a non-empty result once discovery is confirmed
    // to find evals from wherever this suite is run.
    for(const { test, json, name } of evals) {
      expect(name).toBeTypeOf("string");
      expect(json).toBeTruthy();
      expect(test).toBeTypeOf("function");
    }
  });
});
|
package/source/evals.ts
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import fs from "fs/promises";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import { ChatMessage } from "./chat-completion.ts";
|
|
4
|
+
|
|
5
|
+
/** A single eval: the request to send plus the check to run on the reply. */
export type Eval = {
  /** Validates a response message; expected to throw when the response is unacceptable. */
  test: (response: ChatMessage) => unknown;
  /** Request body exported by the eval module (spread into the completion request). */
  json: any;
  /** Display name in `<parent directory>/<file name without .js>` form. */
  name: string;
};
|
|
10
|
+
|
|
11
|
+
/**
 * Loads every eval module found under the `evals` directory that sits next to
 * this module's parent directory, reasoning evals included.
 *
 * @returns One `Eval` per discovered file, in discovery order.
 * @throws If discovery fails or a module cannot be imported.
 */
export async function getEvals(): Promise<Eval[]> {
  // Imported locally so this function does not depend on a new file-level import.
  const { pathToFileURL } = await import("node:url");
  const evalsPath = path.join(import.meta.dirname, "..", "evals");
  const evals: Eval[] = [];

  for await (const testFile of findTestFiles(evalsPath, false)) {
    // import() takes URL-style specifiers; a raw absolute path is rejected on
    // Windows (the drive letter is read as a URL scheme), so convert it first.
    const { test, json } = await import(pathToFileURL(testFile).href);
    evals.push({ test, json, name: evalName(testFile) });
  }

  return evals;
}
|
|
22
|
+
|
|
23
|
+
/**
 * Builds an eval's display name from its file path.
 *
 * @param file - Path to the eval file.
 * @returns `<parent directory name>/<file name>` with a trailing `.js`
 *   extension removed, e.g. `tools/simple-tool`.
 */
export function evalName(file: string): string {
  // The dot is escaped so only a literal ".js" suffix is stripped; the previous
  // /.js$/ also ate any character followed by "js" (e.g. "nodejs" -> "nod").
  const stem = path.basename(file).replace(/\.js$/, "");
  return `${path.basename(path.dirname(file))}/${stem}`;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Recursively yields the `.js` eval files reachable from `dir`.
 *
 * `dir` may be:
 * - a directory, which is walked recursively;
 * - the path of a `.js` file, which is yielded as-is;
 * - an extension-less eval path (e.g. `evals/tools/simple-tool`), in which
 *   case `<dir>.js` is yielded when that file exists.
 *
 * @param dir - Directory or eval path to search.
 * @param skipReasoning - When true, sub-directories named `reasoning` are not entered.
 * @throws The original `stat` error for `dir` when neither `dir` nor `<dir>.js`
 *   is usable, so the message names the path that was actually requested.
 * @throws Error when `dir` is an existing file that is not a `.js` file.
 */
export async function* findTestFiles(dir: string, skipReasoning: boolean): AsyncGenerator<string> {
  let rootIsFile = false;
  try {
    rootIsFile = (await fs.stat(dir)).isFile();
  } catch(e) {
    const pathname = `${dir}.js`;
    let fallbackIsFile = false;
    try {
      fallbackIsFile = (await fs.stat(pathname)).isFile();
    } catch {
      // The fallback is missing too; the error for the requested path is
      // rethrown below instead of being replaced by this one.
    }
    if(fallbackIsFile) {
      yield pathname;
      return;
    }
    throw e;
  }

  if(rootIsFile) {
    // A file path was passed directly; readdir() on it would fail with ENOTDIR.
    if(!dir.endsWith(".js")) {
      throw new Error(`Not an eval file or directory: ${dir}`);
    }
    yield dir;
    return;
  }

  const entryNames = await fs.readdir(dir);
  const entries = await Promise.all(entryNames.map(async (entry) => {
    const entryPath = path.join(dir, entry);
    return {
      path: entryPath,
      stat: await fs.stat(entryPath),
    };
  }));

  for(const entry of entries) {
    if(entry.stat.isFile()) {
      if(entry.path.endsWith(".js")) yield entry.path;
    } else if(entry.stat.isDirectory()) {
      if(skipReasoning && path.basename(entry.path) === "reasoning") continue;
      yield* findTestFiles(entry.path, skipReasoning);
    }
  }
}
|
|
56
|
+
|
package/source/index.ts
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import * as http from "http";
|
|
3
3
|
import * as https from "https";
|
|
4
|
-
import fs from "fs/promises";
|
|
5
4
|
import path from "path";
|
|
6
5
|
import { Command } from "@commander-js/extra-typings";
|
|
7
6
|
import OpenAI from "openai";
|
|
7
|
+
import { ChatMessage, getReasoning } from "./chat-completion.ts";
|
|
8
|
+
import { findTestFiles, evalName } from "./evals.ts";
|
|
8
9
|
|
|
9
10
|
const cli = new Command()
|
|
10
11
|
.name("synbad")
|
|
@@ -21,14 +22,20 @@ cli.command("eval")
|
|
|
21
22
|
.option(
|
|
22
23
|
"--skip-reasoning", "Skip reasoning evals (set this for non-reasoning models)"
|
|
23
24
|
)
|
|
25
|
+
.option(
|
|
26
|
+
"--reasoning-effort <level>", "Set the reasoning effort to high, medium, or low"
|
|
27
|
+
)
|
|
24
28
|
.option(
|
|
25
29
|
"--only <eval path within synbad>", "Specific evals you want to run, e.g. evals/reasoning or evals/tools/claude-dash"
|
|
26
30
|
)
|
|
27
31
|
.option(
|
|
28
32
|
"--count <num times>", "Number of times to run the eval. Any failures count as an overall failure",
|
|
29
33
|
)
|
|
34
|
+
.option(
|
|
35
|
+
"--stream", "Test streaming API calls",
|
|
36
|
+
)
|
|
30
37
|
.requiredOption("--model <model name>", "The model name to test")
|
|
31
|
-
.action(async ({ model, envVar, baseUrl, only, count }) => {
|
|
38
|
+
.action(async ({ model, envVar, baseUrl, only, count, skipReasoning, reasoningEffort, stream }) => {
|
|
32
39
|
if(!process.env[envVar]) {
|
|
33
40
|
console.error(`No env var named ${envVar} exists for the current process`);
|
|
34
41
|
process.exit(1);
|
|
@@ -41,28 +48,126 @@ cli.command("eval")
|
|
|
41
48
|
const failures = new Set<string>();
|
|
42
49
|
const evalPath = only ? path.join(
|
|
43
50
|
import.meta.dirname, "..", only
|
|
44
|
-
) : path.join(import.meta.dirname, "
|
|
51
|
+
) : path.join(import.meta.dirname, "..", "evals");
|
|
45
52
|
const maxRuns = count == null ? 1 : parseInt(count, 10);
|
|
46
|
-
for await(const testFile of findTestFiles(evalPath)) {
|
|
53
|
+
for await(const testFile of findTestFiles(evalPath, !!skipReasoning)) {
|
|
47
54
|
found++;
|
|
48
55
|
const test = await import(testFile);
|
|
49
56
|
const json = test.json;
|
|
50
57
|
const name = evalName(testFile);
|
|
51
58
|
process.stdout.write(`Running ${name}...`);
|
|
59
|
+
|
|
60
|
+
/**
 * Sends the current eval's request and returns the assistant message.
 *
 * Without `--stream` this is the first choice of a regular completion. With
 * `--stream` the message is rebuilt from the chunk deltas: content and
 * reasoning text are concatenated, and tool calls are assembled per `index`.
 *
 * @returns The assistant message to hand to the eval's `test` function.
 * @throws If the request fails, or a non-streaming response has no choices.
 */
async function respond(): Promise<ChatMessage> {
  const reasoning = reasoningEffort == null ? {} : {
    reasoning_effort: reasoningEffort,
  };

  if(!stream) {
    const response = await client.chat.completions.create({
      ...json,
      ...reasoning,
      stream: false,
      model,
    });
    const choice = response.choices[0];
    // Fail with a readable message instead of a TypeError on `undefined.message`.
    if(!choice) throw new Error("Response contained no choices");
    return choice.message as ChatMessage;
  }

  const msg: Partial<ChatMessage> = {};

  const chunkStream = await (client.chat.completions.create({
    ...json,
    ...reasoning,
    model,
    stream: true,
  }) as unknown as Promise<AsyncIterable<OpenAI.ChatCompletionChunk & {
    choices: Array<{
      delta: {
        reasoning?: string,
        reasoning_content?: string,
      },
    }>
  }>>);

  type ToolBuffer = {
    id?: string,
    type: "function",
    index: number,
    function: {
      name?: string,
      arguments?: string,
    },
  };

  // Keyed by the delta's `index` so fragments of one call are merged even when
  // deltas for different calls arrive interleaved. Map iteration follows
  // insertion order, so calls are emitted in the order they first appeared.
  const toolBuffers = new Map<number, ToolBuffer>();

  for await(const chunk of chunkStream) {
    if(!chunk.choices) continue;
    const choice = chunk.choices[0];
    if(!choice) continue;

    const content = choice.delta.content;
    const tools = choice.delta.tool_calls;
    const reasoningDelta = getReasoning(choice.delta);

    if(content) {
      if(!msg.content) msg.content = "";
      msg.content += content;
    }

    if(tools) {
      for(const toolDelta of tools) {
        let toolBuffer = toolBuffers.get(toolDelta.index);
        if(!toolBuffer) {
          toolBuffer = {
            index: toolDelta.index,
            type: "function",
            function: {},
          };
          toolBuffers.set(toolDelta.index, toolBuffer);
        }
        if(toolDelta.id) toolBuffer.id = toolDelta.id;
        if(toolDelta.function) {
          if(toolDelta.function.name) {
            toolBuffer.function.name ||= "";
            toolBuffer.function.name += toolDelta.function.name;
          }
          if(toolDelta.function.arguments) {
            toolBuffer.function.arguments ||= "";
            toolBuffer.function.arguments += toolDelta.function.arguments;
          }
        }
      }
    }

    if(reasoningDelta) {
      if(!msg.reasoning_content) msg.reasoning_content = "";
      msg.reasoning_content += reasoningDelta;
    }
  }

  for(const toolBuffer of toolBuffers.values()) {
    msg.tool_calls ||= [];
    // @ts-ignore -- buffer fields are optional while streaming; the evals validate them at runtime
    msg.tool_calls.push(toolBuffer);
  }

  return msg as ChatMessage;
}
|
|
159
|
+
|
|
52
160
|
try {
|
|
53
161
|
for(let i = 0; i < maxRuns; i++) {
|
|
54
162
|
if(maxRuns > 1) {
|
|
55
163
|
process.stdout.write(` ${i + 1}/${maxRuns}`);
|
|
56
164
|
}
|
|
57
|
-
const response = await
|
|
58
|
-
model,
|
|
59
|
-
...json,
|
|
60
|
-
});
|
|
165
|
+
const response = await respond();
|
|
61
166
|
try {
|
|
62
167
|
test.test(response);
|
|
63
168
|
} catch(e) {
|
|
64
169
|
console.error("Response:");
|
|
65
|
-
console.error(JSON.stringify(response
|
|
170
|
+
console.error(JSON.stringify(response, null, 2));
|
|
66
171
|
throw e;
|
|
67
172
|
}
|
|
68
173
|
}
|
|
@@ -90,6 +195,7 @@ ${passed}/${found} evals passed. Failures:
|
|
|
90
195
|
cli.command("proxy")
|
|
91
196
|
.requiredOption("-p, --port <number>", "Port to listen on")
|
|
92
197
|
.requiredOption("-t, --target <url>", "Target URL to proxy to")
|
|
198
|
+
.option("--pretty", "Pretty-print the JSON")
|
|
93
199
|
.action(async (options) => {
|
|
94
200
|
const port = parseInt(options.port, 10);
|
|
95
201
|
const targetUrl = new URL(options.target);
|
|
@@ -122,6 +228,8 @@ cli.command("proxy")
|
|
|
122
228
|
// Choose the right module based on target protocol
|
|
123
229
|
const httpModule = targetUrl.protocol === "https:" ? https : http;
|
|
124
230
|
|
|
231
|
+
const buffer: string[] = [];
|
|
232
|
+
|
|
125
233
|
// Create proxy request
|
|
126
234
|
const proxyReq = httpModule.request(
|
|
127
235
|
{
|
|
@@ -177,12 +285,14 @@ cli.command("proxy")
|
|
|
177
285
|
});
|
|
178
286
|
|
|
179
287
|
req.on("data", (chunk) => {
|
|
180
|
-
|
|
288
|
+
buffer.push(chunk);
|
|
289
|
+
if(!options.pretty) process.stdout.write(chunk);
|
|
181
290
|
proxyReq.write(chunk);
|
|
182
291
|
});
|
|
183
292
|
|
|
184
293
|
// Once the incoming request body is complete: print it (pretty mode) or
// terminate the raw echo with a newline, then finish the proxied request.
req.on("end", () => {
  if(options.pretty) {
    // join() without an argument separates chunks with ",", which corrupts any
    // body that arrived in more than one chunk; join with "" instead.
    const body = buffer.join("");
    try {
      console.log(JSON.stringify(JSON.parse(body), null, 2));
    } catch {
      // Empty or non-JSON bodies cannot be pretty-printed. Echo them unchanged
      // rather than letting the SyntaxError escape this event handler.
      console.log(body);
    }
  }
  else process.stdout.write("\n");
  console.log(`[${timestamp}] ✅ Request complete`);
  proxyReq.end();
});
|
|
@@ -208,39 +318,6 @@ cli.command("proxy")
|
|
|
208
318
|
});
|
|
209
319
|
});
|
|
210
320
|
|
|
211
|
-
function evalName(file: string) {
|
|
212
|
-
return `${path.basename(path.dirname(file))}/${path.basename(file).replace(/.js$/, "")}`
|
|
213
|
-
}
|
|
214
|
-
|
|
215
|
-
async function* findTestFiles(dir: string): AsyncGenerator<string> {
|
|
216
|
-
try {
|
|
217
|
-
await fs.stat(dir);
|
|
218
|
-
} catch(e) {
|
|
219
|
-
const pathname = `${dir}.js`;
|
|
220
|
-
const stat = await fs.stat(pathname);
|
|
221
|
-
if(stat.isFile()) {
|
|
222
|
-
yield pathname;
|
|
223
|
-
return;
|
|
224
|
-
}
|
|
225
|
-
throw e;
|
|
226
|
-
}
|
|
227
|
-
const entryNames = await fs.readdir(dir);
|
|
228
|
-
const entries = await Promise.all(entryNames.map(async (entry) => {
|
|
229
|
-
return {
|
|
230
|
-
path: path.join(dir, entry),
|
|
231
|
-
stat: await fs.stat(path.join(dir, entry)),
|
|
232
|
-
};
|
|
233
|
-
}));
|
|
234
|
-
for(const entry of entries) {
|
|
235
|
-
if(entry.stat.isFile() && entry.path.endsWith(".js")) {
|
|
236
|
-
yield entry.path;
|
|
237
|
-
}
|
|
238
|
-
if(entry.stat.isDirectory()) {
|
|
239
|
-
yield* findTestFiles(entry.path);
|
|
240
|
-
}
|
|
241
|
-
}
|
|
242
|
-
}
|
|
243
|
-
|
|
244
321
|
function stderrLog(item: string, ...items: string[]) {
|
|
245
322
|
let formatted = item;
|
|
246
323
|
if(items.length > 0) {
|