@syntheticlab/synbad 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/README.md +60 -23
  2. package/dist/evals/reasoning/custom-harness.d.ts +679 -0
  3. package/dist/evals/reasoning/custom-harness.js +847 -0
  4. package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +2 -2
  5. package/dist/evals/reasoning/multiturn-reasoning-parsing.js +2 -2
  6. package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +2 -2
  7. package/dist/evals/reasoning/reasoning-claude-tool-call.js +1 -2
  8. package/dist/evals/reasoning/reasoning-parsing.d.ts +2 -2
  9. package/dist/evals/reasoning/reasoning-parsing.js +2 -2
  10. package/dist/evals/reasoning/response-in-reasoning.d.ts +45 -0
  11. package/dist/evals/reasoning/response-in-reasoning.js +59 -0
  12. package/dist/evals/tools/claude-dash.d.ts +2 -2
  13. package/dist/evals/tools/claude-dash.js +1 -2
  14. package/dist/evals/tools/crush-list-files.d.ts +2 -5
  15. package/dist/evals/tools/crush-list-files.js +6 -8
  16. package/dist/evals/tools/multi-turn-tools.d.ts +46 -0
  17. package/dist/evals/tools/multi-turn-tools.js +100 -0
  18. package/dist/evals/tools/no-fn-args.d.ts +22 -0
  19. package/dist/evals/tools/no-fn-args.js +31 -0
  20. package/dist/evals/tools/octo-list-no-optional-args.d.ts +209 -0
  21. package/dist/evals/tools/octo-list-no-optional-args.js +73 -0
  22. package/dist/evals/tools/parallel-tool.d.ts +2 -2
  23. package/dist/evals/tools/parallel-tool.js +1 -2
  24. package/dist/evals/tools/simple-tool.d.ts +2 -2
  25. package/dist/evals/tools/simple-tool.js +3 -2
  26. package/dist/evals/tools/tool-dash-underscore.d.ts +26 -0
  27. package/dist/evals/tools/tool-dash-underscore.js +37 -0
  28. package/dist/evals/tools/tool-path-corruption.d.ts +26 -0
  29. package/dist/evals/tools/tool-path-corruption.js +41 -0
  30. package/dist/source/asserts.d.ts +4 -1
  31. package/dist/source/asserts.js +36 -0
  32. package/dist/source/chat-completion.d.ts +5 -0
  33. package/dist/source/chat-completion.js +1 -0
  34. package/dist/source/evals.d.ts +9 -0
  35. package/dist/source/evals.js +53 -0
  36. package/dist/source/evals.test.d.ts +1 -0
  37. package/dist/source/evals.test.js +12 -0
  38. package/dist/source/exports.d.ts +2 -0
  39. package/dist/source/exports.js +1 -0
  40. package/dist/source/index.js +103 -43
  41. package/evals/reasoning/multiturn-reasoning-parsing.ts +3 -3
  42. package/evals/reasoning/reasoning-claude-tool-call.ts +2 -3
  43. package/evals/reasoning/reasoning-parsing.ts +3 -3
  44. package/evals/reasoning/response-in-reasoning.ts +65 -0
  45. package/evals/tools/claude-dash.ts +2 -3
  46. package/evals/tools/crush-list-files.ts +11 -13
  47. package/evals/tools/multi-turn-tools.ts +104 -0
  48. package/evals/tools/no-fn-args.ts +34 -0
  49. package/evals/tools/octo-list-no-optional-args.ts +81 -0
  50. package/evals/tools/parallel-tool.ts +2 -3
  51. package/evals/tools/simple-tool.ts +4 -3
  52. package/evals/tools/tool-dash-underscore.ts +40 -0
  53. package/evals/tools/tool-path-corruption.ts +46 -0
  54. package/package.json +10 -3
  55. package/source/asserts.ts +37 -1
  56. package/source/chat-completion.ts +6 -0
  57. package/source/evals.test.ts +13 -0
  58. package/source/evals.ts +56 -0
  59. package/source/exports.ts +2 -0
  60. package/source/index.ts +121 -44
@@ -0,0 +1,73 @@
1
+ import * as assert from "../../source/asserts.js";
2
+ export function test({ tool_calls }) {
3
+ assert.isNotNullish(tool_calls);
4
+ assert.isNotEmptyArray(tool_calls);
5
+ assert.strictEqual(tool_calls.length, 1);
6
+ assert.strictEqual(tool_calls[0].type, "function");
7
+ const parsed = JSON.parse(tool_calls[0].function.arguments);
8
+ assert.or(() => assert.isNullish(parsed), () => assert.deepEqual(parsed, {}), () => assert.isNullish(parsed.dirPath), () => assert.strictEqual(parsed.dirPath, ""));
9
+ }
10
+ export const json = {
11
+ "messages": [
12
+ {
13
+ "role": "system",
14
+ "content": "You are a coding assistant called Octo."
15
+ },
16
+ {
17
+ "role": "user",
18
+ "content": "call the list tool with no args"
19
+ }
20
+ ],
21
+ tools: [
22
+ {
23
+ "type": "function",
24
+ "function": {
25
+ "name": "append",
26
+ "description": "The append tool",
27
+ "parameters": {
28
+ "type": "object",
29
+ "required": ["filePath", "text"],
30
+ "properties": {
31
+ "filePath": {
32
+ "description": "The path to the file",
33
+ "type": "string"
34
+ },
35
+ "text": {
36
+ "description": "The text to append",
37
+ "type": "string"
38
+ }
39
+ }
40
+ },
41
+ "strict": true
42
+ }
43
+ },
44
+ {
45
+ "type": "function",
46
+ "function": {
47
+ "name": "create",
48
+ "description": "The create tool",
49
+ "parameters": {
50
+ "type": "object",
51
+ "required": ["filePath", "content"],
52
+ "properties": {
53
+ "filePath": {
54
+ "description": "Path where the file should be created",
55
+ "type": "string"
56
+ }, "content": {
57
+ "description": "Content to write to the file",
58
+ "type": "string"
59
+ }
60
+ }
61
+ },
62
+ "strict": true
63
+ }
64
+ },
65
+ { "type": "function", "function": { "name": "edit", "description": "The edit tool", "parameters": { "type": "object", "required": ["filePath", "search", "replace"], "properties": { "filePath": { "description": "The path to the file", "type": "string" }, "search": { "description": "The search string to replace. Must EXACTLY match the text you intend to replace, including\nwhitespace, punctuation, etc. Make sure to give a few lines of context above and below so you\ndon't accidentally replace a different matching substring in the same file.", "type": "string" }, "replace": { "description": "The string you want to insert into the file", "type": "string" } } }, "strict": true } },
66
+ { "type": "function", "function": { "name": "fetch", "description": "The fetch tool", "parameters": { "type": "object", "required": ["url"], "properties": { "url": { "description": "Full url to fetch, e.g. https://...", "type": "string" }, "includeMarkup": { "description": "Include the HTML markup? Defaults to false. By default or when set to false, markup will be\nstripped and converted to plain text. Prefer markup stripping, and only set this to true if the\noutput is confusing: otherwise you may download a massive amount of data", "type": "boolean" } } }, "strict": true } },
67
+ { "type": "function", "function": { "name": "list", "description": "The list tool", "parameters": { "type": "object", "required": [], "properties": { "dirPath": { "description": "Path to the directory", "type": "string" } } }, "strict": true } },
68
+ { "type": "function", "function": { "name": "prepend", "description": "The prepend tool", "parameters": { "type": "object", "required": ["filePath", "text"], "properties": { "filePath": { "description": "The path to the file", "type": "string" }, "text": { "description": "The text to prepend", "type": "string" } } }, "strict": true } },
69
+ { "type": "function", "function": { "name": "read", "description": "The read tool", "parameters": { "type": "object", "required": ["filePath"], "properties": { "filePath": { "description": "Path to file to read", "type": "string" } } }, "strict": true } },
70
+ { "type": "function", "function": { "name": "rewrite", "description": "The rewrite tool", "parameters": { "type": "object", "required": ["filePath", "text"], "properties": { "filePath": { "description": "The path to the file", "type": "string" }, "text": { "description": "The replaced file contents. This will rewrite and replace the entire file", "type": "string" } } }, "strict": true } },
71
+ { "type": "function", "function": { "name": "shell", "description": "The shell tool", "parameters": { "type": "object", "required": ["timeout", "cmd"], "properties": { "timeout": { "description": "A timeout for the command, in milliseconds. Be generous. You MUST specify this.", "type": "number" }, "cmd": { "description": "The command to run", "type": "string" } } }, "strict": true } }
72
+ ],
73
+ };
@@ -1,5 +1,5 @@
1
- import OpenAI from "openai";
2
- export declare function test(response: OpenAI.ChatCompletion): void;
1
+ import { ChatMessage } from "../../source/chat-completion.ts";
2
+ export declare function test({ tool_calls }: ChatMessage): void;
3
3
  export declare const json: {
4
4
  messages: {
5
5
  role: string;
@@ -1,6 +1,5 @@
1
1
  import * as assert from "../../source/asserts.js";
2
- export function test(response) {
3
- const { tool_calls } = response.choices[0].message;
2
+ export function test({ tool_calls }) {
4
3
  assert.isNotNullish(tool_calls);
5
4
  assert.isNotEmptyArray(tool_calls);
6
5
  assert.strictEqual(tool_calls.length, 2);
@@ -1,5 +1,5 @@
1
- import { ChatResponse } from "../../source/chat-completion.ts";
2
- export declare function test(response: ChatResponse): void;
1
+ import { ChatMessage } from "../../source/chat-completion.ts";
2
+ export declare function test({ content, tool_calls }: ChatMessage): void;
3
3
  export declare const json: {
4
4
  messages: {
5
5
  role: string;
@@ -1,6 +1,5 @@
1
1
  import * as assert from "../../source/asserts.js";
2
- export function test(response) {
3
- const { tool_calls } = response.choices[0].message;
2
+ export function test({ content, tool_calls }) {
4
3
  assert.isNotNullish(tool_calls);
5
4
  assert.isNotEmptyArray(tool_calls);
6
5
  assert.strictEqual(tool_calls.length, 1);
@@ -8,6 +7,8 @@ export function test(response) {
8
7
  assert.strictEqual(tool_calls[0].function.name, "get_weather");
9
8
  const args = JSON.parse(tool_calls[0].function.arguments);
10
9
  assert.match(args.location.toLowerCase(), /paris/);
10
+ // Assert the tool call didn't leak into the content
11
+ assert.doesNotMatch(content || "", /get_weather/);
11
12
  }
12
13
  export const json = {
13
14
  "messages": [
@@ -0,0 +1,26 @@
1
+ import { ChatMessage } from "../../source/chat-completion.ts";
2
+ export declare function test({ content, tool_calls }: ChatMessage): void;
3
+ export declare const json: {
4
+ messages: {
5
+ role: string;
6
+ content: string;
7
+ }[];
8
+ tools: {
9
+ type: string;
10
+ function: {
11
+ name: string;
12
+ description: string;
13
+ parameters: {
14
+ type: string;
15
+ properties: {
16
+ location: {
17
+ type: string;
18
+ description: string;
19
+ };
20
+ };
21
+ required: string[];
22
+ };
23
+ };
24
+ }[];
25
+ tool_choice: string;
26
+ };
@@ -0,0 +1,37 @@
1
+ import * as assert from "../../source/asserts.js";
2
+ export function test({ content, tool_calls }) {
3
+ assert.isNotNullish(tool_calls);
4
+ assert.isNotEmptyArray(tool_calls);
5
+ assert.strictEqual(tool_calls.length, 1);
6
+ assert.strictEqual(tool_calls[0].type, "function");
7
+ assert.strictEqual(tool_calls[0].function.name, "get-weather__v1");
8
+ const args = JSON.parse(tool_calls[0].function.arguments);
9
+ assert.match(args.location.toLowerCase(), /paris/);
10
+ // Assert the tool call didn't leak into the content
11
+ assert.doesNotMatch(content || "", /get_weather/);
12
+ }
13
+ export const json = {
14
+ "messages": [
15
+ { "role": "user", "content": "What's the weather in Paris?" }
16
+ ],
17
+ "tools": [
18
+ {
19
+ "type": "function",
20
+ "function": {
21
+ "name": "get-weather__v1",
22
+ "description": "Get current weather for a location",
23
+ "parameters": {
24
+ "type": "object",
25
+ "properties": {
26
+ "location": {
27
+ "type": "string",
28
+ "description": "City name"
29
+ }
30
+ },
31
+ "required": ["location"]
32
+ }
33
+ }
34
+ }
35
+ ],
36
+ "tool_choice": "auto",
37
+ };
@@ -0,0 +1,26 @@
1
+ import { ChatMessage } from "../../source/chat-completion.ts";
2
+ export declare function test({ tool_calls }: ChatMessage): void;
3
+ export declare const json: {
4
+ messages: {
5
+ role: string;
6
+ content: string;
7
+ }[];
8
+ tools: {
9
+ type: string;
10
+ function: {
11
+ name: string;
12
+ description: string;
13
+ parameters: {
14
+ type: string;
15
+ required: string[];
16
+ properties: {
17
+ filePath: {
18
+ description: string;
19
+ type: string;
20
+ };
21
+ };
22
+ };
23
+ strict: boolean;
24
+ };
25
+ }[];
26
+ };
@@ -0,0 +1,41 @@
1
+ import * as assert from "../../source/asserts.js";
2
+ const PATH = "/development/evals/reasoning/Scratch/reasoning-claude-tool-call.ts";
3
+ export function test({ tool_calls }) {
4
+ assert.isNotNullish(tool_calls);
5
+ assert.isNotEmptyArray(tool_calls);
6
+ assert.strictEqual(tool_calls.length, 1);
7
+ assert.strictEqual(tool_calls[0].type, "function");
8
+ assert.strictEqual(tool_calls[0].function.name, "read");
9
+ const args = JSON.parse(tool_calls[0].function.arguments);
10
+ assert.stringContains(args.filePath, PATH);
11
+ }
12
+ export const json = {
13
+ "messages": [
14
+ {
15
+ "role": "user",
16
+ "content": "Read and summarize the file /development/evals/reasoning/Scratch/reasoning-claude-tool-call.ts"
17
+ }
18
+ ],
19
+ "tools": [
20
+ {
21
+ "type": "function",
22
+ "function": {
23
+ "name": "read",
24
+ "description": "The read tool",
25
+ "parameters": {
26
+ "type": "object",
27
+ "required": [
28
+ "filePath"
29
+ ],
30
+ "properties": {
31
+ "filePath": {
32
+ "description": "Path to file to read",
33
+ "type": "string"
34
+ }
35
+ }
36
+ },
37
+ "strict": true
38
+ }
39
+ },
40
+ ],
41
+ };
@@ -3,5 +3,8 @@ export declare function or(a: () => void, ...rest: Array<() => void>): void;
3
3
  export declare function isNullish(a: unknown): asserts a is null | undefined;
4
4
  export declare function isNotNullish<T extends any>(a: T): asserts a is Exclude<T, null | undefined>;
5
5
  export declare function isEmptyArray(a: any[]): boolean;
6
- export declare function isNotEmptyArray(a: any[]): boolean;
6
+ export declare function isNotEmptyArray(a: any[] | undefined): boolean;
7
7
  export declare function startsWith(a: string, prefix: string): boolean;
8
+ export declare function gt(num: number, target: number): boolean;
9
+ export declare function gte(num: number, target: number): boolean;
10
+ export declare function stringContains(str: string, expected: string): boolean;
@@ -45,6 +45,12 @@ export function isEmptyArray(a) {
45
45
  });
46
46
  }
47
47
  export function isNotEmptyArray(a) {
48
+ if (a == null) {
49
+ throw new assert.AssertionError({
50
+ message: "Expected a non-empty array",
51
+ actual: a,
52
+ });
53
+ }
48
54
  if (a.length !== 0)
49
55
  return true;
50
56
  throw new assert.AssertionError({
@@ -60,3 +66,33 @@ export function startsWith(a, prefix) {
60
66
  actual: a,
61
67
  });
62
68
  }
69
+ export function gt(num, target) {
70
+ if (num > target)
71
+ return true;
72
+ throw new assert.AssertionError({
73
+ message: `Expected ${num} > ${target}`,
74
+ actual: num,
75
+ });
76
+ }
77
+ export function gte(num, target) {
78
+ if (num >= target)
79
+ return true;
80
+ throw new assert.AssertionError({
81
+ message: `Expected ${num} >= ${target}`,
82
+ actual: num,
83
+ });
84
+ }
85
+ export function stringContains(str, expected) {
86
+ if (typeof str !== "string") {
87
+ throw new assert.AssertionError({
88
+ message: "Expected input to be of type string.",
89
+ actual: typeof str,
90
+ });
91
+ }
92
+ if (str.includes(expected))
93
+ return true;
94
+ throw new assert.AssertionError({
95
+ message: `Expected string to contain: "${expected}"`,
96
+ actual: str,
97
+ });
98
+ }
@@ -9,9 +9,13 @@ export type ChatResponse = OpenAI.ChatCompletion & {
9
9
  message: {
10
10
  reasoning_content?: string;
11
11
  reasoning?: string;
12
+ tool_calls?: Array<{
13
+ index: number;
14
+ }>;
12
15
  };
13
16
  }>;
14
17
  };
18
+ export type ChatMessage = ChatResponse["choices"][number]["message"];
15
19
  export declare const ChatCompletion: t.Struct<{
16
20
  messages: t.Arr<t.UnwrappedTypeStruct<{
17
21
  content: t.Type<string | t.UnwrappedTypeStruct<{
@@ -54,6 +58,7 @@ export declare const ChatCompletion: t.Struct<{
54
58
  name: t.TypeOf<string>;
55
59
  }>>;
56
60
  reasoning_content: t.OptionalKey<t.Type<string | null>>;
61
+ reasoning: t.OptionalKey<t.Type<string | null>>;
57
62
  }> | t.UnwrappedTypeStruct<{
58
63
  role: t.Value<"tool">;
59
64
  content: t.Type<string | t.UnwrappedTypeStruct<{
@@ -41,6 +41,7 @@ const AssistantMessageSchema = t.subtype({
41
41
  name: t.str,
42
42
  })),
43
43
  reasoning_content: t.optional(t.str.or(t.nil)),
44
+ reasoning: t.optional(t.str.or(t.nil)),
44
45
  });
45
46
  const UserMessageSchema = t.subtype({
46
47
  content: UserContent,
@@ -0,0 +1,9 @@
1
+ import { ChatMessage } from "./chat-completion.ts";
2
+ export type Eval = {
3
+ test: (response: ChatMessage) => any;
4
+ json: any;
5
+ name: string;
6
+ };
7
+ export declare function getEvals(): Promise<Eval[]>;
8
+ export declare function evalName(file: string): string;
9
+ export declare function findTestFiles(dir: string, skipReasoning: boolean): AsyncGenerator<string>;
@@ -0,0 +1,53 @@
1
+ var __rewriteRelativeImportExtension = (this && this.__rewriteRelativeImportExtension) || function (path, preserveJsx) {
2
+ if (typeof path === "string" && /^\.\.?\//.test(path)) {
3
+ return path.replace(/\.(tsx)$|((?:\.d)?)((?:\.[^./]+?)?)\.([cm]?)ts$/i, function (m, tsx, d, ext, cm) {
4
+ return tsx ? preserveJsx ? ".jsx" : ".js" : d && (!ext || !cm) ? m : (d + ext + "." + cm.toLowerCase() + "js");
5
+ });
6
+ }
7
+ return path;
8
+ };
9
+ import fs from "fs/promises";
10
+ import path from "path";
11
+ export async function getEvals() {
12
+ const evals = [];
13
+ const evalsPath = path.join(import.meta.dirname, "..", "evals");
14
+ for await (const testFile of findTestFiles(evalsPath, false)) {
15
+ const { test, json } = await import(__rewriteRelativeImportExtension(testFile));
16
+ evals.push({ test, json, name: evalName(testFile) });
17
+ }
18
+ return evals;
19
+ }
20
+ export function evalName(file) {
21
+ return `${path.basename(path.dirname(file))}/${path.basename(file).replace(/.js$/, "")}`;
22
+ }
23
+ export async function* findTestFiles(dir, skipReasoning) {
24
+ try {
25
+ await fs.stat(dir);
26
+ }
27
+ catch (e) {
28
+ const pathname = `${dir}.js`;
29
+ const stat = await fs.stat(pathname);
30
+ if (stat.isFile()) {
31
+ yield pathname;
32
+ return;
33
+ }
34
+ throw e;
35
+ }
36
+ const entryNames = await fs.readdir(dir);
37
+ const entries = await Promise.all(entryNames.map(async (entry) => {
38
+ return {
39
+ path: path.join(dir, entry),
40
+ stat: await fs.stat(path.join(dir, entry)),
41
+ };
42
+ }));
43
+ for (const entry of entries) {
44
+ if (entry.stat.isFile() && entry.path.endsWith(".js")) {
45
+ yield entry.path;
46
+ }
47
+ if (entry.stat.isDirectory()) {
48
+ if (skipReasoning && path.basename(entry.path) === "reasoning")
49
+ continue;
50
+ yield* findTestFiles(entry.path, skipReasoning);
51
+ }
52
+ }
53
+ }
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,12 @@
1
+ import { describe, expect, it } from "vitest";
2
+ import { getEvals } from "./evals.js";
3
+ describe("get-evals", () => {
4
+ it("works", async () => {
5
+ const evals = await getEvals();
6
+ evals.map(({ test, json, name }) => {
7
+ expect(name).toBeTypeOf("string");
8
+ expect(json).toBeTruthy();
9
+ expect(test).toBeTypeOf("function");
10
+ });
11
+ });
12
+ });
@@ -0,0 +1,2 @@
1
+ export { getEvals, type Eval as SynbadEval } from "./evals.ts";
2
+ export { ChatMessage as SynbadChatMessage } from "./chat-completion.ts";
@@ -0,0 +1 @@
1
+ export { getEvals } from "./evals.js";
@@ -9,10 +9,11 @@ var __rewriteRelativeImportExtension = (this && this.__rewriteRelativeImportExte
9
9
  };
10
10
  import * as http from "http";
11
11
  import * as https from "https";
12
- import fs from "fs/promises";
13
12
  import path from "path";
14
13
  import { Command } from "@commander-js/extra-typings";
15
14
  import OpenAI from "openai";
15
+ import { getReasoning } from "./chat-completion.js";
16
+ import { findTestFiles, evalName } from "./evals.js";
16
17
  const cli = new Command()
17
18
  .name("synbad")
18
19
  .description("A set of evals for LLM inference providers");
@@ -21,10 +22,12 @@ cli.command("eval")
21
22
  .requiredOption("--env-var <env var name>", "The env var to use to authenticate with the inference provider")
22
23
  .requiredOption("--base-url <base url>", "The base URL for the inference provider")
23
24
  .option("--skip-reasoning", "Skip reasoning evals (set this for non-reasoning models)")
25
+ .option("--reasoning-effort <level>", "Set the reasoning effort to high, medium, or low")
24
26
  .option("--only <eval path within synbad>", "Specific evals you want to run, e.g. evals/reasoning or evals/tools/claude-dash")
25
27
  .option("--count <num times>", "Number of times to run the eval. Any failures count as an overall failure")
28
+ .option("--stream", "Test streaming API calls")
26
29
  .requiredOption("--model <model name>", "The model name to test")
27
- .action(async ({ model, envVar, baseUrl, only, count }) => {
30
+ .action(async ({ model, envVar, baseUrl, only, count, skipReasoning, reasoningEffort, stream }) => {
28
31
  if (!process.env[envVar]) {
29
32
  console.error(`No env var named ${envVar} exists for the current process`);
30
33
  process.exit(1);
@@ -35,29 +38,111 @@ cli.command("eval")
35
38
  });
36
39
  let found = 0;
37
40
  const failures = new Set();
38
- const evalPath = only ? path.join(import.meta.dirname, "..", only) : path.join(import.meta.dirname, "../evals");
41
+ const evalPath = only ? path.join(import.meta.dirname, "..", only) : path.join(import.meta.dirname, "..", "evals");
39
42
  const maxRuns = count == null ? 1 : parseInt(count, 10);
40
- for await (const testFile of findTestFiles(evalPath)) {
43
+ for await (const testFile of findTestFiles(evalPath, !!skipReasoning)) {
41
44
  found++;
42
45
  const test = await import(__rewriteRelativeImportExtension(testFile));
43
46
  const json = test.json;
44
47
  const name = evalName(testFile);
45
48
  process.stdout.write(`Running ${name}...`);
49
+ async function respond() {
50
+ const reasoning = reasoningEffort == null ? {} : {
51
+ reasoning_effort: reasoningEffort,
52
+ };
53
+ if (!stream) {
54
+ const response = await client.chat.completions.create({
55
+ ...json,
56
+ ...reasoning,
57
+ stream: false,
58
+ model,
59
+ });
60
+ return response.choices[0].message;
61
+ }
62
+ const msg = {};
63
+ const chunkStream = await client.chat.completions.create({
64
+ ...json,
65
+ ...reasoning,
66
+ model,
67
+ stream: true,
68
+ });
69
+ let lastIndex = null;
70
+ let toolBuffer = null;
71
+ for await (const chunk of chunkStream) {
72
+ if (!chunk.choices)
73
+ continue;
74
+ const choice = chunk.choices[0];
75
+ if (!choice)
76
+ continue;
77
+ const content = choice.delta.content;
78
+ const tools = choice.delta.tool_calls;
79
+ const reasoning = getReasoning(choice.delta);
80
+ if (content) {
81
+ if (!msg.content)
82
+ msg.content = "";
83
+ msg.content += content;
84
+ }
85
+ if (tools) {
86
+ for (const toolDelta of tools) {
87
+ if (lastIndex == null)
88
+ lastIndex = toolDelta.index;
89
+ if (lastIndex !== toolDelta.index && toolBuffer != null) {
90
+ msg.tool_calls ||= [];
91
+ // @ts-ignore
92
+ msg.tool_calls.push(toolBuffer);
93
+ toolBuffer = {
94
+ index: toolDelta.index,
95
+ type: "function",
96
+ function: {},
97
+ };
98
+ }
99
+ if (!toolBuffer) {
100
+ toolBuffer = {
101
+ index: toolDelta.index,
102
+ type: "function",
103
+ function: {}
104
+ };
105
+ }
106
+ lastIndex = toolDelta.index;
107
+ if (toolDelta.id)
108
+ toolBuffer.id = toolDelta.id;
109
+ if (toolDelta.function) {
110
+ if (toolDelta.function.name) {
111
+ toolBuffer.function.name ||= "";
112
+ toolBuffer.function.name += toolDelta.function.name;
113
+ }
114
+ if (toolDelta.function.arguments) {
115
+ toolBuffer.function.arguments ||= "";
116
+ toolBuffer.function.arguments += toolDelta.function.arguments;
117
+ }
118
+ }
119
+ }
120
+ }
121
+ if (reasoning) {
122
+ if (!msg.reasoning_content)
123
+ msg.reasoning_content = "";
124
+ msg.reasoning_content += reasoning;
125
+ }
126
+ }
127
+ if (toolBuffer) {
128
+ msg.tool_calls ||= [];
129
+ // @ts-ignore
130
+ msg.tool_calls.push(toolBuffer);
131
+ }
132
+ return msg;
133
+ }
46
134
  try {
47
135
  for (let i = 0; i < maxRuns; i++) {
48
136
  if (maxRuns > 1) {
49
137
  process.stdout.write(` ${i + 1}/${maxRuns}`);
50
138
  }
51
- const response = await client.chat.completions.create({
52
- model,
53
- ...json,
54
- });
139
+ const response = await respond();
55
140
  try {
56
141
  test.test(response);
57
142
  }
58
143
  catch (e) {
59
144
  console.error("Response:");
60
- console.error(JSON.stringify(response.choices[0], null, 2));
145
+ console.error(JSON.stringify(response, null, 2));
61
146
  throw e;
62
147
  }
63
148
  }
@@ -84,6 +169,7 @@ ${passed}/${found} evals passed. Failures:
84
169
  cli.command("proxy")
85
170
  .requiredOption("-p, --port <number>", "Port to listen on")
86
171
  .requiredOption("-t, --target <url>", "Target URL to proxy to")
172
+ .option("--pretty", "Pretty-print the JSON")
87
173
  .action(async (options) => {
88
174
  const port = parseInt(options.port, 10);
89
175
  const targetUrl = new URL(options.target);
@@ -108,6 +194,7 @@ cli.command("proxy")
108
194
  stderrLog(`[${timestamp}] 📦 Writing request data to stdout...`);
109
195
  // Choose the right module based on target protocol
110
196
  const httpModule = targetUrl.protocol === "https:" ? https : http;
197
+ const buffer = [];
111
198
  // Create proxy request
112
199
  const proxyReq = httpModule.request({
113
200
  hostname: targetUrl.hostname,
@@ -151,11 +238,16 @@ cli.command("proxy")
151
238
  }
152
239
  });
153
240
  req.on("data", (chunk) => {
154
- process.stdout.write(chunk);
241
+ buffer.push(chunk);
242
+ if (!options.pretty)
243
+ process.stdout.write(chunk);
155
244
  proxyReq.write(chunk);
156
245
  });
157
246
  req.on("end", () => {
158
- process.stdout.write("\n");
247
+ if (options.pretty)
248
+ console.log(JSON.stringify(JSON.parse(buffer.join()), null, 2));
249
+ else
250
+ process.stdout.write("\n");
159
251
  console.log(`[${timestamp}] ✅ Request complete`);
160
252
  proxyReq.end();
161
253
  });
@@ -178,38 +270,6 @@ cli.command("proxy")
178
270
  stderrLog("🤓 Terminal UI messages (such as this one) will be logged to stderr");
179
271
  });
180
272
  });
181
- function evalName(file) {
182
- return `${path.basename(path.dirname(file))}/${path.basename(file).replace(/.js$/, "")}`;
183
- }
184
- async function* findTestFiles(dir) {
185
- try {
186
- await fs.stat(dir);
187
- }
188
- catch (e) {
189
- const pathname = `${dir}.js`;
190
- const stat = await fs.stat(pathname);
191
- if (stat.isFile()) {
192
- yield pathname;
193
- return;
194
- }
195
- throw e;
196
- }
197
- const entryNames = await fs.readdir(dir);
198
- const entries = await Promise.all(entryNames.map(async (entry) => {
199
- return {
200
- path: path.join(dir, entry),
201
- stat: await fs.stat(path.join(dir, entry)),
202
- };
203
- }));
204
- for (const entry of entries) {
205
- if (entry.stat.isFile() && entry.path.endsWith(".js")) {
206
- yield entry.path;
207
- }
208
- if (entry.stat.isDirectory()) {
209
- yield* findTestFiles(entry.path);
210
- }
211
- }
212
- }
213
273
  function stderrLog(item, ...items) {
214
274
  let formatted = item;
215
275
  if (items.length > 0) {
@@ -1,8 +1,8 @@
1
1
  import * as assert from "../../source/asserts.ts";
2
- import { ChatResponse, getReasoning } from "../../source/chat-completion.ts";
2
+ import { ChatMessage, getReasoning } from "../../source/chat-completion.ts";
3
3
 
4
- export function test(response: ChatResponse) {
5
- const reasoning = getReasoning(response.choices[0].message);
4
+ export function test(message: ChatMessage) {
5
+ const reasoning = getReasoning(message);
6
6
  assert.isNotNullish(reasoning);
7
7
  }
8
8