@syntheticlab/synbad 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/README.md +60 -23
  2. package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +2 -2
  3. package/dist/evals/reasoning/multiturn-reasoning-parsing.js +2 -2
  4. package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +2 -2
  5. package/dist/evals/reasoning/reasoning-claude-tool-call.js +1 -2
  6. package/dist/evals/reasoning/reasoning-parsing.d.ts +2 -2
  7. package/dist/evals/reasoning/reasoning-parsing.js +2 -2
  8. package/dist/evals/reasoning/response-in-reasoning.d.ts +45 -0
  9. package/dist/evals/reasoning/response-in-reasoning.js +59 -0
  10. package/dist/evals/tools/claude-dash.d.ts +2 -2
  11. package/dist/evals/tools/claude-dash.js +1 -2
  12. package/dist/evals/tools/crush-list-files.d.ts +2 -5
  13. package/dist/evals/tools/crush-list-files.js +6 -8
  14. package/dist/evals/tools/multi-turn-tools.d.ts +46 -0
  15. package/dist/evals/tools/multi-turn-tools.js +100 -0
  16. package/dist/evals/tools/no-fn-args.d.ts +22 -0
  17. package/dist/evals/tools/no-fn-args.js +31 -0
  18. package/dist/evals/tools/octo-list-no-optional-args.d.ts +209 -0
  19. package/dist/evals/tools/octo-list-no-optional-args.js +73 -0
  20. package/dist/evals/tools/parallel-tool.d.ts +2 -2
  21. package/dist/evals/tools/parallel-tool.js +1 -2
  22. package/dist/evals/tools/simple-tool.d.ts +2 -2
  23. package/dist/evals/tools/simple-tool.js +3 -2
  24. package/dist/evals/tools/tool-dash-underscore.d.ts +26 -0
  25. package/dist/evals/tools/tool-dash-underscore.js +37 -0
  26. package/dist/evals/tools/tool-path-corruption.d.ts +26 -0
  27. package/dist/evals/tools/tool-path-corruption.js +41 -0
  28. package/dist/source/asserts.d.ts +4 -1
  29. package/dist/source/asserts.js +36 -0
  30. package/dist/source/chat-completion.d.ts +5 -0
  31. package/dist/source/chat-completion.js +1 -0
  32. package/dist/source/evals.d.ts +9 -0
  33. package/dist/source/evals.js +53 -0
  34. package/dist/source/evals.test.d.ts +1 -0
  35. package/dist/source/evals.test.js +12 -0
  36. package/dist/source/exports.d.ts +2 -0
  37. package/dist/source/exports.js +1 -0
  38. package/dist/source/index.js +103 -43
  39. package/evals/reasoning/multiturn-reasoning-parsing.ts +3 -3
  40. package/evals/reasoning/reasoning-claude-tool-call.ts +2 -3
  41. package/evals/reasoning/reasoning-parsing.ts +3 -3
  42. package/evals/reasoning/response-in-reasoning.ts +65 -0
  43. package/evals/tools/claude-dash.ts +2 -3
  44. package/evals/tools/crush-list-files.ts +11 -13
  45. package/evals/tools/multi-turn-tools.ts +104 -0
  46. package/evals/tools/no-fn-args.ts +34 -0
  47. package/evals/tools/octo-list-no-optional-args.ts +81 -0
  48. package/evals/tools/parallel-tool.ts +2 -3
  49. package/evals/tools/simple-tool.ts +4 -3
  50. package/evals/tools/tool-dash-underscore.ts +40 -0
  51. package/evals/tools/tool-path-corruption.ts +46 -0
  52. package/package.json +10 -3
  53. package/source/asserts.ts +37 -1
  54. package/source/chat-completion.ts +6 -0
  55. package/source/evals.test.ts +13 -0
  56. package/source/evals.ts +56 -0
  57. package/source/exports.ts +2 -0
  58. package/source/index.ts +121 -44
package/evals/tools/tool-dash-underscore.ts ADDED
@@ -0,0 +1,40 @@
1
+ import { ChatMessage } from "../../source/chat-completion.ts";
2
+ import * as assert from "../../source/asserts.ts";
3
+
4
+ export function test({ content, tool_calls }: ChatMessage) {
5
+ assert.isNotNullish(tool_calls);
6
+ assert.isNotEmptyArray(tool_calls);
7
+ assert.strictEqual(tool_calls.length, 1);
8
+ assert.strictEqual(tool_calls[0].type, "function");
9
+ assert.strictEqual(tool_calls[0].function.name, "get-weather__v1");
10
+ const args = JSON.parse(tool_calls[0].function.arguments);
11
+ assert.match(args.location.toLowerCase(), /paris/);
12
+ // Assert the tool call didn't leak into the content
13
+ assert.doesNotMatch(content || "", /get_weather/);
14
+ }
15
+
16
+ export const json = {
17
+ "messages": [
18
+ {"role": "user", "content": "What's the weather in Paris?"}
19
+ ],
20
+ "tools": [
21
+ {
22
+ "type": "function",
23
+ "function": {
24
+ "name": "get-weather__v1",
25
+ "description": "Get current weather for a location",
26
+ "parameters": {
27
+ "type": "object",
28
+ "properties": {
29
+ "location": {
30
+ "type": "string",
31
+ "description": "City name"
32
+ }
33
+ },
34
+ "required": ["location"]
35
+ }
36
+ }
37
+ }
38
+ ],
39
+ "tool_choice": "auto",
40
+ }
package/evals/tools/tool-path-corruption.ts ADDED
@@ -0,0 +1,46 @@
1
+ import { ChatMessage } from "../../source/chat-completion.ts";
2
+ import * as assert from "../../source/asserts.ts";
3
+
4
+ const PATH = "/development/evals/reasoning/Scratch/reasoning-claude-tool-call.ts";
5
+
6
+ export function test({ tool_calls }: ChatMessage) {
7
+ assert.isNotNullish(tool_calls);
8
+ assert.isNotEmptyArray(tool_calls);
9
+ assert.strictEqual(tool_calls.length, 1);
10
+ assert.strictEqual(tool_calls[0].type, "function");
11
+ assert.strictEqual(tool_calls[0].function.name, "read");
12
+
13
+ const args = JSON.parse(tool_calls[0].function.arguments);
14
+ assert.stringContains(args.filePath, PATH);
15
+ }
16
+
17
+ export const json = {
18
+ "messages": [
19
+ {
20
+ "role": "user",
21
+ "content": "Read and summarize the file /development/evals/reasoning/Scratch/reasoning-claude-tool-call.ts"
22
+ }
23
+ ],
24
+ "tools": [
25
+ {
26
+ "type": "function",
27
+ "function": {
28
+ "name": "read",
29
+ "description": "The read tool",
30
+ "parameters": {
31
+ "type": "object",
32
+ "required": [
33
+ "filePath"
34
+ ],
35
+ "properties": {
36
+ "filePath": {
37
+ "description": "Path to file to read",
38
+ "type": "string"
39
+ }
40
+ }
41
+ },
42
+ "strict": true
43
+ }
44
+ },
45
+ ],
46
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@syntheticlab/synbad",
3
- "version": "0.0.5",
3
+ "version": "0.0.6",
4
4
  "description": "LLM inference provider evals",
5
5
  "main": "dist/source/index.js",
6
6
  "bin": {
@@ -9,7 +9,7 @@
9
9
  "preferGlobal": true,
10
10
  "type": "module",
11
11
  "scripts": {
12
- "test": "echo \"Error: no test specified\" && exit 1",
12
+ "test": "vitest",
13
13
  "build": "tsc",
14
14
  "prepublishOnly": "tsc"
15
15
  },
@@ -23,6 +23,12 @@
23
23
  "package-lock.json",
24
24
  "tsconfig.json"
25
25
  ],
26
+ "exports": {
27
+ ".": {
28
+ "types": "./dist/source/exports.d.ts",
29
+ "import": "./dist/source/exports.js"
30
+ }
31
+ },
26
32
  "dependencies": {
27
33
  "@commander-js/extra-typings": "^14.0.0",
28
34
  "commander": "^14.0.2",
@@ -32,6 +38,7 @@
32
38
  },
33
39
  "devDependencies": {
34
40
  "@types/node": "^24.10.1",
35
- "tsx": "^4.20.6"
41
+ "tsx": "^4.20.6",
42
+ "vitest": "^4.0.17"
36
43
  }
37
44
  }
package/source/asserts.ts CHANGED
@@ -61,7 +61,13 @@ export function isEmptyArray(a: any[]) {
61
61
  });
62
62
  }
63
63
 
64
- export function isNotEmptyArray(a: any[]) {
64
+ export function isNotEmptyArray(a: any[] | undefined) {
65
+ if(a == null) {
66
+ throw new assert.AssertionError({
67
+ message: "Expected a non-empty array",
68
+ actual: a,
69
+ });
70
+ }
65
71
  if(a.length !== 0) return true;
66
72
  throw new assert.AssertionError({
67
73
  message: "Expected a non-empty array",
@@ -76,3 +82,33 @@ export function startsWith(a: string, prefix: string) {
76
82
  actual: a,
77
83
  });
78
84
  }
85
+
86
+ export function gt(num: number, target: number) {
87
+ if(num > target) return true;
88
+ throw new assert.AssertionError({
89
+ message: `Expected ${num} > ${target}`,
90
+ actual: num,
91
+ });
92
+ }
93
+
94
+ export function gte(num: number, target: number) {
95
+ if(num >= target) return true;
96
+ throw new assert.AssertionError({
97
+ message: `Expected ${num} >= ${target}`,
98
+ actual: num,
99
+ });
100
+ }
101
+
102
+ export function stringContains(str: string, expected: string) {
103
+ if(typeof str !== "string") {
104
+ throw new assert.AssertionError({
105
+ message: "Expected input to be of type string.",
106
+ actual: typeof str,
107
+ });
108
+ }
109
+ if(str.includes(expected)) return true;
110
+ throw new assert.AssertionError({
111
+ message: `Expected string to contain: "${expected}"`,
112
+ actual: str,
113
+ });
114
+ }
package/source/chat-completion.ts CHANGED
@@ -10,10 +10,15 @@ export type ChatResponse = OpenAI.ChatCompletion & {
10
10
  message: {
11
11
  reasoning_content?: string,
12
12
  reasoning?: string,
13
+ tool_calls?: Array<{
14
+ index: number,
15
+ }>
13
16
  },
14
17
  }>
15
18
  };
16
19
 
20
+ export type ChatMessage = ChatResponse["choices"][number]["message"];
21
+
17
22
  const TextContentPart = t.subtype({
18
23
  type: t.value("text"),
19
24
  text: t.str,
@@ -56,6 +61,7 @@ const AssistantMessageSchema = t.subtype({
56
61
  name: t.str,
57
62
  })),
58
63
  reasoning_content: t.optional(t.str.or(t.nil)),
64
+ reasoning: t.optional(t.str.or(t.nil)),
59
65
  });
60
66
 
61
67
  const UserMessageSchema = t.subtype({
package/source/evals.test.ts ADDED
@@ -0,0 +1,13 @@
1
+ import { describe, expect, it } from "vitest";
2
+ import { getEvals } from "./evals.ts";
3
+
4
+ describe("get-evals", () => {
5
+ it("works", async () => {
6
+ const evals = await getEvals();
7
+ evals.map(({ test, json, name }) => {
8
+ expect(name).toBeTypeOf("string");
9
+ expect(json).toBeTruthy();
10
+ expect(test).toBeTypeOf("function");
11
+ });
12
+ });
13
+ })
package/source/evals.ts ADDED
@@ -0,0 +1,56 @@
1
+ import fs from "fs/promises";
2
+ import path from "path";
3
+ import { ChatMessage } from "./chat-completion.ts";
4
+
5
+ export type Eval = {
6
+ test: (response: ChatMessage) => any;
7
+ json: any;
8
+ name: string;
9
+ };
10
+
11
+ export async function getEvals(): Promise<Eval[]> {
12
+ const evals: Eval[] = [];
13
+ const evalsPath = path.join(import.meta.dirname, "..", "evals");
14
+
15
+ for await (const testFile of findTestFiles(evalsPath, false)) {
16
+ const { test, json } = await import(testFile);
17
+ evals.push({ test, json, name: evalName(testFile) });
18
+ }
19
+
20
+ return evals;
21
+ }
22
+
23
+ export function evalName(file: string) {
24
+ return `${path.basename(path.dirname(file))}/${path.basename(file).replace(/.js$/, "")}`
25
+ }
26
+
27
+ export async function* findTestFiles(dir: string, skipReasoning: boolean): AsyncGenerator<string> {
28
+ try {
29
+ await fs.stat(dir);
30
+ } catch(e) {
31
+ const pathname = `${dir}.js`;
32
+ const stat = await fs.stat(pathname);
33
+ if(stat.isFile()) {
34
+ yield pathname;
35
+ return;
36
+ }
37
+ throw e;
38
+ }
39
+ const entryNames = await fs.readdir(dir);
40
+ const entries = await Promise.all(entryNames.map(async (entry) => {
41
+ return {
42
+ path: path.join(dir, entry),
43
+ stat: await fs.stat(path.join(dir, entry)),
44
+ };
45
+ }));
46
+ for(const entry of entries) {
47
+ if(entry.stat.isFile() && entry.path.endsWith(".js")) {
48
+ yield entry.path;
49
+ }
50
+ if(entry.stat.isDirectory()) {
51
+ if(skipReasoning && path.basename(entry.path) === "reasoning") continue;
52
+ yield* findTestFiles(entry.path, skipReasoning);
53
+ }
54
+ }
55
+ }
56
+
package/source/exports.ts ADDED
@@ -0,0 +1,2 @@
1
+ export { getEvals, type Eval as SynbadEval } from "./evals.ts";
2
+ export { ChatMessage as SynbadChatMessage } from "./chat-completion.ts";
package/source/index.ts CHANGED
@@ -1,10 +1,11 @@
1
1
  #!/usr/bin/env node
2
2
  import * as http from "http";
3
3
  import * as https from "https";
4
- import fs from "fs/promises";
5
4
  import path from "path";
6
5
  import { Command } from "@commander-js/extra-typings";
7
6
  import OpenAI from "openai";
7
+ import { ChatMessage, getReasoning } from "./chat-completion.ts";
8
+ import { findTestFiles, evalName } from "./evals.ts";
8
9
 
9
10
  const cli = new Command()
10
11
  .name("synbad")
@@ -21,14 +22,20 @@ cli.command("eval")
21
22
  .option(
22
23
  "--skip-reasoning", "Skip reasoning evals (set this for non-reasoning models)"
23
24
  )
25
+ .option(
26
+ "--reasoning-effort <level>", "Set the reasoning effort to high, medium, or low"
27
+ )
24
28
  .option(
25
29
  "--only <eval path within synbad>", "Specific evals you want to run, e.g. evals/reasoning or evals/tools/claude-dash"
26
30
  )
27
31
  .option(
28
32
  "--count <num times>", "Number of times to run the eval. Any failures count as an overall failure",
29
33
  )
34
+ .option(
35
+ "--stream", "Test streaming API calls",
36
+ )
30
37
  .requiredOption("--model <model name>", "The model name to test")
31
- .action(async ({ model, envVar, baseUrl, only, count }) => {
38
+ .action(async ({ model, envVar, baseUrl, only, count, skipReasoning, reasoningEffort, stream }) => {
32
39
  if(!process.env[envVar]) {
33
40
  console.error(`No env var named ${envVar} exists for the current process`);
34
41
  process.exit(1);
@@ -41,28 +48,126 @@ cli.command("eval")
41
48
  const failures = new Set<string>();
42
49
  const evalPath = only ? path.join(
43
50
  import.meta.dirname, "..", only
44
- ) : path.join(import.meta.dirname, "../evals");
51
+ ) : path.join(import.meta.dirname, "..", "evals");
45
52
  const maxRuns = count == null ? 1 : parseInt(count, 10);
46
- for await(const testFile of findTestFiles(evalPath)) {
53
+ for await(const testFile of findTestFiles(evalPath, !!skipReasoning)) {
47
54
  found++;
48
55
  const test = await import(testFile);
49
56
  const json = test.json;
50
57
  const name = evalName(testFile);
51
58
  process.stdout.write(`Running ${name}...`);
59
+
60
+ async function respond(): Promise<ChatMessage> {
61
+ const reasoning = reasoningEffort == null ? {} : {
62
+ reasoning_effort: reasoningEffort,
63
+ };
64
+ if(!stream) {
65
+ const response = await client.chat.completions.create({
66
+ ...json,
67
+ ...reasoning,
68
+ stream: false,
69
+ model,
70
+ });
71
+ return response.choices[0].message as ChatMessage;
72
+ }
73
+
74
+ const msg: Partial<ChatMessage> = {};
75
+
76
+ const chunkStream = await (client.chat.completions.create({
77
+ ...json,
78
+ ...reasoning,
79
+ model,
80
+ stream: true,
81
+ }) as unknown as Promise<AsyncIterable<OpenAI.ChatCompletionChunk & {
82
+ choices: Array<{
83
+ delta: {
84
+ reasoning?: string,
85
+ reasoning_content?: string,
86
+ },
87
+ }>
88
+ }>>);
89
+
90
+ let lastIndex: number | null = null;
91
+ let toolBuffer: {
92
+ id?: string,
93
+ type: "function",
94
+ index: number,
95
+ function: {
96
+ name?: string,
97
+ arguments?: string,
98
+ },
99
+ } | null = null;
100
+ for await(const chunk of chunkStream) {
101
+ if(!chunk.choices) continue;
102
+ const choice = chunk.choices[0];
103
+ if(!choice) continue;
104
+ const content = choice.delta.content;
105
+ const tools = choice.delta.tool_calls;
106
+ const reasoning = getReasoning(choice.delta);
107
+ if(content) {
108
+ if(!msg.content) msg.content = "";
109
+ msg.content += content;
110
+ }
111
+ if(tools) {
112
+ for(const toolDelta of tools) {
113
+ if(lastIndex == null) lastIndex = toolDelta.index;
114
+ if(lastIndex !== toolDelta.index && toolBuffer != null) {
115
+ msg.tool_calls ||= [];
116
+ // @ts-ignore
117
+ msg.tool_calls.push(toolBuffer);
118
+ toolBuffer = {
119
+ index: toolDelta.index,
120
+ type: "function",
121
+ function: {},
122
+ };
123
+ }
124
+ if(!toolBuffer) {
125
+ toolBuffer = {
126
+ index: toolDelta.index,
127
+ type: "function",
128
+ function: {}
129
+ };
130
+ }
131
+ lastIndex = toolDelta.index;
132
+ if(toolDelta.id) toolBuffer.id = toolDelta.id;
133
+ if(toolDelta.function) {
134
+ if(toolDelta.function.name) {
135
+ toolBuffer.function.name ||= "";
136
+ toolBuffer.function.name += toolDelta.function.name;
137
+ }
138
+ if(toolDelta.function.arguments) {
139
+ toolBuffer.function.arguments ||= "";
140
+ toolBuffer.function.arguments += toolDelta.function.arguments;
141
+ }
142
+ }
143
+ }
144
+ }
145
+ if(reasoning) {
146
+ if(!msg.reasoning_content) msg.reasoning_content = "";
147
+ msg.reasoning_content += reasoning;
148
+ }
149
+ }
150
+
151
+ if(toolBuffer) {
152
+ msg.tool_calls ||= [];
153
+ // @ts-ignore
154
+ msg.tool_calls.push(toolBuffer);
155
+ }
156
+
157
+ return msg as ChatMessage;
158
+ }
159
+
52
160
  try {
53
161
  for(let i = 0; i < maxRuns; i++) {
54
162
  if(maxRuns > 1) {
55
163
  process.stdout.write(` ${i + 1}/${maxRuns}`);
56
164
  }
57
- const response = await client.chat.completions.create({
58
- model,
59
- ...json,
60
- });
165
+ const response = await respond();
61
166
  try {
62
167
  test.test(response);
63
168
  } catch(e) {
64
169
  console.error("Response:");
65
- console.error(JSON.stringify(response.choices[0], null, 2));
170
+ console.error(JSON.stringify(response, null, 2));
66
171
  throw e;
67
172
  }
68
173
  }
@@ -90,6 +195,7 @@ ${passed}/${found} evals passed. Failures:
90
195
  cli.command("proxy")
91
196
  .requiredOption("-p, --port <number>", "Port to listen on")
92
197
  .requiredOption("-t, --target <url>", "Target URL to proxy to")
198
+ .option("--pretty", "Pretty-print the JSON")
93
199
  .action(async (options) => {
94
200
  const port = parseInt(options.port, 10);
95
201
  const targetUrl = new URL(options.target);
@@ -122,6 +228,8 @@ cli.command("proxy")
122
228
  // Choose the right module based on target protocol
123
229
  const httpModule = targetUrl.protocol === "https:" ? https : http;
124
230
 
231
+ const buffer: string[] = [];
232
+
125
233
  // Create proxy request
126
234
  const proxyReq = httpModule.request(
127
235
  {
@@ -177,12 +285,14 @@ cli.command("proxy")
177
285
  });
178
286
 
179
287
  req.on("data", (chunk) => {
180
- process.stdout.write(chunk);
288
+ buffer.push(chunk);
289
+ if(!options.pretty) process.stdout.write(chunk);
181
290
  proxyReq.write(chunk);
182
291
  });
183
292
 
184
293
  req.on("end", () => {
185
- process.stdout.write("\n");
294
+ if(options.pretty) console.log(JSON.stringify(JSON.parse(buffer.join()), null, 2));
295
+ else process.stdout.write("\n");
186
296
  console.log(`[${timestamp}] ✅ Request complete`);
187
297
  proxyReq.end();
188
298
  });
@@ -208,39 +318,6 @@ cli.command("proxy")
208
318
  });
209
319
  });
210
320
 
211
- function evalName(file: string) {
212
- return `${path.basename(path.dirname(file))}/${path.basename(file).replace(/.js$/, "")}`
213
- }
214
-
215
- async function* findTestFiles(dir: string): AsyncGenerator<string> {
216
- try {
217
- await fs.stat(dir);
218
- } catch(e) {
219
- const pathname = `${dir}.js`;
220
- const stat = await fs.stat(pathname);
221
- if(stat.isFile()) {
222
- yield pathname;
223
- return;
224
- }
225
- throw e;
226
- }
227
- const entryNames = await fs.readdir(dir);
228
- const entries = await Promise.all(entryNames.map(async (entry) => {
229
- return {
230
- path: path.join(dir, entry),
231
- stat: await fs.stat(path.join(dir, entry)),
232
- };
233
- }));
234
- for(const entry of entries) {
235
- if(entry.stat.isFile() && entry.path.endsWith(".js")) {
236
- yield entry.path;
237
- }
238
- if(entry.stat.isDirectory()) {
239
- yield* findTestFiles(entry.path);
240
- }
241
- }
242
- }
243
-
244
321
  function stderrLog(item: string, ...items: string[]) {
245
322
  let formatted = item;
246
323
  if(items.length > 0) {