@syntheticlab/synbad 0.0.4 → 0.0.6

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
Files changed (58)
  1. package/README.md +111 -26
  2. package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +2 -2
  3. package/dist/evals/reasoning/multiturn-reasoning-parsing.js +2 -2
  4. package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +2 -2
  5. package/dist/evals/reasoning/reasoning-claude-tool-call.js +1 -2
  6. package/dist/evals/reasoning/reasoning-parsing.d.ts +2 -2
  7. package/dist/evals/reasoning/reasoning-parsing.js +4 -4
  8. package/dist/evals/reasoning/response-in-reasoning.d.ts +45 -0
  9. package/dist/evals/reasoning/response-in-reasoning.js +59 -0
  10. package/dist/evals/tools/claude-dash.d.ts +2 -2
  11. package/dist/evals/tools/claude-dash.js +1 -2
  12. package/dist/evals/tools/crush-list-files.d.ts +2 -5
  13. package/dist/evals/tools/crush-list-files.js +6 -8
  14. package/dist/evals/tools/multi-turn-tools.d.ts +46 -0
  15. package/dist/evals/tools/multi-turn-tools.js +100 -0
  16. package/dist/evals/tools/no-fn-args.d.ts +22 -0
  17. package/dist/evals/tools/no-fn-args.js +31 -0
  18. package/dist/evals/tools/octo-list-no-optional-args.d.ts +209 -0
  19. package/dist/evals/tools/octo-list-no-optional-args.js +73 -0
  20. package/dist/evals/tools/parallel-tool.d.ts +2 -2
  21. package/dist/evals/tools/parallel-tool.js +1 -2
  22. package/dist/evals/tools/simple-tool.d.ts +2 -2
  23. package/dist/evals/tools/simple-tool.js +3 -2
  24. package/dist/evals/tools/tool-dash-underscore.d.ts +26 -0
  25. package/dist/evals/tools/tool-dash-underscore.js +37 -0
  26. package/dist/evals/tools/tool-path-corruption.d.ts +26 -0
  27. package/dist/evals/tools/tool-path-corruption.js +41 -0
  28. package/dist/source/asserts.d.ts +4 -1
  29. package/dist/source/asserts.js +36 -0
  30. package/dist/source/chat-completion.d.ts +5 -0
  31. package/dist/source/chat-completion.js +1 -0
  32. package/dist/source/evals.d.ts +9 -0
  33. package/dist/source/evals.js +53 -0
  34. package/dist/source/evals.test.d.ts +1 -0
  35. package/dist/source/evals.test.js +12 -0
  36. package/dist/source/exports.d.ts +2 -0
  37. package/dist/source/exports.js +1 -0
  38. package/dist/source/index.js +204 -38
  39. package/evals/reasoning/multiturn-reasoning-parsing.ts +3 -3
  40. package/evals/reasoning/reasoning-claude-tool-call.ts +2 -3
  41. package/evals/reasoning/reasoning-parsing.ts +5 -5
  42. package/evals/reasoning/response-in-reasoning.ts +65 -0
  43. package/evals/tools/claude-dash.ts +2 -3
  44. package/evals/tools/crush-list-files.ts +11 -13
  45. package/evals/tools/multi-turn-tools.ts +104 -0
  46. package/evals/tools/no-fn-args.ts +34 -0
  47. package/evals/tools/octo-list-no-optional-args.ts +81 -0
  48. package/evals/tools/parallel-tool.ts +2 -3
  49. package/evals/tools/simple-tool.ts +4 -3
  50. package/evals/tools/tool-dash-underscore.ts +40 -0
  51. package/evals/tools/tool-path-corruption.ts +46 -0
  52. package/package.json +10 -3
  53. package/source/asserts.ts +37 -1
  54. package/source/chat-completion.ts +6 -0
  55. package/source/evals.test.ts +13 -0
  56. package/source/evals.ts +56 -0
  57. package/source/exports.ts +2 -0
  58. package/source/index.ts +246 -38
package/source/index.ts CHANGED
@@ -1,8 +1,11 @@
1
1
  #!/usr/bin/env node
2
- import { Command } from "@commander-js/extra-typings";
3
- import fs from "fs/promises";
2
+ import * as http from "http";
3
+ import * as https from "https";
4
4
  import path from "path";
5
+ import { Command } from "@commander-js/extra-typings";
5
6
  import OpenAI from "openai";
7
+ import { ChatMessage, getReasoning } from "./chat-completion.ts";
8
+ import { findTestFiles, evalName } from "./evals.ts";
6
9
 
7
10
  const cli = new Command()
8
11
  .name("synbad")
@@ -19,14 +22,20 @@ cli.command("eval")
19
22
  .option(
20
23
  "--skip-reasoning", "Skip reasoning evals (set this for non-reasoning models)"
21
24
  )
25
+ .option(
26
+ "--reasoning-effort <level>", "Set the reasoning effort to high, medium, or low"
27
+ )
22
28
  .option(
23
29
  "--only <eval path within synbad>", "Specific evals you want to run, e.g. evals/reasoning or evals/tools/claude-dash"
24
30
  )
25
31
  .option(
26
32
  "--count <num times>", "Number of times to run the eval. Any failures count as an overall failure",
27
33
  )
34
+ .option(
35
+ "--stream", "Test streaming API calls",
36
+ )
28
37
  .requiredOption("--model <model name>", "The model name to test")
29
- .action(async ({ model, envVar, baseUrl, only, count }) => {
38
+ .action(async ({ model, envVar, baseUrl, only, count, skipReasoning, reasoningEffort, stream }) => {
30
39
  if(!process.env[envVar]) {
31
40
  console.error(`No env var named ${envVar} exists for the current process`);
32
41
  process.exit(1);
@@ -39,28 +48,126 @@ cli.command("eval")
39
48
  const failures = new Set<string>();
40
49
  const evalPath = only ? path.join(
41
50
  import.meta.dirname, "..", only
42
- ) : path.join(import.meta.dirname, "../evals");
51
+ ) : path.join(import.meta.dirname, "..", "evals");
43
52
  const maxRuns = count == null ? 1 : parseInt(count, 10);
44
- for await(const testFile of findTestFiles(evalPath)) {
53
+ for await(const testFile of findTestFiles(evalPath, !!skipReasoning)) {
45
54
  found++;
46
55
  const test = await import(testFile);
47
56
  const json = test.json;
48
57
  const name = evalName(testFile);
49
58
  process.stdout.write(`Running ${name}...`);
59
+
60
// respond(): performs one chat-completion call for the current eval and returns
// the assistant message that is handed to `test.test(...)`.
//  - Without `--stream`, the first choice's message from the response is returned.
//  - With `--stream`, the chunks are folded into a single message object
//    (content, tool_calls, reasoning_content) built up locally.
// Reads `json`, `model`, `stream`, `reasoningEffort` and `client` from the
// enclosing action's scope.
+ async function respond(): Promise<ChatMessage> {
// Only add `reasoning_effort` to the request when the CLI flag was supplied.
61
+ const reasoning = reasoningEffort == null ? {} : {
62
+ reasoning_effort: reasoningEffort,
63
+ };
64
+ if(!stream) {
// `stream` and `model` come after the `...json` spread, so they override any
// same-named keys carried by the eval's request body.
65
+ const response = await client.chat.completions.create({
66
+ ...json,
67
+ ...reasoning,
68
+ stream: false,
69
+ model,
70
+ });
71
+ return response.choices[0].message as ChatMessage;
72
+ }
73
+
// Streaming path: accumulate the deltas into this partial message.
// NOTE(review): nothing in this function assigns `msg.role`, so the streamed
// message has no role field — confirm the evals do not depend on it.
74
+ const msg: Partial<ChatMessage> = {};
75
+
// The cast widens the SDK chunk type so that `delta` may also carry the
// `reasoning` / `reasoning_content` string fields declared below.
76
+ const chunkStream = await (client.chat.completions.create({
77
+ ...json,
78
+ ...reasoning,
79
+ model,
80
+ stream: true,
81
+ }) as unknown as Promise<AsyncIterable<OpenAI.ChatCompletionChunk & {
82
+ choices: Array<{
83
+ delta: {
84
+ reasoning?: string,
85
+ reasoning_content?: string,
86
+ },
87
+ }>
88
+ }>>);
89
+
// `toolBuffer` holds the single tool call currently being assembled and
// `lastIndex` is the index of the most recent tool delta seen.
90
+ let lastIndex: number | null = null;
91
+ let toolBuffer: {
92
+ id?: string,
93
+ type: "function",
94
+ index: number,
95
+ function: {
96
+ name?: string,
97
+ arguments?: string,
98
+ },
99
+ } | null = null;
100
+ for await(const chunk of chunkStream) {
// Skip chunks that carry no choices; only the first choice is consumed.
101
+ if(!chunk.choices) continue;
102
+ const choice = chunk.choices[0];
103
+ if(!choice) continue;
104
+ const content = choice.delta.content;
105
+ const tools = choice.delta.tool_calls;
// NOTE(review): this `reasoning` shadows the outer request-options `reasoning`.
// getReasoning is imported from ./chat-completion.ts; presumably it reads
// `reasoning` / `reasoning_content` off the delta — verify against that module.
106
+ const reasoning = getReasoning(choice.delta);
107
+ if(content) {
108
+ if(!msg.content) msg.content = "";
109
+ msg.content += content;
110
+ }
111
+ if(tools) {
112
+ for(const toolDelta of tools) {
113
+ if(lastIndex == null) lastIndex = toolDelta.index;
// A change of index means the previous tool call is finished: flush it and
// start a fresh buffer. This relies on all deltas for one index arriving
// contiguously; interleaved indexes would be split into separate entries.
114
+ if(lastIndex !== toolDelta.index && toolBuffer != null) {
115
+ msg.tool_calls ||= [];
// NOTE(review): the pushed object still carries the `index` field and the
// @ts-ignore suppresses whatever type mismatch exists with ChatMessage.
116
+ // @ts-ignore
117
+ msg.tool_calls.push(toolBuffer);
118
+ toolBuffer = {
119
+ index: toolDelta.index,
120
+ type: "function",
121
+ function: {},
122
+ };
123
+ }
// First tool delta of the stream: create the initial buffer.
124
+ if(!toolBuffer) {
125
+ toolBuffer = {
126
+ index: toolDelta.index,
127
+ type: "function",
128
+ function: {}
129
+ };
130
+ }
131
+ lastIndex = toolDelta.index;
132
+ if(toolDelta.id) toolBuffer.id = toolDelta.id;
// Name and argument fragments are concatenated in arrival order.
133
+ if(toolDelta.function) {
134
+ if(toolDelta.function.name) {
135
+ toolBuffer.function.name ||= "";
136
+ toolBuffer.function.name += toolDelta.function.name;
137
+ }
138
+ if(toolDelta.function.arguments) {
139
+ toolBuffer.function.arguments ||= "";
140
+ toolBuffer.function.arguments += toolDelta.function.arguments;
141
+ }
142
+ }
143
+ }
144
+ }
// Reasoning text is always stored under `reasoning_content` on the result.
145
+ if(reasoning) {
146
+ if(!msg.reasoning_content) msg.reasoning_content = "";
147
+ msg.reasoning_content += reasoning;
148
+ }
149
+ }
150
+
// Flush the tool call that was still being assembled when the stream ended.
151
+ if(toolBuffer) {
152
+ msg.tool_calls ||= [];
153
+ // @ts-ignore
154
+ msg.tool_calls.push(toolBuffer);
155
+ }
156
+
157
+ return msg as ChatMessage;
158
+ }
159
+
50
160
  try {
51
161
  for(let i = 0; i < maxRuns; i++) {
52
162
  if(maxRuns > 1) {
53
163
  process.stdout.write(` ${i + 1}/${maxRuns}`);
54
164
  }
55
- const response = await client.chat.completions.create({
56
- model,
57
- ...json,
58
- });
165
+ const response = await respond();
59
166
  try {
60
167
  test.test(response);
61
168
  } catch(e) {
62
169
  console.error("Response:");
63
- console.error(JSON.stringify(response.choices[0], null, 2));
170
+ console.error(JSON.stringify(response, null, 2));
64
171
  throw e;
65
172
  }
66
173
  }
@@ -85,37 +192,138 @@ ${passed}/${found} evals passed. Failures:
85
192
  `.trim());
86
193
  });
87
194
 
88
- function evalName(file: string) {
89
- return `${path.basename(path.dirname(file))}/${path.basename(file).replace(/.js$/, "")}`
90
- }
195
// `synbad proxy`: starts a local HTTP server that forwards every incoming
// request to the target URL, relays the upstream response back to the caller,
// and dumps the request body to stdout (raw, or pretty-printed with --pretty).
// Status messages are written with stderrLog / console.error.
// The lines prefixed with `-` in this span are the removed findTestFiles
// implementation shown by the diff; they are not part of the proxy command.
+ cli.command("proxy")
196
+ .requiredOption("-p, --port <number>", "Port to listen on")
197
+ .requiredOption("-t, --target <url>", "Target URL to proxy to")
198
+ .option("--pretty", "Pretty-print the JSON")
199
+ .action(async (options) => {
// NOTE(review): the parsed port is not checked for NaN before server.listen.
200
+ const port = parseInt(options.port, 10);
201
+ const targetUrl = new URL(options.target);
91
202
 
92
- async function* findTestFiles(dir: string): AsyncGenerator<string> {
93
- try {
94
- await fs.stat(dir);
95
- } catch(e) {
96
- const pathname = `${dir}.js`;
97
- const stat = await fs.stat(pathname);
98
- if(stat.isFile()) {
99
- yield pathname;
100
- return;
101
- }
102
- throw e;
103
- }
104
- const entryNames = await fs.readdir(dir);
105
- const entries = await Promise.all(entryNames.map(async (entry) => {
106
- return {
107
- path: path.join(dir, entry),
108
- stat: await fs.stat(path.join(dir, entry)),
109
- };
110
- }));
111
- for(const entry of entries) {
112
- if(entry.stat.isFile() && entry.path.endsWith(".js")) {
113
- yield entry.path;
114
- }
115
- if(entry.stat.isDirectory()) {
116
- yield* findTestFiles(entry.path);
203
+ stderrLog(`🚀 Starting proxy on port ${port}`);
204
+ stderrLog(`📯 Proxying to: ${targetUrl.origin}`);
205
+
// Per-request handler. The try/catch below only covers exceptions thrown
// synchronously while the forwarding is being set up; errors thrown later
// inside the event callbacks are not caught by it.
206
+ const server = http.createServer(async (req, res) => {
207
+ try {
208
+ const timestamp = new Date().toISOString();
209
+
210
+ // Log request metadata
211
+ stderrLog(`\n[${timestamp}] 📥 ${req.method} ${req.url}`);
212
+
// The upstream path is the target's own pathname (minus one trailing slash)
// followed by the incoming request URL, query string included.
213
+ // Construct target URL - handle target path correctly
214
+ const incomingPath = req.url || "";
215
+ const targetBasePath = targetUrl.pathname.replace(/\/$/, ''); // Remove trailing slash
216
+ const targetPath = targetBasePath + incomingPath;
// `target` is only used for the log line below; the request itself is built
// from hostname/port/path.
217
+ const target = `${targetUrl.origin}${targetPath}`;
218
+
219
+ // Prepare request headers (remove problematic ones)
220
+ const requestHeaders = { ...req.headers };
221
+ delete requestHeaders["host"];
222
+ delete requestHeaders["content-length"];
223
+ delete requestHeaders["transfer-encoding"];
224
+
225
+ stderrLog(`[${timestamp}] ➡️ Forwarding to: ${target}`);
226
+ stderrLog(`[${timestamp}] 📦 Writing request data to stdout...`);
227
+
228
+ // Choose the right module based on target protocol
229
+ const httpModule = targetUrl.protocol === "https:" ? https : http;
230
+
// Collects request body chunks for the --pretty output.
// NOTE(review): declared as string[], but no setEncoding call is visible on
// `req`, so the chunks pushed below are presumably Buffers — confirm.
231
+ const buffer: string[] = [];
232
+
233
+ // Create proxy request
234
+ const proxyReq = httpModule.request(
235
+ {
236
+ hostname: targetUrl.hostname,
237
+ port: targetUrl.port || (targetUrl.protocol === "https:" ? 443 : 80),
238
+ path: targetPath,
239
+ method: req.method,
240
+ headers: requestHeaders,
241
+ },
242
+ (proxyRes) => {
243
+ // Log response status and headers
244
+ stderrLog(
245
+ `[${timestamp}] 📤 Response to ${req.url}: ${proxyRes.statusCode} ${proxyRes.statusMessage}`
246
+ );
247
+ stderrLog(`[${timestamp}] 📦 Loading response...`);
248
+
249
+ // Filter problematic response headers
250
+ const responseHeaders = { ...proxyRes.headers };
251
+ delete responseHeaders["transfer-encoding"];
252
+ delete responseHeaders["content-length"];
253
+
254
+ res.writeHead(proxyRes.statusCode || 200, responseHeaders);
255
+
// NOTE(review): the return value of res.write is ignored, so a slow client
// does not apply backpressure to the upstream response.
256
+ // Stream response data immediately to client
257
+ proxyRes.on("data", (chunk) => {
258
+ res.write(chunk);
259
+ });
260
+
261
+ proxyRes.on("end", () => {
262
+ stderrLog(`[${timestamp}] ✅ Response complete`);
263
+ res.end();
264
+ });
265
+ }
266
+ );
267
+
// NOTE(review): when the upstream fails after headers were already sent,
// this handler only logs; the client response is not ended here.
268
+ // Handle proxy request errors
269
+ proxyReq.on("error", (e) => {
270
+ console.error(`[${timestamp}] ❌ Proxy request error:`, e);
271
+ if (!res.headersSent) {
272
+ res.writeHead(500, { "Content-Type": "application/json" });
273
+ res.end(JSON.stringify({ error: "Proxy error", message: e.message }));
274
+ }
275
+ });
276
+
277
+ // Handle client request errors
278
+ req.on("error", (e) => {
279
+ console.error(`[${timestamp}] ❌ Client request error:`, e);
280
+ proxyReq.destroy();
281
+ if (!res.headersSent) {
282
+ res.writeHead(400, { "Content-Type": "application/json" });
283
+ res.end(JSON.stringify({ error: "Client error", message: e.message }));
284
+ }
285
+ });
286
+
// Each body chunk is remembered, echoed raw to stdout unless --pretty is set,
// and forwarded upstream as it arrives.
287
+ req.on("data", (chunk) => {
288
+ buffer.push(chunk);
289
+ if(!options.pretty) process.stdout.write(chunk);
290
+ proxyReq.write(chunk);
291
+ });
292
+
// NOTE(review): Array.prototype.join() with no argument separates elements
// with ",", so a body that arrived in more than one chunk is re-assembled
// with commas inserted between the chunks.
// NOTE(review): JSON.parse throws on an empty or non-JSON body; the throw
// happens in this event callback, outside the try/catch, and before
// proxyReq.end() is reached.
// NOTE(review): the "Request complete" line goes to stdout via console.log,
// whereas the other status lines in this command go to stderr.
293
+ req.on("end", () => {
294
+ if(options.pretty) console.log(JSON.stringify(JSON.parse(buffer.join()), null, 2));
295
+ else process.stdout.write("\n");
296
+ console.log(`[${timestamp}] ✅ Request complete`);
297
+ proxyReq.end();
298
+ });
299
+
300
+ } catch (e) {
301
+ const timestamp = new Date().toISOString();
302
+ console.error(`[${timestamp}] ❌ Server error:`, e);
303
+ if (!res.headersSent) {
304
+ res.writeHead(500, { "Content-Type": "application/json" });
305
+ res.end(JSON.stringify({ error: "Server error", message: (e as Error).message }));
306
+ }
117
307
  }
308
+ });
309
+
// Server-level errors are only logged; nothing here exits the process.
310
+ server.on("error", (e) => {
311
+ console.error("❌ Server error:", e);
312
+ });
313
+
314
+ server.listen(port, () => {
315
+ stderrLog(`✅ Server listening on http://localhost:${port}`);
316
+ stderrLog(`📡 All HTTP request data will be logged to stdout`);
317
+ stderrLog("🤓 Terminal UI messages (such as this one) will be logged to stderr");
318
+ });
319
+ });
320
+
321
// Writes one line to stderr: `item`, followed by any extra `items` joined with
// single spaces, terminated by a newline. The proxy command uses it for status
// output.
+ function stderrLog(item: string, ...items: string[]) {
322
+ let formatted = item;
// Only append the separator when there is something to append.
323
+ if(items.length > 0) {
324
+ formatted += " " + items.join(" ");
118
325
  }
326
+ process.stderr.write(formatted + "\n");
119
327
  }
120
328
 
121
329
  cli.parse();