@syntheticlab/synbad 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +111 -26
- package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +2 -2
- package/dist/evals/reasoning/multiturn-reasoning-parsing.js +2 -2
- package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +2 -2
- package/dist/evals/reasoning/reasoning-claude-tool-call.js +1 -2
- package/dist/evals/reasoning/reasoning-parsing.d.ts +2 -2
- package/dist/evals/reasoning/reasoning-parsing.js +4 -4
- package/dist/evals/reasoning/response-in-reasoning.d.ts +45 -0
- package/dist/evals/reasoning/response-in-reasoning.js +59 -0
- package/dist/evals/tools/claude-dash.d.ts +2 -2
- package/dist/evals/tools/claude-dash.js +1 -2
- package/dist/evals/tools/crush-list-files.d.ts +2 -5
- package/dist/evals/tools/crush-list-files.js +6 -8
- package/dist/evals/tools/multi-turn-tools.d.ts +46 -0
- package/dist/evals/tools/multi-turn-tools.js +100 -0
- package/dist/evals/tools/no-fn-args.d.ts +22 -0
- package/dist/evals/tools/no-fn-args.js +31 -0
- package/dist/evals/tools/octo-list-no-optional-args.d.ts +209 -0
- package/dist/evals/tools/octo-list-no-optional-args.js +73 -0
- package/dist/evals/tools/parallel-tool.d.ts +2 -2
- package/dist/evals/tools/parallel-tool.js +1 -2
- package/dist/evals/tools/simple-tool.d.ts +2 -2
- package/dist/evals/tools/simple-tool.js +3 -2
- package/dist/evals/tools/tool-dash-underscore.d.ts +26 -0
- package/dist/evals/tools/tool-dash-underscore.js +37 -0
- package/dist/evals/tools/tool-path-corruption.d.ts +26 -0
- package/dist/evals/tools/tool-path-corruption.js +41 -0
- package/dist/source/asserts.d.ts +4 -1
- package/dist/source/asserts.js +36 -0
- package/dist/source/chat-completion.d.ts +5 -0
- package/dist/source/chat-completion.js +1 -0
- package/dist/source/evals.d.ts +9 -0
- package/dist/source/evals.js +53 -0
- package/dist/source/evals.test.d.ts +1 -0
- package/dist/source/evals.test.js +12 -0
- package/dist/source/exports.d.ts +2 -0
- package/dist/source/exports.js +1 -0
- package/dist/source/index.js +204 -38
- package/evals/reasoning/multiturn-reasoning-parsing.ts +3 -3
- package/evals/reasoning/reasoning-claude-tool-call.ts +2 -3
- package/evals/reasoning/reasoning-parsing.ts +5 -5
- package/evals/reasoning/response-in-reasoning.ts +65 -0
- package/evals/tools/claude-dash.ts +2 -3
- package/evals/tools/crush-list-files.ts +11 -13
- package/evals/tools/multi-turn-tools.ts +104 -0
- package/evals/tools/no-fn-args.ts +34 -0
- package/evals/tools/octo-list-no-optional-args.ts +81 -0
- package/evals/tools/parallel-tool.ts +2 -3
- package/evals/tools/simple-tool.ts +4 -3
- package/evals/tools/tool-dash-underscore.ts +40 -0
- package/evals/tools/tool-path-corruption.ts +46 -0
- package/package.json +10 -3
- package/source/asserts.ts +37 -1
- package/source/chat-completion.ts +6 -0
- package/source/evals.test.ts +13 -0
- package/source/evals.ts +56 -0
- package/source/exports.ts +2 -0
- package/source/index.ts +246 -38
package/source/index.ts
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import
|
|
3
|
-
import
|
|
2
|
+
import * as http from "http";
|
|
3
|
+
import * as https from "https";
|
|
4
4
|
import path from "path";
|
|
5
|
+
import { Command } from "@commander-js/extra-typings";
|
|
5
6
|
import OpenAI from "openai";
|
|
7
|
+
import { ChatMessage, getReasoning } from "./chat-completion.ts";
|
|
8
|
+
import { findTestFiles, evalName } from "./evals.ts";
|
|
6
9
|
|
|
7
10
|
const cli = new Command()
|
|
8
11
|
.name("synbad")
|
|
@@ -19,14 +22,20 @@ cli.command("eval")
|
|
|
19
22
|
.option(
|
|
20
23
|
"--skip-reasoning", "Skip reasoning evals (set this for non-reasoning models)"
|
|
21
24
|
)
|
|
25
|
+
.option(
|
|
26
|
+
"--reasoning-effort <level>", "Set the reasoning effort to high, medium, or low"
|
|
27
|
+
)
|
|
22
28
|
.option(
|
|
23
29
|
"--only <eval path within synbad>", "Specific evals you want to run, e.g. evals/reasoning or evals/tools/claude-dash"
|
|
24
30
|
)
|
|
25
31
|
.option(
|
|
26
32
|
"--count <num times>", "Number of times to run the eval. Any failures count as an overall failure",
|
|
27
33
|
)
|
|
34
|
+
.option(
|
|
35
|
+
"--stream", "Test streaming API calls",
|
|
36
|
+
)
|
|
28
37
|
.requiredOption("--model <model name>", "The model name to test")
|
|
29
|
-
.action(async ({ model, envVar, baseUrl, only, count }) => {
|
|
38
|
+
.action(async ({ model, envVar, baseUrl, only, count, skipReasoning, reasoningEffort, stream }) => {
|
|
30
39
|
if(!process.env[envVar]) {
|
|
31
40
|
console.error(`No env var named ${envVar} exists for the current process`);
|
|
32
41
|
process.exit(1);
|
|
@@ -39,28 +48,126 @@ cli.command("eval")
|
|
|
39
48
|
const failures = new Set<string>();
|
|
40
49
|
const evalPath = only ? path.join(
|
|
41
50
|
import.meta.dirname, "..", only
|
|
42
|
-
) : path.join(import.meta.dirname, "
|
|
51
|
+
) : path.join(import.meta.dirname, "..", "evals");
|
|
43
52
|
const maxRuns = count == null ? 1 : parseInt(count, 10);
|
|
44
|
-
for await(const testFile of findTestFiles(evalPath)) {
|
|
53
|
+
for await(const testFile of findTestFiles(evalPath, !!skipReasoning)) {
|
|
45
54
|
found++;
|
|
46
55
|
const test = await import(testFile);
|
|
47
56
|
const json = test.json;
|
|
48
57
|
const name = evalName(testFile);
|
|
49
58
|
process.stdout.write(`Running ${name}...`);
|
|
59
|
+
|
|
60
|
+
/**
 * Sends the current eval's request to the model and returns the assistant
 * message that the eval's `test` function will inspect.
 *
 * Without `stream`, a single non-streaming chat completion is issued and the
 * first choice's message is returned unchanged.
 *
 * With `stream`, the chunk stream is consumed and one message is rebuilt from
 * the deltas: text content, reasoning text (as extracted by `getReasoning`),
 * and tool calls. Tool-call deltas are accumulated per `index`, so fragments of
 * different tool calls are merged into the right call even when a provider
 * interleaves them; calls are emitted in the order their index first appeared.
 */
async function respond(): Promise<ChatMessage> {
  // Only send `reasoning_effort` when the user asked for a specific level.
  const reasoning = reasoningEffort == null ? {} : {
    reasoning_effort: reasoningEffort,
  };
  if(!stream) {
    const response = await client.chat.completions.create({
      ...json,
      ...reasoning,
      stream: false,
      model,
    });
    return response.choices[0].message as ChatMessage;
  }

  // Shape of a tool call while it is still being assembled from deltas.
  type ToolCallBuffer = {
    id?: string,
    type: "function",
    index: number,
    function: {
      name?: string,
      arguments?: string,
    },
  };

  const msg: Partial<ChatMessage> = {};

  // The delta type is widened with the two reasoning field names that
  // `getReasoning` is handed below; the SDK's own chunk type lacks them.
  const chunkStream = await (client.chat.completions.create({
    ...json,
    ...reasoning,
    model,
    stream: true,
  }) as unknown as Promise<AsyncIterable<OpenAI.ChatCompletionChunk & {
    choices: Array<{
      delta: {
        reasoning?: string,
        reasoning_content?: string,
      },
    }>
  }>>);

  // Keyed by tool-call index; Map iteration preserves first-seen order.
  const toolBuffers = new Map<number, ToolCallBuffer>();

  for await(const chunk of chunkStream) {
    if(!chunk.choices) continue;
    const choice = chunk.choices[0];
    if(!choice) continue;

    const content = choice.delta.content;
    const tools = choice.delta.tool_calls;
    const reasoningDelta = getReasoning(choice.delta);

    if(content) {
      if(!msg.content) msg.content = "";
      msg.content += content;
    }

    if(tools) {
      for(const toolDelta of tools) {
        let buffered = toolBuffers.get(toolDelta.index);
        if(!buffered) {
          buffered = {
            index: toolDelta.index,
            type: "function",
            function: {},
          };
          toolBuffers.set(toolDelta.index, buffered);
        }
        if(toolDelta.id) buffered.id = toolDelta.id;
        if(toolDelta.function) {
          if(toolDelta.function.name) {
            buffered.function.name ||= "";
            buffered.function.name += toolDelta.function.name;
          }
          if(toolDelta.function.arguments) {
            buffered.function.arguments ||= "";
            buffered.function.arguments += toolDelta.function.arguments;
          }
        }
      }
    }

    if(reasoningDelta) {
      if(!msg.reasoning_content) msg.reasoning_content = "";
      msg.reasoning_content += reasoningDelta;
    }
  }

  if(toolBuffers.size > 0) {
    // The buffered shape keeps `id`/`name`/`arguments` optional (a provider may
    // never send them), which is looser than the message's tool-call type.
    // @ts-ignore
    msg.tool_calls = [...toolBuffers.values()];
  }

  return msg as ChatMessage;
}
|
|
159
|
+
|
|
50
160
|
try {
|
|
51
161
|
for(let i = 0; i < maxRuns; i++) {
|
|
52
162
|
if(maxRuns > 1) {
|
|
53
163
|
process.stdout.write(` ${i + 1}/${maxRuns}`);
|
|
54
164
|
}
|
|
55
|
-
const response = await
|
|
56
|
-
model,
|
|
57
|
-
...json,
|
|
58
|
-
});
|
|
165
|
+
const response = await respond();
|
|
59
166
|
try {
|
|
60
167
|
test.test(response);
|
|
61
168
|
} catch(e) {
|
|
62
169
|
console.error("Response:");
|
|
63
|
-
console.error(JSON.stringify(response
|
|
170
|
+
console.error(JSON.stringify(response, null, 2));
|
|
64
171
|
throw e;
|
|
65
172
|
}
|
|
66
173
|
}
|
|
@@ -85,37 +192,138 @@ ${passed}/${found} evals passed. Failures:
|
|
|
85
192
|
`.trim());
|
|
86
193
|
});
|
|
87
194
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
195
|
+
// `proxy` subcommand: a logging pass-through HTTP proxy.
//
// Listens on --port and forwards every incoming request to --target (the
// target's path is used as a prefix for the incoming path). Request bodies are
// written to stdout -- raw as they arrive, or re-indented as JSON once complete
// when --pretty is set -- while status messages go to stderr, so stdout can be
// redirected without UI noise. Responses are relayed to the client chunk by
// chunk as they arrive.
cli.command("proxy")
  .requiredOption("-p, --port <number>", "Port to listen on")
  .requiredOption("-t, --target <url>", "Target URL to proxy to")
  .option("--pretty", "Pretty-print the JSON")
  .action(async (options) => {
    const port = parseInt(options.port, 10);
    const targetUrl = new URL(options.target);

    stderrLog(`🚀 Starting proxy on port ${port}`);
    stderrLog(`📯 Proxying to: ${targetUrl.origin}`);

    const server = http.createServer(async (req, res) => {
      try {
        const timestamp = new Date().toISOString();

        // Log request metadata
        stderrLog(`\n[${timestamp}] 📥 ${req.method} ${req.url}`);

        // Construct target URL - handle target path correctly
        const incomingPath = req.url || "";
        const targetBasePath = targetUrl.pathname.replace(/\/$/, ''); // Remove trailing slash
        const targetPath = targetBasePath + incomingPath;
        const target = `${targetUrl.origin}${targetPath}`;

        // Prepare request headers (remove problematic ones): `host` must match
        // the target, and the body framing headers are recomputed by Node for
        // the outgoing request.
        const requestHeaders = { ...req.headers };
        delete requestHeaders["host"];
        delete requestHeaders["content-length"];
        delete requestHeaders["transfer-encoding"];

        stderrLog(`[${timestamp}] ➡️  Forwarding to: ${target}`);
        stderrLog(`[${timestamp}] 📦 Writing request data to stdout...`);

        // Choose the right module based on target protocol
        const httpModule = targetUrl.protocol === "https:" ? https : http;

        // Raw request body chunks, kept only so --pretty can print the whole
        // body once it has fully arrived.
        const chunks: Buffer[] = [];

        // Create proxy request
        const proxyReq = httpModule.request(
          {
            hostname: targetUrl.hostname,
            port: targetUrl.port || (targetUrl.protocol === "https:" ? 443 : 80),
            path: targetPath,
            method: req.method,
            headers: requestHeaders,
          },
          (proxyRes) => {
            // Log response status and headers
            stderrLog(
              `[${timestamp}] 📤 Response to ${req.url}: ${proxyRes.statusCode} ${proxyRes.statusMessage}`
            );
            stderrLog(`[${timestamp}] 📦 Loading response...`);

            // Filter problematic response headers
            const responseHeaders = { ...proxyRes.headers };
            delete responseHeaders["transfer-encoding"];
            delete responseHeaders["content-length"];

            res.writeHead(proxyRes.statusCode || 200, responseHeaders);

            // Stream response data immediately to client
            proxyRes.on("data", (chunk) => {
              res.write(chunk);
            });

            proxyRes.on("end", () => {
              stderrLog(`[${timestamp}] ✅ Response complete`);
              res.end();
            });
          }
        );

        // Handle proxy request errors
        proxyReq.on("error", (e) => {
          console.error(`[${timestamp}] ❌ Proxy request error:`, e);
          if (!res.headersSent) {
            res.writeHead(500, { "Content-Type": "application/json" });
            res.end(JSON.stringify({ error: "Proxy error", message: e.message }));
          }
        });

        // Handle client request errors
        req.on("error", (e) => {
          console.error(`[${timestamp}] ❌ Client request error:`, e);
          proxyReq.destroy();
          if (!res.headersSent) {
            res.writeHead(400, { "Content-Type": "application/json" });
            res.end(JSON.stringify({ error: "Client error", message: e.message }));
          }
        });

        req.on("data", (chunk) => {
          chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
          if(!options.pretty) process.stdout.write(chunk);
          proxyReq.write(chunk);
        });

        req.on("end", () => {
          if(options.pretty) {
            // Concatenate the bytes before decoding: joining chunks with the
            // default "," separator would corrupt bodies that arrive in
            // several chunks, and per-chunk decoding can split a multi-byte
            // character.
            const body = Buffer.concat(chunks).toString("utf8");
            let output = body;
            try {
              output = JSON.stringify(JSON.parse(body), null, 2);
            } catch {
              // Empty or non-JSON body (e.g. a GET): print it unmodified. A
              // throw here would escape this event callback, crash the
              // process, and leave the upstream request unfinished.
            }
            console.log(output);
          }
          else process.stdout.write("\n");
          // Status line, not request data: keep it off stdout.
          stderrLog(`[${timestamp}] ✅ Request complete`);
          proxyReq.end();
        });

      } catch (e) {
        const timestamp = new Date().toISOString();
        console.error(`[${timestamp}] ❌ Server error:`, e);
        if (!res.headersSent) {
          res.writeHead(500, { "Content-Type": "application/json" });
          res.end(JSON.stringify({ error: "Server error", message: (e as Error).message }));
        }
      }
    });

    server.on("error", (e) => {
      console.error("❌ Server error:", e);
    });

    server.listen(port, () => {
      stderrLog(`✅ Server listening on http://localhost:${port}`);
      stderrLog(`📡 All HTTP request data will be logged to stdout`);
      stderrLog("🤓 Terminal UI messages (such as this one) will be logged to stderr");
    });
  });
|
|
320
|
+
|
|
321
|
+
/**
 * Writes a single line to stderr.
 *
 * @param item - First piece of the message.
 * @param items - Further pieces, appended after `item` with single spaces
 *   between them.
 */
function stderrLog(item: string, ...items: string[]) {
  const line = [item, ...items].join(" ");
  process.stderr.write(`${line}\n`);
}
|
|
120
328
|
|
|
121
329
|
cli.parse();
|