@syntheticlab/synbad 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -0
- package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +13 -0
- package/dist/evals/reasoning/multiturn-reasoning-parsing.js +16 -0
- package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +46 -0
- package/dist/evals/reasoning/reasoning-claude-tool-call.js +60 -0
- package/dist/evals/reasoning/reasoning-parsing.d.ts +8 -0
- package/dist/evals/reasoning/reasoning-parsing.js +10 -0
- package/dist/evals/tools/claude-dash.d.ts +1249 -0
- package/dist/evals/tools/claude-dash.js +637 -0
- package/dist/evals/tools/crush-list-files.d.ts +676 -0
- package/dist/evals/tools/crush-list-files.js +441 -0
- package/dist/evals/tools/parallel-tool.d.ts +27 -0
- package/dist/evals/tools/parallel-tool.js +33 -0
- package/dist/evals/tools/simple-tool.d.ts +27 -0
- package/dist/evals/tools/simple-tool.js +37 -0
- package/dist/source/asserts.d.ts +6 -0
- package/dist/source/asserts.js +54 -0
- package/dist/source/chat-completion.d.ts +117 -0
- package/dist/source/chat-completion.js +113 -0
- package/dist/source/index.d.ts +2 -0
- package/dist/source/index.js +107 -0
- package/evals/reasoning/multiturn-reasoning-parsing.ts +19 -0
- package/evals/reasoning/reasoning-claude-tool-call.ts +63 -0
- package/evals/reasoning/reasoning-parsing.ts +13 -0
- package/evals/tools/claude-dash.ts +640 -0
- package/evals/tools/crush-list-files.ts +451 -0
- package/evals/tools/parallel-tool.ts +36 -0
- package/evals/tools/simple-tool.ts +40 -0
- package/package.json +37 -0
- package/source/asserts.ts +70 -0
- package/source/chat-completion.ts +140 -0
- package/source/index.ts +115 -0
- package/tsconfig.json +32 -0
package/README.md
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+

|
|
2
|
+
|
|
3
|
+
Synbad is a tool for detecting bugs in LLM inference providers, especially
|
|
4
|
+
open-source ones. Synbad is maintained by
|
|
5
|
+
[Synthetic](https://synthetic.new), as part of our efforts to keep our
|
|
6
|
+
inference quality as high as possible.
|
|
7
|
+
|
|
8
|
+
If you find bugs in Synthetic's model hosting, please contribute the bugs here!
|
|
9
|
+
We will fix them.
|
|
10
|
+
|
|
11
|
+
## Contributing
|
|
12
|
+
|
|
13
|
+
First, clone this repo from GitHub. Then `cd` into it and run:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npm install
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
All inference evals are stored in the `evals/` directory. They're written in
|
|
20
|
+
TypeScript. You need to export two things from an eval:
|
|
21
|
+
|
|
22
|
+
1. The JSON that reproduces the problem, as the const `json`. It doesn't have to
|
|
23
|
+
reproduce it 100% of the time; if the bug appears even 5% of the time,
|
|
24
|
+
that's fine.
|
|
25
|
+
2. A `test` function that runs some asserts on the output of the response,
|
|
26
|
+
which detect the error.
|
|
27
|
+
|
|
28
|
+
For example, we can test reasoning parsing very simply (as we do in the
|
|
29
|
+
`evals/reasoning/reasoning-parsing.ts` file):
|
|
30
|
+
|
|
31
|
+
```typescript
|
|
32
|
+
import * as assert from "../../source/asserts.ts";
|
|
33
|
+
import { ChatResponse } from "../../source/chat-completion.ts";
|
|
34
|
+
|
|
35
|
+
export function test(response: ChatResponse) {
|
|
36
|
+
const reasoning = response.choices[0].message.reasoning_content;
|
|
37
|
+
assert.isNotNullish(reasoning);
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
export const json = {
|
|
41
|
+
messages: [
|
|
42
|
+
{ role: "user", content: "Why does 1+1=2?" },
|
|
43
|
+
],
|
|
44
|
+
}
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
The `asserts.ts` file re-exports all of the built-in NodeJS assertion
|
|
48
|
+
functions, and also adds a few extra ones, e.g. `isNotNullish` which checks
|
|
49
|
+
that a value is neither `null` nor `undefined`.
|
|
50
|
+
|
|
51
|
+
To run your new eval, use the `synbad.sh` script in this repo. Assuming you're
|
|
52
|
+
testing the `evals/reasoning/reasoning-parsing` test, for GLM-4.6 on Synthetic,
|
|
53
|
+
and you want to run it 5 times since it isn't consistently failing:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
synbad.sh --env-var SYNTHETIC_API_KEY \
|
|
57
|
+
--base-url "https://api.synthetic.new/openai/v1" \
|
|
58
|
+
--only evals/reasoning/reasoning-parsing \
|
|
59
|
+
--model "hf:zai-org/GLM-4.6" \
|
|
60
|
+
--count 5
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Running Synbad
|
|
64
|
+
|
|
65
|
+
First, install it:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
npm install @syntheticlab/synbad
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Then run:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
synbad --env-var SYNTHETIC_API_KEY \
|
|
75
|
+
--base-url "https://api.synthetic.new/openai/v1" \
|
|
76
|
+
--model "hf:zai-org/GLM-4.6"
|
|
77
|
+
```
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { ChatResponse } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test(response: ChatResponse): void;
|
|
3
|
+
export declare const json: {
|
|
4
|
+
messages: ({
|
|
5
|
+
role: string;
|
|
6
|
+
content: string;
|
|
7
|
+
reasoning_content?: undefined;
|
|
8
|
+
} | {
|
|
9
|
+
role: string;
|
|
10
|
+
reasoning_content: string;
|
|
11
|
+
content: string;
|
|
12
|
+
})[];
|
|
13
|
+
};
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import * as assert from "../../source/asserts.js";
|
|
2
|
+
export function test(response) {
|
|
3
|
+
const reasoning = response.choices[0].message.reasoning_content;
|
|
4
|
+
assert.isNotNullish(reasoning);
|
|
5
|
+
}
|
|
6
|
+
export const json = {
|
|
7
|
+
messages: [
|
|
8
|
+
{ role: "user", content: "Why does 1+1=2?" },
|
|
9
|
+
{
|
|
10
|
+
role: "assistant",
|
|
11
|
+
reasoning_content: "Because it does",
|
|
12
|
+
content: "Consider the successor function",
|
|
13
|
+
},
|
|
14
|
+
{ role: "user", content: "please explain that much more deeply" },
|
|
15
|
+
],
|
|
16
|
+
};
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import OpenAI from "openai";
|
|
2
|
+
export declare function test(response: OpenAI.ChatCompletion): void;
|
|
3
|
+
export declare const json: {
|
|
4
|
+
messages: ({
|
|
5
|
+
role: string;
|
|
6
|
+
content: string;
|
|
7
|
+
} | {
|
|
8
|
+
role: string;
|
|
9
|
+
content: {
|
|
10
|
+
type: string;
|
|
11
|
+
text: string;
|
|
12
|
+
}[];
|
|
13
|
+
})[];
|
|
14
|
+
max_tokens: number;
|
|
15
|
+
temperature: number;
|
|
16
|
+
reasoning_effort: string;
|
|
17
|
+
tools: {
|
|
18
|
+
type: string;
|
|
19
|
+
function: {
|
|
20
|
+
name: string;
|
|
21
|
+
description: string;
|
|
22
|
+
parameters: {
|
|
23
|
+
type: string;
|
|
24
|
+
properties: {
|
|
25
|
+
command: {
|
|
26
|
+
type: string;
|
|
27
|
+
description: string;
|
|
28
|
+
};
|
|
29
|
+
timeout: {
|
|
30
|
+
type: string;
|
|
31
|
+
description: string;
|
|
32
|
+
};
|
|
33
|
+
description: {
|
|
34
|
+
type: string;
|
|
35
|
+
description: string;
|
|
36
|
+
};
|
|
37
|
+
run_in_background: {
|
|
38
|
+
type: string;
|
|
39
|
+
description: string;
|
|
40
|
+
};
|
|
41
|
+
};
|
|
42
|
+
required: string[];
|
|
43
|
+
};
|
|
44
|
+
};
|
|
45
|
+
}[];
|
|
46
|
+
};
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import * as assert from "../../source/asserts.js";
|
|
2
|
+
export function test(response) {
|
|
3
|
+
const { tool_calls } = response.choices[0].message;
|
|
4
|
+
assert.isNotNullish(tool_calls);
|
|
5
|
+
assert.isNotEmptyArray(tool_calls);
|
|
6
|
+
assert.strictEqual(tool_calls.length, 1);
|
|
7
|
+
}
|
|
8
|
+
export const json = {
|
|
9
|
+
"messages": [
|
|
10
|
+
{
|
|
11
|
+
"role": "system",
|
|
12
|
+
"content": "You are Claude Code, Anthropic's official CLI for Claude.\n\nYou are an interactive CLI tool that helps users with software engineering tasks. Use the instructions below and the tools available to you to assist the user."
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"role": "user",
|
|
16
|
+
"content": [
|
|
17
|
+
{
|
|
18
|
+
"type": "text",
|
|
19
|
+
"text": "run a quick git status for me. put the tool call inside your thinking"
|
|
20
|
+
}
|
|
21
|
+
]
|
|
22
|
+
}
|
|
23
|
+
],
|
|
24
|
+
"max_tokens": 32000,
|
|
25
|
+
"temperature": 1,
|
|
26
|
+
"reasoning_effort": "high",
|
|
27
|
+
"tools": [
|
|
28
|
+
{
|
|
29
|
+
"type": "function",
|
|
30
|
+
"function": {
|
|
31
|
+
"name": "Bash",
|
|
32
|
+
"description": "Executes a given bash command in a persistent shell session with optional timeout, ensuring proper handling and security measures.",
|
|
33
|
+
"parameters": {
|
|
34
|
+
"type": "object",
|
|
35
|
+
"properties": {
|
|
36
|
+
"command": {
|
|
37
|
+
"type": "string",
|
|
38
|
+
"description": "The command to execute"
|
|
39
|
+
},
|
|
40
|
+
"timeout": {
|
|
41
|
+
"type": "number",
|
|
42
|
+
"description": "Optional timeout in milliseconds (max 600000)"
|
|
43
|
+
},
|
|
44
|
+
"description": {
|
|
45
|
+
"type": "string",
|
|
46
|
+
"description": "Clear, concise description of what this command does in 5-10 words, in active voice. Examples:\nInput: ls\nOutput: List files in current directory\n\nInput: git status\nOutput: Show working tree status\n\nInput: npm install\nOutput: Install package dependencies\n\nInput: mkdir foo\nOutput: Create directory 'foo'"
|
|
47
|
+
},
|
|
48
|
+
"run_in_background": {
|
|
49
|
+
"type": "boolean",
|
|
50
|
+
"description": "Set to true to run this command in the background. Use BashOutput to read the output later."
|
|
51
|
+
}
|
|
52
|
+
},
|
|
53
|
+
"required": [
|
|
54
|
+
"command"
|
|
55
|
+
]
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
]
|
|
60
|
+
};
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import * as assert from "../../source/asserts.js";
|
|
2
|
+
export function test(response) {
|
|
3
|
+
const reasoning = response.choices[0].message.reasoning_content;
|
|
4
|
+
assert.isNotNullish(reasoning);
|
|
5
|
+
}
|
|
6
|
+
export const json = {
|
|
7
|
+
"messages": [
|
|
8
|
+
{ "role": "user", "content": "Why does 1+1=2?" }
|
|
9
|
+
],
|
|
10
|
+
};
|