@syntheticlab/synbad 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/README.md +77 -0
  2. package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +13 -0
  3. package/dist/evals/reasoning/multiturn-reasoning-parsing.js +16 -0
  4. package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +46 -0
  5. package/dist/evals/reasoning/reasoning-claude-tool-call.js +60 -0
  6. package/dist/evals/reasoning/reasoning-parsing.d.ts +8 -0
  7. package/dist/evals/reasoning/reasoning-parsing.js +10 -0
  8. package/dist/evals/tools/claude-dash.d.ts +1249 -0
  9. package/dist/evals/tools/claude-dash.js +637 -0
  10. package/dist/evals/tools/crush-list-files.d.ts +676 -0
  11. package/dist/evals/tools/crush-list-files.js +441 -0
  12. package/dist/evals/tools/parallel-tool.d.ts +27 -0
  13. package/dist/evals/tools/parallel-tool.js +33 -0
  14. package/dist/evals/tools/simple-tool.d.ts +27 -0
  15. package/dist/evals/tools/simple-tool.js +37 -0
  16. package/dist/source/asserts.d.ts +6 -0
  17. package/dist/source/asserts.js +54 -0
  18. package/dist/source/chat-completion.d.ts +117 -0
  19. package/dist/source/chat-completion.js +113 -0
  20. package/dist/source/index.d.ts +2 -0
  21. package/dist/source/index.js +107 -0
  22. package/evals/reasoning/multiturn-reasoning-parsing.ts +19 -0
  23. package/evals/reasoning/reasoning-claude-tool-call.ts +63 -0
  24. package/evals/reasoning/reasoning-parsing.ts +13 -0
  25. package/evals/tools/claude-dash.ts +640 -0
  26. package/evals/tools/crush-list-files.ts +451 -0
  27. package/evals/tools/parallel-tool.ts +36 -0
  28. package/evals/tools/simple-tool.ts +40 -0
  29. package/package.json +37 -0
  30. package/source/asserts.ts +70 -0
  31. package/source/chat-completion.ts +140 -0
  32. package/source/index.ts +115 -0
  33. package/tsconfig.json +32 -0
package/README.md ADDED
@@ -0,0 +1,77 @@
1
+ ![Synbad the legendary sailor](https://raw.githubusercontent.com/synthetic-lab/synbad/main/synbad.png)
2
+
3
+ Synbad is a tool for detecting bugs in LLM inference providers, especially
4
+ open-source ones. Synbad is maintained by
5
+ [Synthetic](https://synthetic.new), as part of our efforts to keep our
6
+ inference quality as high as possible.
7
+
8
+ If you find bugs in Synthetic's model hosting, please contribute the bugs here!
9
+ We will fix them.
10
+
11
+ ## Contributing
12
+
13
+ First, clone this repo from GitHub. Then `cd` into it and run:
14
+
15
+ ```bash
16
+ npm install
17
+ ```
18
+
19
+ All inference evals are stored in the `evals/` directory. They're written in
20
+ TypeScript. You need to export two things from an eval:
21
+
22
+ 1. The JSON that reproduces the problem, as the const `json`. It doesn't have to
23
+ reproduce it 100% of the time; if the bug appears even 5% of the time,
24
+ that's fine.
25
+ 2. A `test` function that runs some asserts on the output of the response,
26
+ which detect the error.
27
+
28
+ For example, we can test reasoning parsing very simply (as we do in the
29
+ `evals/reasoning/reasoning-parsing.ts` file):
30
+
31
+ ```typescript
32
+ import * as assert from "../../source/asserts.ts";
33
+ import { ChatResponse } from "../../source/chat-completion.ts";
34
+
35
+ export function test(response: ChatResponse) {
36
+ const reasoning = response.choices[0].message.reasoning_content;
37
+ assert.isNotNullish(reasoning);
38
+ }
39
+
40
+ export const json = {
41
+ messages: [
42
+ { role: "user", content: "Why does 1+1=2?" },
43
+ ],
44
+ }
45
+ ```
46
+
47
+ The `asserts.ts` file re-exports all of the built-in Node.js assertion
48
+ functions, and also adds a few extra ones, e.g. `isNotNullish` which checks
49
+ that a value is neither `null` nor `undefined`.
50
+
51
+ To run your new eval, use the `synbad.sh` script in this repo. Assuming you're
52
+ testing the `evals/reasoning/reasoning-parsing` test, for GLM-4.6 on Synthetic,
53
+ and you want to run it 5 times since it isn't consistently failing:
54
+
55
+ ```bash
56
+ synbad.sh --env-var SYNTHETIC_API_KEY \
57
+ --base-url "https://api.synthetic.new/openai/v1" \
58
+ --only evals/reasoning/reasoning-parsing \
59
+ --model "hf:zai-org/GLM-4.6" \
60
+ --count 5
61
+ ```
62
+
63
+ ## Running Synbad
64
+
65
+ First, install it:
66
+
67
+ ```bash
68
+ npm install @syntheticlab/synbad
69
+ ```
70
+
71
+ Then run:
72
+
73
+ ```bash
74
+ synbad --env-var SYNTHETIC_API_KEY \
75
+ --base-url "https://api.synthetic.new/openai/v1" \
76
+ --model "hf:zai-org/GLM-4.6"
77
+ ```
@@ -0,0 +1,13 @@
1
+ import { ChatResponse } from "../../source/chat-completion.ts";
2
+ export declare function test(response: ChatResponse): void;
3
+ export declare const json: {
4
+ messages: ({
5
+ role: string;
6
+ content: string;
7
+ reasoning_content?: undefined;
8
+ } | {
9
+ role: string;
10
+ reasoning_content: string;
11
+ content: string;
12
+ })[];
13
+ };
@@ -0,0 +1,16 @@
1
+ import * as assert from "../../source/asserts.js";
2
+ export function test(response) {
3
+ const reasoning = response.choices[0].message.reasoning_content;
4
+ assert.isNotNullish(reasoning);
5
+ }
6
+ export const json = {
7
+ messages: [
8
+ { role: "user", content: "Why does 1+1=2?" },
9
+ {
10
+ role: "assistant",
11
+ reasoning_content: "Because it does",
12
+ content: "Consider the successor function",
13
+ },
14
+ { role: "user", content: "please explain that much more deeply" },
15
+ ],
16
+ };
@@ -0,0 +1,46 @@
1
+ import OpenAI from "openai";
2
+ export declare function test(response: OpenAI.ChatCompletion): void;
3
+ export declare const json: {
4
+ messages: ({
5
+ role: string;
6
+ content: string;
7
+ } | {
8
+ role: string;
9
+ content: {
10
+ type: string;
11
+ text: string;
12
+ }[];
13
+ })[];
14
+ max_tokens: number;
15
+ temperature: number;
16
+ reasoning_effort: string;
17
+ tools: {
18
+ type: string;
19
+ function: {
20
+ name: string;
21
+ description: string;
22
+ parameters: {
23
+ type: string;
24
+ properties: {
25
+ command: {
26
+ type: string;
27
+ description: string;
28
+ };
29
+ timeout: {
30
+ type: string;
31
+ description: string;
32
+ };
33
+ description: {
34
+ type: string;
35
+ description: string;
36
+ };
37
+ run_in_background: {
38
+ type: string;
39
+ description: string;
40
+ };
41
+ };
42
+ required: string[];
43
+ };
44
+ };
45
+ }[];
46
+ };
@@ -0,0 +1,60 @@
1
+ import * as assert from "../../source/asserts.js";
2
+ export function test(response) {
3
+ const { tool_calls } = response.choices[0].message;
4
+ assert.isNotNullish(tool_calls);
5
+ assert.isNotEmptyArray(tool_calls);
6
+ assert.strictEqual(tool_calls.length, 1);
7
+ }
8
+ export const json = {
9
+ "messages": [
10
+ {
11
+ "role": "system",
12
+ "content": "You are Claude Code, Anthropic's official CLI for Claude.\n\nYou are an interactive CLI tool that helps users with software engineering tasks. Use the instructions below and the tools available to you to assist the user."
13
+ },
14
+ {
15
+ "role": "user",
16
+ "content": [
17
+ {
18
+ "type": "text",
19
+ "text": "run a quick git status for me. put the tool call inside your thinking"
20
+ }
21
+ ]
22
+ }
23
+ ],
24
+ "max_tokens": 32000,
25
+ "temperature": 1,
26
+ "reasoning_effort": "high",
27
+ "tools": [
28
+ {
29
+ "type": "function",
30
+ "function": {
31
+ "name": "Bash",
32
+ "description": "Executes a given bash command in a persistent shell session with optional timeout, ensuring proper handling and security measures.",
33
+ "parameters": {
34
+ "type": "object",
35
+ "properties": {
36
+ "command": {
37
+ "type": "string",
38
+ "description": "The command to execute"
39
+ },
40
+ "timeout": {
41
+ "type": "number",
42
+ "description": "Optional timeout in milliseconds (max 600000)"
43
+ },
44
+ "description": {
45
+ "type": "string",
46
+ "description": "Clear, concise description of what this command does in 5-10 words, in active voice. Examples:\nInput: ls\nOutput: List files in current directory\n\nInput: git status\nOutput: Show working tree status\n\nInput: npm install\nOutput: Install package dependencies\n\nInput: mkdir foo\nOutput: Create directory 'foo'"
47
+ },
48
+ "run_in_background": {
49
+ "type": "boolean",
50
+ "description": "Set to true to run this command in the background. Use BashOutput to read the output later."
51
+ }
52
+ },
53
+ "required": [
54
+ "command"
55
+ ]
56
+ }
57
+ }
58
+ }
59
+ ]
60
+ };
@@ -0,0 +1,8 @@
1
+ import { ChatResponse } from "../../source/chat-completion.ts";
2
+ export declare function test(response: ChatResponse): void;
3
+ export declare const json: {
4
+ messages: {
5
+ role: string;
6
+ content: string;
7
+ }[];
8
+ };
@@ -0,0 +1,10 @@
1
+ import * as assert from "../../source/asserts.js";
2
+ export function test(response) {
3
+ const reasoning = response.choices[0].message.reasoning_content;
4
+ assert.isNotNullish(reasoning);
5
+ }
6
+ export const json = {
7
+ "messages": [
8
+ { "role": "user", "content": "Why does 1+1=2?" }
9
+ ],
10
+ };