@syntheticlab/synbad 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -2
- package/dist/evals/reasoning/multiturn-reasoning-parsing.js +2 -1
- package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +2 -2
- package/dist/evals/reasoning/reasoning-parsing.js +2 -1
- package/dist/evals/tools/crush-list-files.js +2 -2
- package/dist/source/asserts.d.ts +1 -0
- package/dist/source/asserts.js +8 -0
- package/dist/source/chat-completion.d.ts +5 -0
- package/dist/source/chat-completion.js +3 -0
- package/dist/source/index.js +8 -1
- package/evals/reasoning/multiturn-reasoning-parsing.ts +2 -2
- package/evals/reasoning/reasoning-claude-tool-call.ts +2 -2
- package/evals/reasoning/reasoning-parsing.ts +2 -2
- package/evals/tools/crush-list-files.ts +5 -2
- package/package.json +5 -5
- package/source/asserts.ts +8 -0
- package/source/chat-completion.ts +5 -0
- package/source/index.ts +7 -1
package/README.md
CHANGED
|
@@ -8,6 +8,33 @@ inference quality as high as possible.
|
|
|
8
8
|
If you find bugs in Synthetic's model hosting, please contribute the bugs here!
|
|
9
9
|
We will fix them.
|
|
10
10
|
|
|
11
|
+
## Results
|
|
12
|
+
|
|
13
|
+
We keep a running tally of provider+model results for GLM-4.6, Kimi K2
|
|
14
|
+
Thinking, and MiniMax M2. Feel free to add more provider results!
|
|
15
|
+
|
|
16
|
+
|Provider |Model |Success Rate|
|
|
17
|
+
|---------|----------------|------------|
|
|
18
|
+
|Synthetic|GLM-4.6 |:white_check_mark: 100%|
|
|
19
|
+
|Synthetic|Kimi K2 Thinking|:white_check_mark: 100%|
|
|
20
|
+
|Synthetic|MiniMax M2 |:white_check_mark: 100%|
|
|
21
|
+
|
|
22
|
+
|Provider |Model |Success Rate|
|
|
23
|
+
|---------|----------------|------------|
|
|
24
|
+
|Fireworks|GLM-4.6 |:white_check_mark: 100%|
|
|
25
|
+
|Fireworks|Kimi K2 Thinking|:x: 86%|
|
|
26
|
+
|Fireworks|MiniMax M2 |:x: 29%|
|
|
27
|
+
|
|
28
|
+
|Provider |Model |Success Rate|
|
|
29
|
+
|---------|----------------|------------|
|
|
30
|
+
|Together |GLM-4.6 |:white_check_mark: 100%|
|
|
31
|
+
|Together |Kimi K2 Thinking|:x: 71%|
|
|
32
|
+
|
|
33
|
+
|Provider |Model |Success Rate|
|
|
34
|
+
|---------|----------------|------------|
|
|
35
|
+
|Parasail |GLM-4.6 |:x: 71%|
|
|
36
|
+
|Parasail |Kimi K2 Thinking|:x: 57%|
|
|
37
|
+
|
|
11
38
|
## Contributing
|
|
12
39
|
|
|
13
40
|
First, clone this repo from Github. Then `cd` into it and run:
|
|
@@ -53,7 +80,7 @@ testing the `evals/reasoning/reasoning-parsing` test, for GLM-4.6 on Synthetic,
|
|
|
53
80
|
and you want to run it 5 times since it isn't consistently failing:
|
|
54
81
|
|
|
55
82
|
```bash
|
|
56
|
-
synbad.sh --env-var SYNTHETIC_API_KEY \
|
|
83
|
+
./synbad.sh --env-var SYNTHETIC_API_KEY \
|
|
57
84
|
--base-url "https://api.synthetic.new/openai/v1" \
|
|
58
85
|
--only evals/reasoning/reasoning-parsing \
|
|
59
86
|
--model "hf:zai-org/GLM-4.6" \
|
|
@@ -65,7 +92,7 @@ synbad.sh --env-var SYNTHETIC_API_KEY \
|
|
|
65
92
|
First, install it:
|
|
66
93
|
|
|
67
94
|
```bash
|
|
68
|
-
npm install @syntheticlab/synbad
|
|
95
|
+
npm install -g @syntheticlab/synbad
|
|
69
96
|
```
|
|
70
97
|
|
|
71
98
|
Then run:
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import * as assert from "../../source/asserts.js";
|
|
2
|
+
import { getReasoning } from "../../source/chat-completion.js";
|
|
2
3
|
export function test(response) {
|
|
3
|
-
const reasoning = response.choices[0].message.reasoning_content;
|
|
4
|
+
const reasoning = getReasoning(response.choices[0].message);
|
|
4
5
|
assert.isNotNullish(reasoning);
|
|
5
6
|
}
|
|
6
7
|
export const json = {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import OpenAI from "openai";
|
|
2
|
-
export declare function test(response: OpenAI.ChatCompletion): void;
|
|
1
|
+
import { ChatResponse } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test(response: ChatResponse): void;
|
|
3
3
|
export declare const json: {
|
|
4
4
|
messages: ({
|
|
5
5
|
role: string;
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import * as assert from "../../source/asserts.js";
|
|
2
|
+
import { getReasoning } from "../../source/chat-completion.js";
|
|
2
3
|
export function test(response) {
|
|
3
|
-
const reasoning = response.choices[0].message.reasoning_content;
|
|
4
|
+
const reasoning = getReasoning(response.choices[0].message);
|
|
4
5
|
assert.isNotNullish(reasoning);
|
|
5
6
|
}
|
|
6
7
|
export const json = {
|
|
@@ -9,11 +9,11 @@ export function test(response) {
|
|
|
9
9
|
assert.or(() => {
|
|
10
10
|
assert.strictEqual(fn.name, "ls");
|
|
11
11
|
const args = JSON.parse(fn.arguments);
|
|
12
|
-
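assert.or(() => assert.deepStrictEqual(args, {}), () => assert.strictEqual(args.path, "/home/reissbaker/Hack/scratch-scripts"), () => assert.strictEqual(args.path, "."));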
|
|
12
|
+
assert.or(() => assert.strictEqual(args.path, "/home/reissbaker/Hack/scratch-scripts"), () => assert.strictEqual(args.path, "."), () => assert.isNullish(args.path));
|
|
13
13
|
}, () => {
|
|
14
14
|
assert.strictEqual(fn.name, "bash");
|
|
15
15
|
const args = JSON.parse(fn.arguments);
|
|
16
|
-
assert.
|
|
16
|
+
assert.or(() => assert.startsWith(args.command, "ls"), () => assert.startsWith(args.command, "find"));
|
|
17
17
|
});
|
|
18
18
|
}
|
|
19
19
|
export const json = {
|
package/dist/source/asserts.d.ts
CHANGED
|
@@ -4,3 +4,4 @@ export declare function isNullish(a: unknown): asserts a is null | undefined;
|
|
|
4
4
|
export declare function isNotNullish<T extends any>(a: T): asserts a is Exclude<T, null | undefined>;
|
|
5
5
|
export declare function isEmptyArray(a: any[]): boolean;
|
|
6
6
|
export declare function isNotEmptyArray(a: any[]): boolean;
|
|
7
|
+
export declare function startsWith(a: string, prefix: string): boolean;
|
package/dist/source/asserts.js
CHANGED
|
@@ -52,3 +52,11 @@ export function isNotEmptyArray(a) {
|
|
|
52
52
|
actual: a,
|
|
53
53
|
});
|
|
54
54
|
}
|
|
55
|
+
export function startsWith(a, prefix) {
|
|
56
|
+
if (a.startsWith(prefix))
|
|
57
|
+
return true;
|
|
58
|
+
throw new assert.AssertionError({
|
|
59
|
+
message: "Expected to start with: " + prefix,
|
|
60
|
+
actual: a,
|
|
61
|
+
});
|
|
62
|
+
}
|
|
@@ -1,9 +1,14 @@
|
|
|
1
1
|
import { t } from "structural";
|
|
2
2
|
import OpenAI from "openai";
|
|
3
|
+
export declare function getReasoning(msg: {
|
|
4
|
+
reasoning_content?: string;
|
|
5
|
+
reasoning?: string;
|
|
6
|
+
}): string | undefined;
|
|
3
7
|
export type ChatResponse = OpenAI.ChatCompletion & {
|
|
4
8
|
choices: Array<{
|
|
5
9
|
message: {
|
|
6
10
|
reasoning_content?: string;
|
|
11
|
+
reasoning?: string;
|
|
7
12
|
};
|
|
8
13
|
}>;
|
|
9
14
|
};
|
package/dist/source/index.js
CHANGED
|
@@ -50,7 +50,14 @@ cli.command("eval")
|
|
|
50
50
|
model,
|
|
51
51
|
...json,
|
|
52
52
|
});
|
|
53
|
-
|
|
53
|
+
try {
|
|
54
|
+
test.test(response);
|
|
55
|
+
}
|
|
56
|
+
catch (e) {
|
|
57
|
+
console.error("Response:");
|
|
58
|
+
console.error(JSON.stringify(response.choices[0], null, 2));
|
|
59
|
+
throw e;
|
|
60
|
+
}
|
|
54
61
|
}
|
|
55
62
|
process.stdout.write(" ✅ passed\n");
|
|
56
63
|
}
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import * as assert from "../../source/asserts.ts";
|
|
2
|
-
import { ChatResponse } from "../../source/chat-completion.ts";
|
|
2
|
+
import { ChatResponse, getReasoning } from "../../source/chat-completion.ts";
|
|
3
3
|
|
|
4
4
|
export function test(response: ChatResponse) {
|
|
5
|
-
const reasoning = response.choices[0].message.reasoning_content;
|
|
5
|
+
const reasoning = getReasoning(response.choices[0].message);
|
|
6
6
|
assert.isNotNullish(reasoning);
|
|
7
7
|
}
|
|
8
8
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import OpenAI from "openai";
|
|
1
|
+
import { ChatResponse } from "../../source/chat-completion.ts";
|
|
2
2
|
import * as assert from "../../source/asserts.ts";
|
|
3
3
|
|
|
4
|
-
export function test(response: OpenAI.ChatCompletion) {
|
|
4
|
+
export function test(response: ChatResponse) {
|
|
5
5
|
const { tool_calls } = response.choices[0].message;
|
|
6
6
|
assert.isNotNullish(tool_calls);
|
|
7
7
|
assert.isNotEmptyArray(tool_calls);
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import * as assert from "../../source/asserts.ts";
|
|
2
|
-
import { ChatResponse } from "../../source/chat-completion.ts";
|
|
2
|
+
import { ChatResponse, getReasoning } from "../../source/chat-completion.ts";
|
|
3
3
|
|
|
4
4
|
export function test(response: ChatResponse) {
|
|
5
|
-
const reasoning = response.choices[0].message.reasoning_content;
|
|
5
|
+
const reasoning = getReasoning(response.choices[0].message);
|
|
6
6
|
assert.isNotNullish(reasoning);
|
|
7
7
|
}
|
|
8
8
|
|
|
@@ -13,15 +13,18 @@ export function test(response: OpenAI.ChatCompletion) {
|
|
|
13
13
|
assert.strictEqual(fn.name, "ls");
|
|
14
14
|
const args = JSON.parse(fn.arguments);
|
|
15
15
|
assert.or(
|
|
16
|
-
() => assert.deepStrictEqual(args, {}),
|
|
17
16
|
() => assert.strictEqual(args.path, "/home/reissbaker/Hack/scratch-scripts"),
|
|
18
17
|
() => assert.strictEqual(args.path, "."),
|
|
18
|
+
() => assert.isNullish(args.path),
|
|
19
19
|
);
|
|
20
20
|
},
|
|
21
21
|
() => {
|
|
22
22
|
assert.strictEqual(fn.name, "bash");
|
|
23
23
|
const args = JSON.parse(fn.arguments);
|
|
24
|
-
assert.
|
|
24
|
+
assert.or(
|
|
25
|
+
() => assert.startsWith(args.command, "ls"),
|
|
26
|
+
() => assert.startsWith(args.command, "find"),
|
|
27
|
+
);
|
|
25
28
|
},
|
|
26
29
|
);
|
|
27
30
|
}
|
package/package.json
CHANGED
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@syntheticlab/synbad",
|
|
3
|
-
"version": "0.0.2",
|
|
3
|
+
"version": "0.0.4",
|
|
4
4
|
"description": "LLM inference provider evals",
|
|
5
5
|
"main": "dist/source/index.js",
|
|
6
6
|
"bin": {
|
|
7
|
-
|
|
7
|
+
"synbad": "dist/source/index.js"
|
|
8
8
|
},
|
|
9
|
-
|
|
9
|
+
"preferGlobal": true,
|
|
10
10
|
"type": "module",
|
|
11
11
|
"scripts": {
|
|
12
12
|
"test": "echo \"Error: no test specified\" && exit 1",
|
|
13
|
-
|
|
14
|
-
|
|
13
|
+
"build": "tsc",
|
|
14
|
+
"prepublishOnly": "tsc"
|
|
15
15
|
},
|
|
16
16
|
"author": "@reissbaker",
|
|
17
17
|
"license": "MIT",
|
package/source/asserts.ts
CHANGED
|
@@ -68,3 +68,11 @@ export function isNotEmptyArray(a: any[]) {
|
|
|
68
68
|
actual: a,
|
|
69
69
|
});
|
|
70
70
|
}
|
|
71
|
+
|
|
72
|
+
export function startsWith(a: string, prefix: string) {
|
|
73
|
+
if(a.startsWith(prefix)) return true;
|
|
74
|
+
throw new assert.AssertionError({
|
|
75
|
+
message: "Expected to start with: " + prefix,
|
|
76
|
+
actual: a,
|
|
77
|
+
});
|
|
78
|
+
}
|
|
@@ -1,10 +1,15 @@
|
|
|
1
1
|
import { t } from "structural";
|
|
2
2
|
import OpenAI from "openai";
|
|
3
3
|
|
|
4
|
+
export function getReasoning(msg: { reasoning_content?: string, reasoning?: string }) {
|
|
5
|
+
return msg.reasoning_content || msg.reasoning;
|
|
6
|
+
}
|
|
7
|
+
|
|
4
8
|
export type ChatResponse = OpenAI.ChatCompletion & {
|
|
5
9
|
choices: Array<{
|
|
6
10
|
message: {
|
|
7
11
|
reasoning_content?: string,
|
|
12
|
+
reasoning?: string,
|
|
8
13
|
},
|
|
9
14
|
}>
|
|
10
15
|
};
|
package/source/index.ts
CHANGED
|
@@ -56,7 +56,13 @@ cli.command("eval")
|
|
|
56
56
|
model,
|
|
57
57
|
...json,
|
|
58
58
|
});
|
|
59
|
-
|
|
59
|
+
try {
|
|
60
|
+
test.test(response);
|
|
61
|
+
} catch(e) {
|
|
62
|
+
console.error("Response:");
|
|
63
|
+
console.error(JSON.stringify(response.choices[0], null, 2));
|
|
64
|
+
throw e;
|
|
65
|
+
}
|
|
60
66
|
}
|
|
61
67
|
process.stdout.write(" ✅ passed\n");
|
|
62
68
|
} catch(e) {
|