@syntheticlab/synbad 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +60 -23
- package/dist/evals/reasoning/custom-harness.d.ts +679 -0
- package/dist/evals/reasoning/custom-harness.js +847 -0
- package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +2 -2
- package/dist/evals/reasoning/multiturn-reasoning-parsing.js +2 -2
- package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +2 -2
- package/dist/evals/reasoning/reasoning-claude-tool-call.js +1 -2
- package/dist/evals/reasoning/reasoning-parsing.d.ts +2 -2
- package/dist/evals/reasoning/reasoning-parsing.js +2 -2
- package/dist/evals/reasoning/response-in-reasoning.d.ts +45 -0
- package/dist/evals/reasoning/response-in-reasoning.js +59 -0
- package/dist/evals/tools/claude-dash.d.ts +2 -2
- package/dist/evals/tools/claude-dash.js +1 -2
- package/dist/evals/tools/crush-list-files.d.ts +2 -5
- package/dist/evals/tools/crush-list-files.js +6 -8
- package/dist/evals/tools/multi-turn-tools.d.ts +46 -0
- package/dist/evals/tools/multi-turn-tools.js +100 -0
- package/dist/evals/tools/no-fn-args.d.ts +22 -0
- package/dist/evals/tools/no-fn-args.js +31 -0
- package/dist/evals/tools/octo-list-no-optional-args.d.ts +209 -0
- package/dist/evals/tools/octo-list-no-optional-args.js +73 -0
- package/dist/evals/tools/parallel-tool.d.ts +2 -2
- package/dist/evals/tools/parallel-tool.js +1 -2
- package/dist/evals/tools/simple-tool.d.ts +2 -2
- package/dist/evals/tools/simple-tool.js +3 -2
- package/dist/evals/tools/tool-dash-underscore.d.ts +26 -0
- package/dist/evals/tools/tool-dash-underscore.js +37 -0
- package/dist/evals/tools/tool-path-corruption.d.ts +26 -0
- package/dist/evals/tools/tool-path-corruption.js +41 -0
- package/dist/source/asserts.d.ts +4 -1
- package/dist/source/asserts.js +36 -0
- package/dist/source/chat-completion.d.ts +5 -0
- package/dist/source/chat-completion.js +1 -0
- package/dist/source/evals.d.ts +9 -0
- package/dist/source/evals.js +53 -0
- package/dist/source/evals.test.d.ts +1 -0
- package/dist/source/evals.test.js +12 -0
- package/dist/source/exports.d.ts +2 -0
- package/dist/source/exports.js +1 -0
- package/dist/source/index.js +103 -43
- package/evals/reasoning/multiturn-reasoning-parsing.ts +3 -3
- package/evals/reasoning/reasoning-claude-tool-call.ts +2 -3
- package/evals/reasoning/reasoning-parsing.ts +3 -3
- package/evals/reasoning/response-in-reasoning.ts +65 -0
- package/evals/tools/claude-dash.ts +2 -3
- package/evals/tools/crush-list-files.ts +11 -13
- package/evals/tools/multi-turn-tools.ts +104 -0
- package/evals/tools/no-fn-args.ts +34 -0
- package/evals/tools/octo-list-no-optional-args.ts +81 -0
- package/evals/tools/parallel-tool.ts +2 -3
- package/evals/tools/simple-tool.ts +4 -3
- package/evals/tools/tool-dash-underscore.ts +40 -0
- package/evals/tools/tool-path-corruption.ts +46 -0
- package/package.json +10 -3
- package/source/asserts.ts +37 -1
- package/source/chat-completion.ts +6 -0
- package/source/evals.test.ts +13 -0
- package/source/evals.ts +56 -0
- package/source/exports.ts +2 -0
- package/source/index.ts +121 -44
package/README.md
CHANGED
|
@@ -8,32 +8,48 @@ inference quality as high as possible.
|
|
|
8
8
|
If you find bugs in Synthetic's model hosting, please contribute the bugs here!
|
|
9
9
|
We will fix them.
|
|
10
10
|
|
|
11
|
+
## Install
|
|
12
|
+
|
|
13
|
+
Synbad is distributed through npm. Install it with:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npm install -g @syntheticlab/synbad
|
|
17
|
+
```
|
|
18
|
+
|
|
11
19
|
## Results
|
|
12
20
|
|
|
13
|
-
We keep a running tally of provider+model results for
|
|
14
|
-
Thinking, and MiniMax M2. Feel free to
|
|
21
|
+
We keep a running tally of provider+model results for tool calling and
|
|
22
|
+
reasoning parsing for GLM-4.7, Kimi K2 Thinking, and MiniMax M2. Feel free to
|
|
23
|
+
add more provider results!
|
|
15
24
|
|
|
16
25
|
|Provider |Model |Success Rate|
|
|
17
26
|
|---------|----------------|------------|
|
|
18
|
-
|Synthetic|GLM-4.
|
|
19
|
-
|Synthetic|Kimi K2 Thinking|:white_check_mark: 100%|
|
|
20
|
-
|Synthetic|MiniMax M2 |:white_check_mark: 100%|
|
|
27
|
+
|Synthetic.new|GLM-4.7 |:white_check_mark: 100%|
|
|
28
|
+
|Synthetic.new|Kimi K2 Thinking|:white_check_mark: 100%|
|
|
29
|
+
|Synthetic.new|MiniMax M2 |:white_check_mark: 100%|
|
|
21
30
|
|
|
22
31
|
|Provider |Model |Success Rate|
|
|
23
32
|
|---------|----------------|------------|
|
|
24
|
-
|Fireworks|GLM-4.
|
|
25
|
-
|Fireworks|Kimi K2 Thinking|:x:
|
|
26
|
-
|Fireworks|MiniMax M2 |:
|
|
33
|
+
|Fireworks|GLM-4.7 |:x: 83%|
|
|
34
|
+
|Fireworks|Kimi K2 Thinking|:x: 92%|
|
|
35
|
+
|Fireworks|MiniMax M2 |:white_check_mark: 100%|
|
|
27
36
|
|
|
28
37
|
|Provider |Model |Success Rate|
|
|
29
38
|
|---------|----------------|------------|
|
|
30
|
-
|Together |
|
|
31
|
-
|Together |Kimi K2 Thinking|:x: 71%|
|
|
39
|
+
|Together |Kimi K2 Thinking|:x: 66%|
|
|
32
40
|
|
|
33
41
|
|Provider |Model |Success Rate|
|
|
34
42
|
|---------|----------------|------------|
|
|
35
|
-
|Parasail |GLM-4.
|
|
36
|
-
|Parasail |Kimi K2 Thinking|:x:
|
|
43
|
+
|Parasail |GLM-4.7 |:x: 83%|
|
|
44
|
+
|Parasail |Kimi K2 Thinking|:x: 75%|
|
|
45
|
+
|
|
46
|
+
Note for attempting reproductions: generally all tests are reproducible with
|
|
47
|
+
`--count 1` and `--count 1 --stream`, but for evaluating the
|
|
48
|
+
response-in-reasoning eval, you will usually need a high count to reproduce
|
|
49
|
+
the bug: `--count 40` and `--count 40 --stream` are typically sufficient.
|
|
50
|
+
|
|
51
|
+
All evals must pass both with and without Synbad's `--stream` parameter (which
|
|
52
|
+
tests streaming APIs) to be considered a pass.
|
|
37
53
|
|
|
38
54
|
## How do I contribute inference bugs?
|
|
39
55
|
|
|
@@ -71,26 +87,47 @@ TypeScript. You need to export two things from an eval:
|
|
|
71
87
|
1. The JSON that reproduces the problem, as the const `json`. It doesn't have to
|
|
72
88
|
reproduce it 100% of the time; if the bug appears even 5% of the time,
|
|
73
89
|
that's fine.
|
|
74
|
-
2. A `test` function that runs some asserts on the
|
|
90
|
+
2. A `test` function that runs some asserts on the returned assistant message,
|
|
75
91
|
which detect the error.
|
|
76
92
|
|
|
77
|
-
For example, we can test
|
|
78
|
-
`evals/
|
|
93
|
+
For example, we can test parallel tool call support very simply (as we do in the
|
|
94
|
+
`evals/tools/parallel-tool.ts` file):
|
|
79
95
|
|
|
80
96
|
```typescript
|
|
81
97
|
import * as assert from "../../source/asserts.ts";
|
|
82
|
-
import {
|
|
98
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
83
99
|
|
|
84
|
-
export function test(
|
|
85
|
-
|
|
86
|
-
assert.
|
|
100
|
+
export function test({ tool_calls }: ChatMessage) {
|
|
101
|
+
assert.isNotNullish(tool_calls);
|
|
102
|
+
assert.isNotEmptyArray(tool_calls);
|
|
103
|
+
assert.strictEqual(tool_calls.length, 2);
|
|
87
104
|
}
|
|
88
105
|
|
|
89
|
-
// Insert your JSON. You can paste your results from the Synbad proxy here.
|
|
90
106
|
export const json = {
|
|
91
|
-
messages: [
|
|
92
|
-
{
|
|
107
|
+
"messages": [
|
|
108
|
+
{"role": "user", "content": "What's the weather in Paris and London?"}
|
|
109
|
+
],
|
|
110
|
+
"tools": [
|
|
111
|
+
{
|
|
112
|
+
"type": "function",
|
|
113
|
+
"function": {
|
|
114
|
+
"name": "get_weather",
|
|
115
|
+
"description": "Get current weather for a location",
|
|
116
|
+
"parameters": {
|
|
117
|
+
"type": "object",
|
|
118
|
+
"properties": {
|
|
119
|
+
"location": {
|
|
120
|
+
"type": "string",
|
|
121
|
+
"description": "City name"
|
|
122
|
+
}
|
|
123
|
+
},
|
|
124
|
+
"required": ["location"]
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
}
|
|
93
128
|
],
|
|
129
|
+
"parallel_tool_calls": true,
|
|
130
|
+
"tool_choice": "auto",
|
|
94
131
|
}
|
|
95
132
|
```
|
|
96
133
|
|
|
@@ -129,7 +166,7 @@ import { getReasoning } from "../../source/chat-completion.ts";
|
|
|
129
166
|
|
|
130
167
|
// In your test:
|
|
131
168
|
|
|
132
|
-
const reasoning = getReasoning(
|
|
169
|
+
const reasoning = getReasoning(message);
|
|
133
170
|
```
|
|
134
171
|
|
|
135
172
|
This ensures your test will use the correct reasoning content data regardless
|