@syntheticlab/synbad 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/README.md +60 -23
  2. package/dist/evals/reasoning/custom-harness.d.ts +679 -0
  3. package/dist/evals/reasoning/custom-harness.js +847 -0
  4. package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +2 -2
  5. package/dist/evals/reasoning/multiturn-reasoning-parsing.js +2 -2
  6. package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +2 -2
  7. package/dist/evals/reasoning/reasoning-claude-tool-call.js +1 -2
  8. package/dist/evals/reasoning/reasoning-parsing.d.ts +2 -2
  9. package/dist/evals/reasoning/reasoning-parsing.js +2 -2
  10. package/dist/evals/reasoning/response-in-reasoning.d.ts +45 -0
  11. package/dist/evals/reasoning/response-in-reasoning.js +59 -0
  12. package/dist/evals/tools/claude-dash.d.ts +2 -2
  13. package/dist/evals/tools/claude-dash.js +1 -2
  14. package/dist/evals/tools/crush-list-files.d.ts +2 -5
  15. package/dist/evals/tools/crush-list-files.js +6 -8
  16. package/dist/evals/tools/multi-turn-tools.d.ts +46 -0
  17. package/dist/evals/tools/multi-turn-tools.js +100 -0
  18. package/dist/evals/tools/no-fn-args.d.ts +22 -0
  19. package/dist/evals/tools/no-fn-args.js +31 -0
  20. package/dist/evals/tools/octo-list-no-optional-args.d.ts +209 -0
  21. package/dist/evals/tools/octo-list-no-optional-args.js +73 -0
  22. package/dist/evals/tools/parallel-tool.d.ts +2 -2
  23. package/dist/evals/tools/parallel-tool.js +1 -2
  24. package/dist/evals/tools/simple-tool.d.ts +2 -2
  25. package/dist/evals/tools/simple-tool.js +3 -2
  26. package/dist/evals/tools/tool-dash-underscore.d.ts +26 -0
  27. package/dist/evals/tools/tool-dash-underscore.js +37 -0
  28. package/dist/evals/tools/tool-path-corruption.d.ts +26 -0
  29. package/dist/evals/tools/tool-path-corruption.js +41 -0
  30. package/dist/source/asserts.d.ts +4 -1
  31. package/dist/source/asserts.js +36 -0
  32. package/dist/source/chat-completion.d.ts +5 -0
  33. package/dist/source/chat-completion.js +1 -0
  34. package/dist/source/evals.d.ts +9 -0
  35. package/dist/source/evals.js +53 -0
  36. package/dist/source/evals.test.d.ts +1 -0
  37. package/dist/source/evals.test.js +12 -0
  38. package/dist/source/exports.d.ts +2 -0
  39. package/dist/source/exports.js +1 -0
  40. package/dist/source/index.js +103 -43
  41. package/evals/reasoning/multiturn-reasoning-parsing.ts +3 -3
  42. package/evals/reasoning/reasoning-claude-tool-call.ts +2 -3
  43. package/evals/reasoning/reasoning-parsing.ts +3 -3
  44. package/evals/reasoning/response-in-reasoning.ts +65 -0
  45. package/evals/tools/claude-dash.ts +2 -3
  46. package/evals/tools/crush-list-files.ts +11 -13
  47. package/evals/tools/multi-turn-tools.ts +104 -0
  48. package/evals/tools/no-fn-args.ts +34 -0
  49. package/evals/tools/octo-list-no-optional-args.ts +81 -0
  50. package/evals/tools/parallel-tool.ts +2 -3
  51. package/evals/tools/simple-tool.ts +4 -3
  52. package/evals/tools/tool-dash-underscore.ts +40 -0
  53. package/evals/tools/tool-path-corruption.ts +46 -0
  54. package/package.json +10 -3
  55. package/source/asserts.ts +37 -1
  56. package/source/chat-completion.ts +6 -0
  57. package/source/evals.test.ts +13 -0
  58. package/source/evals.ts +56 -0
  59. package/source/exports.ts +2 -0
  60. package/source/index.ts +121 -44
package/README.md CHANGED
@@ -8,32 +8,48 @@ inference quality as high as possible.
8
8
  If you find bugs in Synthetic's model hosting, please contribute the bugs here!
9
9
  We will fix them.
10
10
 
11
+ ## Install
12
+
13
+ Synbad is distributed through npm. Install it with:
14
+
15
+ ```bash
16
+ npm install -g @syntheticlab/synbad
17
+ ```
18
+
11
19
  ## Results
12
20
 
13
- We keep a running tally of provider+model results for GLM-4.6, Kimi K2
14
- Thinking, and MiniMax M2. Feel free to add more provider results!
21
+ We keep a running tally of provider+model results for tool calling and
22
+ reasoning parsing for GLM-4.7, Kimi K2 Thinking, and MiniMax M2. Feel free to
23
+ add more provider results!
15
24
 
16
25
  |Provider |Model |Success Rate|
17
26
  |---------|----------------|------------|
18
- |Synthetic|GLM-4.6 |:white_check_mark: 100%|
19
- |Synthetic|Kimi K2 Thinking|:white_check_mark: 100%|
20
- |Synthetic|MiniMax M2 |:white_check_mark: 100%|
27
+ |Synthetic.new|GLM-4.7 |:white_check_mark: 100%|
28
+ |Synthetic.new|Kimi K2 Thinking|:white_check_mark: 100%|
29
+ |Synthetic.new|MiniMax M2 |:white_check_mark: 100%|
21
30
 
22
31
  |Provider |Model |Success Rate|
23
32
  |---------|----------------|------------|
24
- |Fireworks|GLM-4.6 |:white_check_mark: 100%|
25
- |Fireworks|Kimi K2 Thinking|:x: 86%|
26
- |Fireworks|MiniMax M2 |:x: 29%|
33
+ |Fireworks|GLM-4.7 |:x: 83%|
34
+ |Fireworks|Kimi K2 Thinking|:x: 92%|
35
+ |Fireworks|MiniMax M2 |:white_check_mark: 100%|
27
36
 
28
37
  |Provider |Model |Success Rate|
29
38
  |---------|----------------|------------|
30
- |Together |GLM-4.6 |:white_check_mark: 100%|
31
- |Together |Kimi K2 Thinking|:x: 71%|
39
+ |Together |Kimi K2 Thinking|:x: 66%|
32
40
 
33
41
  |Provider |Model |Success Rate|
34
42
  |---------|----------------|------------|
35
- |Parasail |GLM-4.6 |:x: 71%|
36
- |Parasail |Kimi K2 Thinking|:x: 57%|
43
+ |Parasail |GLM-4.7 |:x: 83%|
44
+ |Parasail |Kimi K2 Thinking|:x: 75%|
45
+
46
+ Note for attempting reproductions: generally, all tests are reproducible with
47
+ `--count 1` and `--count 1 --stream`, but for evaluating the
48
+ response-in-reasoning eval, you will generally need a high count to reproduce
49
+ the bug: `--count 40` and `--count 40 --stream` are typically sufficient.
50
+
51
+ All evals must pass both with and without Synbad's `--stream` parameter (which
52
+ tests streaming APIs) to be considered a pass.
37
53
 
38
54
  ## How do I contribute inference bugs?
39
55
 
@@ -71,26 +87,47 @@ TypeScript. You need to export two things from an eval:
71
87
  1. The JSON that reproduces the problem, as the const `json`. It doesn't have to
72
88
  reproduce it 100% of the time; if the bug appears even 5% of the time,
73
89
  that's fine.
74
- 2. A `test` function that runs some asserts on the output of the response,
90
+ 2. A `test` function that runs some asserts on the returned assistant message,
75
91
  which detect the error.
76
92
 
77
- For example, we can test reasoning parsing very simply (as we do in the
78
- `evals/reasoning/reasoning-parsing.ts` file):
93
+ For example, we can test parallel tool call support very simply (as we do in the
94
+ `evals/tools/parallel-tool.ts` file):
79
95
 
80
96
  ```typescript
81
97
  import * as assert from "../../source/asserts.ts";
82
- import { ChatResponse, getReasoning } from "../../source/chat-completion.ts";
98
+ import { ChatMessage } from "../../source/chat-completion.ts";
83
99
 
84
- export function test(response: ChatResponse) {
85
- const reasoning = getReasoning(response.choices[0].message);
86
- assert.isNotNullish(reasoning);
100
+ export function test({ tool_calls }: ChatMessage) {
101
+ assert.isNotNullish(tool_calls);
102
+ assert.isNotEmptyArray(tool_calls);
103
+ assert.strictEqual(tool_calls.length, 2);
87
104
  }
88
105
 
89
- // Insert your JSON. You can paste your results from the Synbad proxy here.
90
106
  export const json = {
91
- messages: [
92
- { role: "user", content: "Why does 1+1=2?" }
107
+ "messages": [
108
+ {"role": "user", "content": "What's the weather in Paris and London?"}
109
+ ],
110
+ "tools": [
111
+ {
112
+ "type": "function",
113
+ "function": {
114
+ "name": "get_weather",
115
+ "description": "Get current weather for a location",
116
+ "parameters": {
117
+ "type": "object",
118
+ "properties": {
119
+ "location": {
120
+ "type": "string",
121
+ "description": "City name"
122
+ }
123
+ },
124
+ "required": ["location"]
125
+ }
126
+ }
127
+ }
93
128
  ],
129
+ "parallel_tool_calls": true,
130
+ "tool_choice": "auto",
94
131
  }
95
132
  ```
96
133
 
@@ -129,7 +166,7 @@ import { getReasoning } from "../../source/chat-completion.ts";
129
166
 
130
167
  // In your test:
131
168
 
132
- const reasoning = getReasoning(response.choices[0].message);
169
+ const reasoning = getReasoning(message);
133
170
  ```
134
171
 
135
172
  This ensures your test will use the correct reasoning content data regardless