@syntheticlab/synbad 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/README.md +111 -26
  2. package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +2 -2
  3. package/dist/evals/reasoning/multiturn-reasoning-parsing.js +2 -2
  4. package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +2 -2
  5. package/dist/evals/reasoning/reasoning-claude-tool-call.js +1 -2
  6. package/dist/evals/reasoning/reasoning-parsing.d.ts +2 -2
  7. package/dist/evals/reasoning/reasoning-parsing.js +4 -4
  8. package/dist/evals/reasoning/response-in-reasoning.d.ts +45 -0
  9. package/dist/evals/reasoning/response-in-reasoning.js +59 -0
  10. package/dist/evals/tools/claude-dash.d.ts +2 -2
  11. package/dist/evals/tools/claude-dash.js +1 -2
  12. package/dist/evals/tools/crush-list-files.d.ts +2 -5
  13. package/dist/evals/tools/crush-list-files.js +6 -8
  14. package/dist/evals/tools/multi-turn-tools.d.ts +46 -0
  15. package/dist/evals/tools/multi-turn-tools.js +100 -0
  16. package/dist/evals/tools/no-fn-args.d.ts +22 -0
  17. package/dist/evals/tools/no-fn-args.js +31 -0
  18. package/dist/evals/tools/octo-list-no-optional-args.d.ts +209 -0
  19. package/dist/evals/tools/octo-list-no-optional-args.js +73 -0
  20. package/dist/evals/tools/parallel-tool.d.ts +2 -2
  21. package/dist/evals/tools/parallel-tool.js +1 -2
  22. package/dist/evals/tools/simple-tool.d.ts +2 -2
  23. package/dist/evals/tools/simple-tool.js +3 -2
  24. package/dist/evals/tools/tool-dash-underscore.d.ts +26 -0
  25. package/dist/evals/tools/tool-dash-underscore.js +37 -0
  26. package/dist/evals/tools/tool-path-corruption.d.ts +26 -0
  27. package/dist/evals/tools/tool-path-corruption.js +41 -0
  28. package/dist/source/asserts.d.ts +4 -1
  29. package/dist/source/asserts.js +36 -0
  30. package/dist/source/chat-completion.d.ts +5 -0
  31. package/dist/source/chat-completion.js +1 -0
  32. package/dist/source/evals.d.ts +9 -0
  33. package/dist/source/evals.js +53 -0
  34. package/dist/source/evals.test.d.ts +1 -0
  35. package/dist/source/evals.test.js +12 -0
  36. package/dist/source/exports.d.ts +2 -0
  37. package/dist/source/exports.js +1 -0
  38. package/dist/source/index.js +204 -38
  39. package/evals/reasoning/multiturn-reasoning-parsing.ts +3 -3
  40. package/evals/reasoning/reasoning-claude-tool-call.ts +2 -3
  41. package/evals/reasoning/reasoning-parsing.ts +5 -5
  42. package/evals/reasoning/response-in-reasoning.ts +65 -0
  43. package/evals/tools/claude-dash.ts +2 -3
  44. package/evals/tools/crush-list-files.ts +11 -13
  45. package/evals/tools/multi-turn-tools.ts +104 -0
  46. package/evals/tools/no-fn-args.ts +34 -0
  47. package/evals/tools/octo-list-no-optional-args.ts +81 -0
  48. package/evals/tools/parallel-tool.ts +2 -3
  49. package/evals/tools/simple-tool.ts +4 -3
  50. package/evals/tools/tool-dash-underscore.ts +40 -0
  51. package/evals/tools/tool-path-corruption.ts +46 -0
  52. package/package.json +10 -3
  53. package/source/asserts.ts +37 -1
  54. package/source/chat-completion.ts +6 -0
  55. package/source/evals.test.ts +13 -0
  56. package/source/evals.ts +56 -0
  57. package/source/exports.ts +2 -0
  58. package/source/index.ts +246 -38
package/README.md CHANGED
@@ -8,32 +8,70 @@ inference quality as high as possible.
8
8
  If you find bugs in Synthetic's model hosting, please contribute the bugs here!
9
9
  We will fix them.
10
10
 
11
+ ## Install
12
+
13
+ Synbad is distributed through npm. Install it with:
14
+
15
+ ```bash
16
+ npm install -g @syntheticlab/synbad
17
+ ```
18
+
11
19
  ## Results
12
20
 
13
- We keep a running tally of provider+model results for GLM-4.6, Kimi K2
14
- Thinking, and MiniMax M2. Feel free to add more provider results!
21
+ We keep a running tally of provider+model results for tool calling and
22
+ reasoning parsing for GLM-4.7, Kimi K2 Thinking, and MiniMax M2. Feel free to
23
+ add more provider results!
15
24
 
16
25
  |Provider |Model |Success Rate|
17
26
  |---------|----------------|------------|
18
- |Synthetic|GLM-4.6 |:white_check_mark: 100%|
19
- |Synthetic|Kimi K2 Thinking|:white_check_mark: 100%|
20
- |Synthetic|MiniMax M2 |:white_check_mark: 100%|
27
+ |Synthetic.new|GLM-4.7 |:white_check_mark: 100%|
28
+ |Synthetic.new|Kimi K2 Thinking|:white_check_mark: 100%|
29
+ |Synthetic.new|MiniMax M2 |:white_check_mark: 100%|
21
30
 
22
31
  |Provider |Model |Success Rate|
23
32
  |---------|----------------|------------|
24
- |Fireworks|GLM-4.6 |:white_check_mark: 100%|
25
- |Fireworks|Kimi K2 Thinking|:x: 86%|
26
- |Fireworks|MiniMax M2 |:x: 29%|
33
+ |Fireworks|GLM-4.7 |:x: 83%|
34
+ |Fireworks|Kimi K2 Thinking|:x: 92%|
35
+ |Fireworks|MiniMax M2 |:white_check_mark: 100%|
27
36
 
28
37
  |Provider |Model |Success Rate|
29
38
  |---------|----------------|------------|
30
- |Together |GLM-4.6 |:white_check_mark: 100%|
31
- |Together |Kimi K2 Thinking|:x: 71%|
39
+ |Together |Kimi K2 Thinking|:x: 66%|
32
40
 
33
41
  |Provider |Model |Success Rate|
34
42
  |---------|----------------|------------|
35
- |Parasail |GLM-4.6 |:x: 71%|
36
- |Parasail |Kimi K2 Thinking|:x: 57%|
43
+ |Parasail |GLM-4.7 |:x: 83%|
44
+ |Parasail |Kimi K2 Thinking|:x: 75%|
45
+
46
+ Note for attempting reproductions: generally all tests are reproducible with
47
+ `--count 1` and `--count 1 --stream`, but for evaluating the
48
+ response-in-reasoning eval, you generally will need a high count to reproduce
49
+ the bug: `--count 40` and `--count 40 --stream` are typically sufficient.
50
+
51
+ All evals must pass both with and without Synbad's `--stream` parameter (which
52
+ tests streaming APIs) to be considered a pass.
53
+
54
+ ## How do I contribute inference bugs?
55
+
56
+ If you already have some problematic JSON, head over to the
57
+ [Contributing](#contributing) section. If you don't, don't worry! Synbad makes
58
+ it easy to capture the problematic JSON you're encountering.
59
+
60
+ First, run the Synbad Proxy, specifying the local port you want to use and the
61
+ inference host you want to target. For example, to forward requests from
62
+ `localhost:3000` to Synthetic's API, you'd do:
63
+
64
+ ```bash
65
+ synbad proxy -p 3000 -t https://api.synthetic.new/openai/v1
66
+ ```
67
+
68
+ Then, configure your coding agent — or whichever local tool you're using — to
69
+ point to `http://localhost:3000` (or whichever port you selected). The Synbad
70
+ Proxy will log all request bodies to `stdout`, so all you need to do is
71
+ reproduce the bug by using your tool or coding agent, and then copy the JSON it
72
+ printed to `stdout`.
73
+
74
+ Now you have reproducible JSON to file a bug via Synbad!
37
75
 
38
76
  ## Contributing
39
77
 
@@ -49,25 +87,47 @@ TypeScript. You need to export two things from an eval:
49
87
  1. The JSON that reproduces the problem, as the const `json`. It doesn't have to
50
88
  reproduce it 100% of the time; if the bug appears even 5% of the time,
51
89
  that's fine.
52
- 2. A `test` function that runs some asserts on the output of the response,
90
+ 2. A `test` function that runs some asserts on the returned assistant message,
53
91
  which detect the error.
54
92
 
55
- For example, we can test reasoning parsing very simply (as we do in the
56
- `evals/reasoning/reasoning-parsing.ts` file):
93
+ For example, we can test parallel tool call support very simply (as we do in the
94
+ `evals/tools/parallel-tool.ts` file):
57
95
 
58
96
  ```typescript
59
97
  import * as assert from "../../source/asserts.ts";
60
- import { ChatResponse } from "../../source/chat-completion.ts";
98
+ import { ChatMessage } from "../../source/chat-completion.ts";
61
99
 
62
- export function test(response: ChatResponse) {
63
- const reasoning = response.choices[0].message.reasoning_content;
64
- assert.isNotNullish(reasoning);
100
+ export function test({ tool_calls }: ChatMessage) {
101
+ assert.isNotNullish(tool_calls);
102
+ assert.isNotEmptyArray(tool_calls);
103
+ assert.strictEqual(tool_calls.length, 2);
65
104
  }
66
105
 
67
106
  export const json = {
68
- messages: [
69
- { role: "user", content: "Why does 1+1=2?" },
107
+ "messages": [
108
+ {"role": "user", "content": "What's the weather in Paris and London?"}
70
109
  ],
110
+ "tools": [
111
+ {
112
+ "type": "function",
113
+ "function": {
114
+ "name": "get_weather",
115
+ "description": "Get current weather for a location",
116
+ "parameters": {
117
+ "type": "object",
118
+ "properties": {
119
+ "location": {
120
+ "type": "string",
121
+ "description": "City name"
122
+ }
123
+ },
124
+ "required": ["location"]
125
+ }
126
+ }
127
+ }
128
+ ],
129
+ "parallel_tool_calls": true,
130
+ "tool_choice": "auto",
71
131
  }
72
132
  ```
73
133
 
@@ -75,18 +135,43 @@ The `asserts.ts` file re-exports all of the built-in NodeJS assertion
75
135
  functions, and also adds a few extra ones, e.g. `isNotNullish` which checks
76
136
  that an object is neither `null` nor `undefined`.
77
137
 
78
- To run your new eval, use the `synbad.sh` script in this repo. Assuming you're
79
- testing the `evals/reasoning/reasoning-parsing` test, for GLM-4.6 on Synthetic,
80
- and you want to run it 5 times since it isn't consistently failing:
138
+ To run your new eval, use the `synbad.sh` script in this repo, which
139
+ auto-recompiles everything (including your new test!) before running the evals.
140
+ Assuming you're testing the `evals/reasoning/reasoning-parsing` test, for
141
+ GLM-4.6 on Synthetic, and you want to run it 5 times since it isn't
142
+ consistently failing:
81
143
 
82
144
  ```bash
83
- ./synbad.sh --env-var SYNTHETIC_API_KEY \
145
+ ./synbad.sh eval --env-var SYNTHETIC_API_KEY \
84
146
  --base-url "https://api.synthetic.new/openai/v1" \
85
147
  --only evals/reasoning/reasoning-parsing \
86
148
  --model "hf:zai-org/GLM-4.6" \
87
149
  --count 5
88
150
  ```
89
151
 
152
+ ### Handling reasoning parsing
153
+
154
+ The OpenAI spec didn't originally include reasoning content parsing, since the
155
+ original OpenAI models didn't reason. The open-source community added support
156
+ for reasoning later, but there are two competing specs:
157
+
158
+ 1. Storing the reasoning content in `message.reasoning_content`, or
159
+ 2. Storing the reasoning content in `message.reasoning`.
160
+
161
+ To make sure your evals work with a wider range of inference providers, use
162
+ the `getReasoning` function when testing reasoning parsing like so:
163
+
164
+ ```typescript
165
+ import { getReasoning } from "../../source/chat-completion.ts";
166
+
167
+ // In your test:
168
+
169
+ const reasoning = getReasoning(message);
170
+ ```
171
+
172
+ This ensures your test will use the correct reasoning content data regardless
173
+ of which spec the underlying inference provider is using.
174
+
90
175
  ## Running Synbad
91
176
 
92
177
  First, install it:
@@ -98,7 +183,7 @@ npm install -g @syntheticlab/synbad
98
183
  Then run:
99
184
 
100
185
  ```bash
101
- synbad --env-var SYNTHETIC_API_KEY \
186
+ synbad eval --env-var SYNTHETIC_API_KEY \
102
187
  --base-url "https://api.synthetic.new/openai/v1" \
103
188
  --model "hf:zai-org/GLM-4.6"
104
189
  ```
@@ -1,5 +1,5 @@
1
- import { ChatResponse } from "../../source/chat-completion.ts";
2
- export declare function test(response: ChatResponse): void;
1
+ import { ChatMessage } from "../../source/chat-completion.ts";
2
+ export declare function test(message: ChatMessage): void;
3
3
  export declare const json: {
4
4
  messages: ({
5
5
  role: string;
@@ -1,7 +1,7 @@
1
1
  import * as assert from "../../source/asserts.js";
2
2
  import { getReasoning } from "../../source/chat-completion.js";
3
- export function test(response) {
4
- const reasoning = getReasoning(response.choices[0].message);
3
+ export function test(message) {
4
+ const reasoning = getReasoning(message);
5
5
  assert.isNotNullish(reasoning);
6
6
  }
7
7
  export const json = {
@@ -1,5 +1,5 @@
1
- import { ChatResponse } from "../../source/chat-completion.ts";
2
- export declare function test(response: ChatResponse): void;
1
+ import { ChatMessage } from "../../source/chat-completion.ts";
2
+ export declare function test({ tool_calls }: ChatMessage): void;
3
3
  export declare const json: {
4
4
  messages: ({
5
5
  role: string;
@@ -1,6 +1,5 @@
1
1
  import * as assert from "../../source/asserts.js";
2
- export function test(response) {
3
- const { tool_calls } = response.choices[0].message;
2
+ export function test({ tool_calls }) {
4
3
  assert.isNotNullish(tool_calls);
5
4
  assert.isNotEmptyArray(tool_calls);
6
5
  assert.strictEqual(tool_calls.length, 1);
@@ -1,5 +1,5 @@
1
- import { ChatResponse } from "../../source/chat-completion.ts";
2
- export declare function test(response: ChatResponse): void;
1
+ import { ChatMessage } from "../../source/chat-completion.ts";
2
+ export declare function test(message: ChatMessage): void;
3
3
  export declare const json: {
4
4
  messages: {
5
5
  role: string;
@@ -1,11 +1,11 @@
1
1
  import * as assert from "../../source/asserts.js";
2
2
  import { getReasoning } from "../../source/chat-completion.js";
3
- export function test(response) {
4
- const reasoning = getReasoning(response.choices[0].message);
3
+ export function test(message) {
4
+ const reasoning = getReasoning(message);
5
5
  assert.isNotNullish(reasoning);
6
6
  }
7
7
  export const json = {
8
- "messages": [
9
- { "role": "user", "content": "Why does 1+1=2?" }
8
+ messages: [
9
+ { role: "user", content: "Why does 1+1=2?" }
10
10
  ],
11
11
  };
@@ -0,0 +1,45 @@
1
+ import { ChatMessage } from "../../source/chat-completion.ts";
2
+ export declare function test(message: ChatMessage): void;
3
+ export declare const json: {
4
+ messages: ({
5
+ role: string;
6
+ content: string;
7
+ } | {
8
+ role: string;
9
+ content: {
10
+ type: string;
11
+ text: string;
12
+ }[];
13
+ })[];
14
+ temperature: number;
15
+ tools: {
16
+ type: string;
17
+ function: {
18
+ name: string;
19
+ description: string;
20
+ parameters: {
21
+ type: string;
22
+ properties: {
23
+ description: {
24
+ description: string;
25
+ type: string;
26
+ };
27
+ prompt: {
28
+ description: string;
29
+ type: string;
30
+ };
31
+ subagent_type: {
32
+ description: string;
33
+ type: string;
34
+ };
35
+ session_id: {
36
+ description: string;
37
+ type: string;
38
+ };
39
+ };
40
+ required: string[];
41
+ };
42
+ };
43
+ }[];
44
+ tool_choice: string;
45
+ };
@@ -0,0 +1,59 @@
1
+ import * as assert from "../../source/asserts.js";
2
+ export function test(message) {
3
+ const content = message.content;
4
+ assert.or(() => assert.isNotNullish(content), () => assert.isNotEmptyArray(message.tool_calls));
5
+ }
6
+ export const json = {
7
+ "messages": [
8
+ {
9
+ "role": "system",
10
+ "content": "When I ask you to add a feature or resolve a problem: ALWAYS start the project explorer sub-agent to build complete understanding"
11
+ },
12
+ {
13
+ "role": "user",
14
+ "content": [
15
+ {
16
+ "type": "text",
17
+ "text": "Hello"
18
+ }
19
+ ]
20
+ }
21
+ ],
22
+ "temperature": 1,
23
+ "tools": [
24
+ {
25
+ "type": "function",
26
+ "function": {
27
+ "name": "task",
28
+ "description": "Launch a new agent to handle complex, multi-step tasks autonomously.\n\nAvailable agent types and the tools they have access to:\n- general: General-purpose agent for researching complex questions and executing multi-step tasks. Use this agent to execute multiple units of work in parallel.\n- explore: Fast agent specialized for exploring codebases. Use this when you need to quickly find files by patterns (eg. \"src/components/**/*.tsx\"), search code for keywords (eg. \"API endpoints\"), or answer questions about the codebase (eg. \"how do API endpoints work?\"). When calling this agent, specify the desired thoroughness level: \"quick\" for basic searches, \"medium\" for moderate exploration, or \"very thorough\" for comprehensive analysis across multiple locations and naming conventions.\n- code-reviewer: Expert code review specialist. Proactively reviews code for quality, security, and maintainability. Use immediately after writing or modifying code.\n\nWhen using the Task tool, you must specify a subagent_type parameter to select which agent type to use.\n\nWhen to use the Task tool:\n- When you are instructed to execute custom slash commands. Use the Task tool with the slash command invocation as the entire prompt. The slash command can take arguments. For example: Task(description=\"Check the file\", prompt=\"/check-file path/to/file.py\")\n\nWhen NOT to use the Task tool:\n- If you want to read a specific file path, use the Read or Glob tool instead of the Task tool, to find the match more quickly\n- If you are searching for a specific class definition like \"class Foo\", use the Glob tool instead, to find the match more quickly\n- If you are searching for code within a specific file or set of 2-3 files, use the Read tool instead of the Task tool, to find the match more quickly\n- Other tasks that are not related to the agent descriptions above\n\n\nUsage notes:\n1. 
Launch multiple agents concurrently whenever possible, to maximize performance; to do that, use a single message with multiple tool uses\n2. When the agent is done, it will return a single message back to you. The result returned by the agent is not visible to the user. To show the user the result, you should send a text message back to the user with a concise summary of the result.\n3. Each agent invocation is stateless unless you provide a session_id. Your prompt should contain a highly detailed task description for the agent to perform autonomously and you should specify exactly what information the agent should return back to you in its final and only message to you.\n4. The agent's outputs should generally be trusted\n5. Clearly tell the agent whether you expect it to write code or just to do research (search, file reads, web fetches, etc.), since it is not aware of the user's intent\n6. If the agent description mentions that it should be used proactively, then you should try your best to use it without the user having to ask for it first. Use your judgement.\n\n",
29
+ "parameters": {
30
+ "type": "object",
31
+ "properties": {
32
+ "description": {
33
+ "description": "A short (3-5 words) description of the task",
34
+ "type": "string"
35
+ },
36
+ "prompt": {
37
+ "description": "The task for the agent to perform",
38
+ "type": "string"
39
+ },
40
+ "subagent_type": {
41
+ "description": "The type of specialized agent to use for this task",
42
+ "type": "string"
43
+ },
44
+ "session_id": {
45
+ "description": "Existing Task session to continue",
46
+ "type": "string"
47
+ }
48
+ },
49
+ "required": [
50
+ "description",
51
+ "prompt",
52
+ "subagent_type"
53
+ ]
54
+ }
55
+ }
56
+ },
57
+ ],
58
+ "tool_choice": "auto"
59
+ };
@@ -1,5 +1,5 @@
1
- import OpenAI from "openai";
2
- export declare function test(response: OpenAI.ChatCompletion): void;
1
+ import { ChatMessage } from "../../source/chat-completion.ts";
2
+ export declare function test({ tool_calls }: ChatMessage): void;
3
3
  export declare const json: {
4
4
  messages: ({
5
5
  role: string;
@@ -1,6 +1,5 @@
1
1
  import * as assert from "../../source/asserts.js";
2
- export function test(response) {
3
- const { tool_calls } = response.choices[0].message;
2
+ export function test({ tool_calls }) {
4
3
  assert.isNotNullish(tool_calls);
5
4
  assert.isNotEmptyArray(tool_calls);
6
5
  }
@@ -1,5 +1,5 @@
1
- import OpenAI from "openai";
2
- export declare function test(response: OpenAI.ChatCompletion): void;
1
+ import { ChatMessage } from "../../source/chat-completion.ts";
2
+ export declare function test({ tool_calls }: ChatMessage): void;
3
3
  export declare const json: {
4
4
  messages: {
5
5
  content: string;
@@ -670,7 +670,4 @@ export declare const json: {
670
670
  })[];
671
671
  tool_choice: string;
672
672
  max_tokens: number;
673
- stream_options: {
674
- include_usage: boolean;
675
- };
676
673
  };
@@ -1,15 +1,16 @@
1
1
  import * as assert from "../../source/asserts.js";
2
- export function test(response) {
3
- const { tool_calls } = response.choices[0].message;
2
+ export function test({ tool_calls }) {
4
3
  assert.isNotNullish(tool_calls);
5
4
  assert.isNotEmptyArray(tool_calls);
6
- assert.strictEqual(tool_calls.length, 1);
5
+ assert.gte(tool_calls.length, 1);
7
6
  assert.strictEqual(tool_calls[0].type, "function");
8
7
  const fn = tool_calls[0].function;
9
8
  assert.or(() => {
10
9
  assert.strictEqual(fn.name, "ls");
11
- const args = JSON.parse(fn.arguments);
12
- assert.or(() => assert.strictEqual(args.path, "/home/reissbaker/Hack/scratch-scripts"), () => assert.strictEqual(args.path, "."), () => assert.isNullish(args.path));
10
+ if (fn.arguments) {
11
+ const args = JSON.parse(fn.arguments);
12
+ assert.or(() => assert.strictEqual(args.path, "/home/reissbaker/Hack/scratch-scripts"), () => assert.strictEqual(args.path, "."), () => assert.isNullish(args.path));
13
+ }
13
14
  }, () => {
14
15
  assert.strictEqual(fn.name, "bash");
15
16
  const args = JSON.parse(fn.arguments);
@@ -435,7 +436,4 @@ export const json = {
435
436
  ],
436
437
  "tool_choice": "auto",
437
438
  "max_tokens": 60000,
438
- "stream_options": {
439
- "include_usage": true
440
- }
441
439
  };
@@ -0,0 +1,46 @@
1
+ import { ChatMessage } from "../../source/chat-completion.ts";
2
+ export declare function test({ tool_calls }: ChatMessage): void;
3
+ export declare const json: {
4
+ messages: ({
5
+ role: string;
6
+ content: string;
7
+ tool_calls?: undefined;
8
+ tool_call_id?: undefined;
9
+ } | {
10
+ role: string;
11
+ tool_calls: {
12
+ id: string;
13
+ type: string;
14
+ function: {
15
+ name: string;
16
+ arguments: string;
17
+ };
18
+ }[];
19
+ content?: undefined;
20
+ tool_call_id?: undefined;
21
+ } | {
22
+ role: string;
23
+ tool_call_id: string;
24
+ content: string;
25
+ tool_calls?: undefined;
26
+ })[];
27
+ tools: {
28
+ type: string;
29
+ function: {
30
+ name: string;
31
+ description: string;
32
+ parameters: {
33
+ type: string;
34
+ properties: {
35
+ location: {
36
+ type: string;
37
+ description: string;
38
+ };
39
+ };
40
+ required: string[];
41
+ };
42
+ };
43
+ }[];
44
+ parallel_tool_calls: boolean;
45
+ tool_choice: string;
46
+ };
@@ -0,0 +1,100 @@
1
+ import * as assert from "../../source/asserts.js";
2
+ export function test({ tool_calls }) {
3
+ assert.isNotNullish(tool_calls);
4
+ assert.isNotEmptyArray(tool_calls);
5
+ assert.gte(tool_calls.length, 1);
6
+ assert.ok(tool_calls.some(tool_call => {
7
+ if (tool_call.type === "function" && tool_call.function.name === "get_weather") {
8
+ const location = JSON.parse(tool_call.function.arguments).location;
9
+ if (typeof location === "string") {
10
+ return location.toLowerCase().match(/las vegas/);
11
+ }
12
+ }
13
+ return false;
14
+ }), "At least one tool call must be get_weather({ location: 'las_vegas' })");
15
+ }
16
+ export const json = {
17
+ "messages": [
18
+ {
19
+ role: "user",
20
+ content: "What's the weather in Paris?"
21
+ },
22
+ {
23
+ role: "assistant",
24
+ tool_calls: [
25
+ {
26
+ id: "gw1",
27
+ type: "function",
28
+ function: {
29
+ name: "get_weather",
30
+ arguments: JSON.stringify({
31
+ location: "Paris, France",
32
+ }),
33
+ },
34
+ },
35
+ ],
36
+ },
37
+ {
38
+ role: "tool",
39
+ tool_call_id: "gw1",
40
+ content: "The weather in Paris is 24 degrees Celsius",
41
+ },
42
+ {
43
+ role: "assistant",
44
+ content: "I've looked up the weather in Paris, and it's a comfy 24 degrees Celsius today.",
45
+ },
46
+ {
47
+ role: "user",
48
+ content: "I meant Paris, Texas",
49
+ },
50
+ {
51
+ role: "assistant",
52
+ tool_calls: [
53
+ {
54
+ id: "gw2",
55
+ type: "function",
56
+ function: {
57
+ name: "get_weather",
58
+ arguments: JSON.stringify({
59
+ location: "Paris, Texas",
60
+ }),
61
+ },
62
+ },
63
+ ],
64
+ },
65
+ {
66
+ role: "tool",
67
+ tool_call_id: "gw2",
68
+ content: "The weather in Paris, Texas is 34 degrees Celsius",
69
+ },
70
+ {
71
+ role: "assistant",
72
+ content: "I've looked up the weather in Paris, Texas and it's a scorching 24 degrees Celsius today.",
73
+ },
74
+ {
75
+ role: "user",
76
+ content: "How about Las Vegas",
77
+ },
78
+ ],
79
+ "tools": [
80
+ {
81
+ "type": "function",
82
+ "function": {
83
+ "name": "get_weather",
84
+ "description": "Get current weather for a location",
85
+ "parameters": {
86
+ "type": "object",
87
+ "properties": {
88
+ "location": {
89
+ "type": "string",
90
+ "description": "City name"
91
+ }
92
+ },
93
+ "required": ["location"]
94
+ }
95
+ }
96
+ }
97
+ ],
98
+ "parallel_tool_calls": true,
99
+ "tool_choice": "auto",
100
+ };
@@ -0,0 +1,22 @@
1
+ import { ChatMessage } from "../../source/chat-completion.ts";
2
+ export declare function test({ tool_calls }: ChatMessage): void;
3
+ export declare const json: {
4
+ messages: {
5
+ role: string;
6
+ content: string;
7
+ }[];
8
+ tools: {
9
+ type: string;
10
+ function: {
11
+ name: string;
12
+ description: string;
13
+ parameters: {
14
+ $schema: string;
15
+ type: string;
16
+ properties: {};
17
+ additionalProperties: boolean;
18
+ };
19
+ };
20
+ }[];
21
+ tool_choice: string;
22
+ };
@@ -0,0 +1,31 @@
1
+ import * as assert from "../../source/asserts.js";
2
+ export function test({ tool_calls }) {
3
+ assert.isNotNullish(tool_calls);
4
+ assert.isNotEmptyArray(tool_calls);
5
+ assert.strictEqual(tool_calls.length, 1);
6
+ assert.strictEqual(tool_calls[0].type, "function");
7
+ }
8
+ export const json = {
9
+ "messages": [
10
+ {
11
+ "role": "user",
12
+ "content": "read the todos",
13
+ },
14
+ ],
15
+ "tools": [
16
+ {
17
+ "type": "function",
18
+ "function": {
19
+ "name": "get_todo_items",
20
+ "description": "Retrieves the current list of todo items, including their names and completion statuses.",
21
+ "parameters": {
22
+ "$schema": "http://json-schema.org/draft-07/schema#",
23
+ "type": "object",
24
+ "properties": {},
25
+ "additionalProperties": false
26
+ }
27
+ }
28
+ },
29
+ ],
30
+ "tool_choice": "auto",
31
+ };