@syntheticlab/synbad 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +111 -26
- package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +2 -2
- package/dist/evals/reasoning/multiturn-reasoning-parsing.js +2 -2
- package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +2 -2
- package/dist/evals/reasoning/reasoning-claude-tool-call.js +1 -2
- package/dist/evals/reasoning/reasoning-parsing.d.ts +2 -2
- package/dist/evals/reasoning/reasoning-parsing.js +4 -4
- package/dist/evals/reasoning/response-in-reasoning.d.ts +45 -0
- package/dist/evals/reasoning/response-in-reasoning.js +59 -0
- package/dist/evals/tools/claude-dash.d.ts +2 -2
- package/dist/evals/tools/claude-dash.js +1 -2
- package/dist/evals/tools/crush-list-files.d.ts +2 -5
- package/dist/evals/tools/crush-list-files.js +6 -8
- package/dist/evals/tools/multi-turn-tools.d.ts +46 -0
- package/dist/evals/tools/multi-turn-tools.js +100 -0
- package/dist/evals/tools/no-fn-args.d.ts +22 -0
- package/dist/evals/tools/no-fn-args.js +31 -0
- package/dist/evals/tools/octo-list-no-optional-args.d.ts +209 -0
- package/dist/evals/tools/octo-list-no-optional-args.js +73 -0
- package/dist/evals/tools/parallel-tool.d.ts +2 -2
- package/dist/evals/tools/parallel-tool.js +1 -2
- package/dist/evals/tools/simple-tool.d.ts +2 -2
- package/dist/evals/tools/simple-tool.js +3 -2
- package/dist/evals/tools/tool-dash-underscore.d.ts +26 -0
- package/dist/evals/tools/tool-dash-underscore.js +37 -0
- package/dist/evals/tools/tool-path-corruption.d.ts +26 -0
- package/dist/evals/tools/tool-path-corruption.js +41 -0
- package/dist/source/asserts.d.ts +4 -1
- package/dist/source/asserts.js +36 -0
- package/dist/source/chat-completion.d.ts +5 -0
- package/dist/source/chat-completion.js +1 -0
- package/dist/source/evals.d.ts +9 -0
- package/dist/source/evals.js +53 -0
- package/dist/source/evals.test.d.ts +1 -0
- package/dist/source/evals.test.js +12 -0
- package/dist/source/exports.d.ts +2 -0
- package/dist/source/exports.js +1 -0
- package/dist/source/index.js +204 -38
- package/evals/reasoning/multiturn-reasoning-parsing.ts +3 -3
- package/evals/reasoning/reasoning-claude-tool-call.ts +2 -3
- package/evals/reasoning/reasoning-parsing.ts +5 -5
- package/evals/reasoning/response-in-reasoning.ts +65 -0
- package/evals/tools/claude-dash.ts +2 -3
- package/evals/tools/crush-list-files.ts +11 -13
- package/evals/tools/multi-turn-tools.ts +104 -0
- package/evals/tools/no-fn-args.ts +34 -0
- package/evals/tools/octo-list-no-optional-args.ts +81 -0
- package/evals/tools/parallel-tool.ts +2 -3
- package/evals/tools/simple-tool.ts +4 -3
- package/evals/tools/tool-dash-underscore.ts +40 -0
- package/evals/tools/tool-path-corruption.ts +46 -0
- package/package.json +10 -3
- package/source/asserts.ts +37 -1
- package/source/chat-completion.ts +6 -0
- package/source/evals.test.ts +13 -0
- package/source/evals.ts +56 -0
- package/source/exports.ts +2 -0
- package/source/index.ts +246 -38
package/README.md
CHANGED
|
@@ -8,32 +8,70 @@ inference quality as high as possible.
|
|
|
8
8
|
If you find bugs in Synthetic's model hosting, please contribute the bugs here!
|
|
9
9
|
We will fix them.
|
|
10
10
|
|
|
11
|
+
## Install
|
|
12
|
+
|
|
13
|
+
Synbad is distributed through npm. Install it with:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npm install -g @syntheticlab/synbad
|
|
17
|
+
```
|
|
18
|
+
|
|
11
19
|
## Results
|
|
12
20
|
|
|
13
|
-
We keep a running tally of provider+model results for
|
|
14
|
-
Thinking, and MiniMax M2. Feel free to
|
|
21
|
+
We keep a running tally of provider+model results for tool calling and
|
|
22
|
+
reasoning parsing for GLM-4.7, Kimi K2 Thinking, and MiniMax M2. Feel free to
|
|
23
|
+
add more provider results!
|
|
15
24
|
|
|
16
25
|
|Provider |Model |Success Rate|
|
|
17
26
|
|---------|----------------|------------|
|
|
18
|
-
|Synthetic|GLM-4.
|
|
19
|
-
|Synthetic|Kimi K2 Thinking|:white_check_mark: 100%|
|
|
20
|
-
|Synthetic|MiniMax M2 |:white_check_mark: 100%|
|
|
27
|
+
|Synthetic.new|GLM-4.7 |:white_check_mark: 100%|
|
|
28
|
+
|Synthetic.new|Kimi K2 Thinking|:white_check_mark: 100%|
|
|
29
|
+
|Synthetic.new|MiniMax M2 |:white_check_mark: 100%|
|
|
21
30
|
|
|
22
31
|
|Provider |Model |Success Rate|
|
|
23
32
|
|---------|----------------|------------|
|
|
24
|
-
|Fireworks|GLM-4.
|
|
25
|
-
|Fireworks|Kimi K2 Thinking|:x:
|
|
26
|
-
|Fireworks|MiniMax M2 |:
|
|
33
|
+
|Fireworks|GLM-4.7 |:x: 83%|
|
|
34
|
+
|Fireworks|Kimi K2 Thinking|:x: 92%|
|
|
35
|
+
|Fireworks|MiniMax M2 |:white_check_mark: 100%|
|
|
27
36
|
|
|
28
37
|
|Provider |Model |Success Rate|
|
|
29
38
|
|---------|----------------|------------|
|
|
30
|
-
|Together |
|
|
31
|
-
|Together |Kimi K2 Thinking|:x: 71%|
|
|
39
|
+
|Together |Kimi K2 Thinking|:x: 66%|
|
|
32
40
|
|
|
33
41
|
|Provider |Model |Success Rate|
|
|
34
42
|
|---------|----------------|------------|
|
|
35
|
-
|Parasail |GLM-4.
|
|
36
|
-
|Parasail |Kimi K2 Thinking|:x:
|
|
43
|
+
|Parasail |GLM-4.7 |:x: 83%|
|
|
44
|
+
|Parasail |Kimi K2 Thinking|:x: 75%|
|
|
45
|
+
|
|
46
|
+
Note for attempting reproductions: generally all tests are reproducible with
|
|
47
|
+
`--count 1` and `--count 1 --stream`, but for evaluating the
|
|
48
|
+
response-in-reasoning eval, you generally will need a high count to reproduce
|
|
49
|
+
the bug: `--count 40` and `--count 40 --stream` are typically sufficient.
|
|
50
|
+
|
|
51
|
+
All evals must pass both with and without Synbad's `--stream` parameter (which
|
|
52
|
+
tests streaming APIs) to be considered a pass.
|
|
53
|
+
|
|
54
|
+
## How do I contribute inference bugs?
|
|
55
|
+
|
|
56
|
+
If you already have some problematic JSON, head over to the
|
|
57
|
+
[Contributing](#contributing) section. If you don't, don't worry! Synbad makes
|
|
58
|
+
it easy to capture the problematic JSON you're encountering.
|
|
59
|
+
|
|
60
|
+
First, run the Synbad Proxy, specifying the local port you want to use and the
|
|
61
|
+
inference host you want to target. For example, to forward requests from
|
|
62
|
+
`localhost:3000` to Synthetic's API, you'd do:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
synbad proxy -p 3000 -t https://api.synthetic.new/openai/v1
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Then, configure your coding agent — or whichever local tool you're using — to
|
|
69
|
+
point to `http://localhost:3000` (or whichever port you selected). The Synbad
|
|
70
|
+
Proxy will log all request bodies to `stdout`, so all you need to do is
|
|
71
|
+
reproduce the bug by using your tool or coding agent, and then copy the JSON it
|
|
72
|
+
printed to `stdout`.
|
|
73
|
+
|
|
74
|
+
Now you have reproducible JSON to file a bug via Synbad!
|
|
37
75
|
|
|
38
76
|
## Contributing
|
|
39
77
|
|
|
@@ -49,25 +87,47 @@ TypeScript. You need to export two things from an eval:
|
|
|
49
87
|
1. The JSON that reproduces the problem, as the const `json`. It doesn't have to
|
|
50
88
|
reproduce it 100% of the time; if the bug appears even 5% of the time,
|
|
51
89
|
that's fine.
|
|
52
|
-
2. A `test` function that runs some asserts on the
|
|
90
|
+
2. A `test` function that runs some asserts on the returned assistant message,
|
|
53
91
|
which detect the error.
|
|
54
92
|
|
|
55
|
-
For example, we can test
|
|
56
|
-
`evals/
|
|
93
|
+
For example, we can test parallel tool call support very simply (as we do in the
|
|
94
|
+
`evals/tools/parallel-tool.ts` file):
|
|
57
95
|
|
|
58
96
|
```typescript
|
|
59
97
|
import * as assert from "../../source/asserts.ts";
|
|
60
|
-
import {
|
|
98
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
61
99
|
|
|
62
|
-
export function test(
|
|
63
|
-
|
|
64
|
-
assert.
|
|
100
|
+
export function test({ tool_calls }: ChatMessage) {
|
|
101
|
+
assert.isNotNullish(tool_calls);
|
|
102
|
+
assert.isNotEmptyArray(tool_calls);
|
|
103
|
+
assert.strictEqual(tool_calls.length, 2);
|
|
65
104
|
}
|
|
66
105
|
|
|
67
106
|
export const json = {
|
|
68
|
-
messages: [
|
|
69
|
-
{
|
|
107
|
+
"messages": [
|
|
108
|
+
{"role": "user", "content": "What's the weather in Paris and London?"}
|
|
70
109
|
],
|
|
110
|
+
"tools": [
|
|
111
|
+
{
|
|
112
|
+
"type": "function",
|
|
113
|
+
"function": {
|
|
114
|
+
"name": "get_weather",
|
|
115
|
+
"description": "Get current weather for a location",
|
|
116
|
+
"parameters": {
|
|
117
|
+
"type": "object",
|
|
118
|
+
"properties": {
|
|
119
|
+
"location": {
|
|
120
|
+
"type": "string",
|
|
121
|
+
"description": "City name"
|
|
122
|
+
}
|
|
123
|
+
},
|
|
124
|
+
"required": ["location"]
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
],
|
|
129
|
+
"parallel_tool_calls": true,
|
|
130
|
+
"tool_choice": "auto",
|
|
71
131
|
}
|
|
72
132
|
```
|
|
73
133
|
|
|
@@ -75,18 +135,43 @@ The `asserts.ts` file re-exports all of the built-in NodeJS assertion
|
|
|
75
135
|
functions, and also adds a few extra ones, e.g. `isNotNullish` which checks
|
|
76
136
|
that a value is neither `null` nor `undefined`.
|
|
77
137
|
|
|
78
|
-
To run your new eval, use the `synbad.sh` script in this repo
|
|
79
|
-
|
|
80
|
-
|
|
138
|
+
To run your new eval, use the `synbad.sh` script in this repo, which
|
|
139
|
+
auto-recompiles everything (including your new test!) before running the evals.
|
|
140
|
+
Assuming you're testing the `evals/reasoning/reasoning-parsing` test, for
|
|
141
|
+
GLM-4.6 on Synthetic, and you want to run it 5 times since it isn't
|
|
142
|
+
consistently failing:
|
|
81
143
|
|
|
82
144
|
```bash
|
|
83
|
-
./synbad.sh --env-var SYNTHETIC_API_KEY \
|
|
145
|
+
./synbad.sh eval --env-var SYNTHETIC_API_KEY \
|
|
84
146
|
--base-url "https://api.synthetic.new/openai/v1" \
|
|
85
147
|
--only evals/reasoning/reasoning-parsing \
|
|
86
148
|
--model "hf:zai-org/GLM-4.6" \
|
|
87
149
|
--count 5
|
|
88
150
|
```
|
|
89
151
|
|
|
152
|
+
### Handling reasoning parsing
|
|
153
|
+
|
|
154
|
+
The OpenAI spec didn't originally include reasoning content parsing, since the
|
|
155
|
+
original OpenAI models didn't reason. The open-source community added support
|
|
156
|
+
for reasoning later, but there are two competing specs:
|
|
157
|
+
|
|
158
|
+
1. Storing the reasoning content in `message.reasoning_content`, or
|
|
159
|
+
2. Storing the reasoning content in `message.reasoning`.
|
|
160
|
+
|
|
161
|
+
To make sure your evals work with a wider range of inference providers, use
|
|
162
|
+
the `getReasoning` function when testing reasoning parsing like so:
|
|
163
|
+
|
|
164
|
+
```typescript
|
|
165
|
+
import { getReasoning } from "../../source/chat-completion.ts";
|
|
166
|
+
|
|
167
|
+
// In your test:
|
|
168
|
+
|
|
169
|
+
const reasoning = getReasoning(message);
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
This ensures your test will use the correct reasoning content data regardless
|
|
173
|
+
of which spec the underlying inference provider is using.
|
|
174
|
+
|
|
90
175
|
## Running Synbad
|
|
91
176
|
|
|
92
177
|
First, install it:
|
|
@@ -98,7 +183,7 @@ npm install -g @syntheticlab/synbad
|
|
|
98
183
|
Then run:
|
|
99
184
|
|
|
100
185
|
```bash
|
|
101
|
-
synbad --env-var SYNTHETIC_API_KEY \
|
|
186
|
+
synbad eval --env-var SYNTHETIC_API_KEY \
|
|
102
187
|
--base-url "https://api.synthetic.new/openai/v1" \
|
|
103
188
|
--model "hf:zai-org/GLM-4.6"
|
|
104
189
|
```
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
export declare function test(
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test(message: ChatMessage): void;
|
|
3
3
|
export declare const json: {
|
|
4
4
|
messages: ({
|
|
5
5
|
role: string;
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import * as assert from "../../source/asserts.js";
|
|
2
2
|
import { getReasoning } from "../../source/chat-completion.js";
|
|
3
|
-
export function test(
|
|
4
|
-
const reasoning = getReasoning(
|
|
3
|
+
export function test(message) {
|
|
4
|
+
const reasoning = getReasoning(message);
|
|
5
5
|
assert.isNotNullish(reasoning);
|
|
6
6
|
}
|
|
7
7
|
export const json = {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
export declare function test(
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test({ tool_calls }: ChatMessage): void;
|
|
3
3
|
export declare const json: {
|
|
4
4
|
messages: ({
|
|
5
5
|
role: string;
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import * as assert from "../../source/asserts.js";
|
|
2
|
-
export function test(
|
|
3
|
-
const { tool_calls } = response.choices[0].message;
|
|
2
|
+
export function test({ tool_calls }) {
|
|
4
3
|
assert.isNotNullish(tool_calls);
|
|
5
4
|
assert.isNotEmptyArray(tool_calls);
|
|
6
5
|
assert.strictEqual(tool_calls.length, 1);
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
export declare function test(
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test(message: ChatMessage): void;
|
|
3
3
|
export declare const json: {
|
|
4
4
|
messages: {
|
|
5
5
|
role: string;
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import * as assert from "../../source/asserts.js";
|
|
2
2
|
import { getReasoning } from "../../source/chat-completion.js";
|
|
3
|
-
export function test(
|
|
4
|
-
const reasoning = getReasoning(
|
|
3
|
+
export function test(message) {
|
|
4
|
+
const reasoning = getReasoning(message);
|
|
5
5
|
assert.isNotNullish(reasoning);
|
|
6
6
|
}
|
|
7
7
|
export const json = {
|
|
8
|
-
|
|
9
|
-
{
|
|
8
|
+
messages: [
|
|
9
|
+
{ role: "user", content: "Why does 1+1=2?" }
|
|
10
10
|
],
|
|
11
11
|
};
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test(message: ChatMessage): void;
|
|
3
|
+
export declare const json: {
|
|
4
|
+
messages: ({
|
|
5
|
+
role: string;
|
|
6
|
+
content: string;
|
|
7
|
+
} | {
|
|
8
|
+
role: string;
|
|
9
|
+
content: {
|
|
10
|
+
type: string;
|
|
11
|
+
text: string;
|
|
12
|
+
}[];
|
|
13
|
+
})[];
|
|
14
|
+
temperature: number;
|
|
15
|
+
tools: {
|
|
16
|
+
type: string;
|
|
17
|
+
function: {
|
|
18
|
+
name: string;
|
|
19
|
+
description: string;
|
|
20
|
+
parameters: {
|
|
21
|
+
type: string;
|
|
22
|
+
properties: {
|
|
23
|
+
description: {
|
|
24
|
+
description: string;
|
|
25
|
+
type: string;
|
|
26
|
+
};
|
|
27
|
+
prompt: {
|
|
28
|
+
description: string;
|
|
29
|
+
type: string;
|
|
30
|
+
};
|
|
31
|
+
subagent_type: {
|
|
32
|
+
description: string;
|
|
33
|
+
type: string;
|
|
34
|
+
};
|
|
35
|
+
session_id: {
|
|
36
|
+
description: string;
|
|
37
|
+
type: string;
|
|
38
|
+
};
|
|
39
|
+
};
|
|
40
|
+
required: string[];
|
|
41
|
+
};
|
|
42
|
+
};
|
|
43
|
+
}[];
|
|
44
|
+
tool_choice: string;
|
|
45
|
+
};
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import * as assert from "../../source/asserts.js";
|
|
2
|
+
export function test(message) {
|
|
3
|
+
const content = message.content;
|
|
4
|
+
assert.or(() => assert.isNotNullish(content), () => assert.isNotEmptyArray(message.tool_calls));
|
|
5
|
+
}
|
|
6
|
+
export const json = {
|
|
7
|
+
"messages": [
|
|
8
|
+
{
|
|
9
|
+
"role": "system",
|
|
10
|
+
"content": "When I ask you to add a feature or resolve a problem: ALWAYS start the project explorer sub-agent to build complete understanding"
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"role": "user",
|
|
14
|
+
"content": [
|
|
15
|
+
{
|
|
16
|
+
"type": "text",
|
|
17
|
+
"text": "Hello"
|
|
18
|
+
}
|
|
19
|
+
]
|
|
20
|
+
}
|
|
21
|
+
],
|
|
22
|
+
"temperature": 1,
|
|
23
|
+
"tools": [
|
|
24
|
+
{
|
|
25
|
+
"type": "function",
|
|
26
|
+
"function": {
|
|
27
|
+
"name": "task",
|
|
28
|
+
"description": "Launch a new agent to handle complex, multi-step tasks autonomously.\n\nAvailable agent types and the tools they have access to:\n- general: General-purpose agent for researching complex questions and executing multi-step tasks. Use this agent to execute multiple units of work in parallel.\n- explore: Fast agent specialized for exploring codebases. Use this when you need to quickly find files by patterns (eg. \"src/components/**/*.tsx\"), search code for keywords (eg. \"API endpoints\"), or answer questions about the codebase (eg. \"how do API endpoints work?\"). When calling this agent, specify the desired thoroughness level: \"quick\" for basic searches, \"medium\" for moderate exploration, or \"very thorough\" for comprehensive analysis across multiple locations and naming conventions.\n- code-reviewer: Expert code review specialist. Proactively reviews code for quality, security, and maintainability. Use immediately after writing or modifying code.\n\nWhen using the Task tool, you must specify a subagent_type parameter to select which agent type to use.\n\nWhen to use the Task tool:\n- When you are instructed to execute custom slash commands. Use the Task tool with the slash command invocation as the entire prompt. The slash command can take arguments. For example: Task(description=\"Check the file\", prompt=\"/check-file path/to/file.py\")\n\nWhen NOT to use the Task tool:\n- If you want to read a specific file path, use the Read or Glob tool instead of the Task tool, to find the match more quickly\n- If you are searching for a specific class definition like \"class Foo\", use the Glob tool instead, to find the match more quickly\n- If you are searching for code within a specific file or set of 2-3 files, use the Read tool instead of the Task tool, to find the match more quickly\n- Other tasks that are not related to the agent descriptions above\n\n\nUsage notes:\n1. 
Launch multiple agents concurrently whenever possible, to maximize performance; to do that, use a single message with multiple tool uses\n2. When the agent is done, it will return a single message back to you. The result returned by the agent is not visible to the user. To show the user the result, you should send a text message back to the user with a concise summary of the result.\n3. Each agent invocation is stateless unless you provide a session_id. Your prompt should contain a highly detailed task description for the agent to perform autonomously and you should specify exactly what information the agent should return back to you in its final and only message to you.\n4. The agent's outputs should generally be trusted\n5. Clearly tell the agent whether you expect it to write code or just to do research (search, file reads, web fetches, etc.), since it is not aware of the user's intent\n6. If the agent description mentions that it should be used proactively, then you should try your best to use it without the user having to ask for it first. Use your judgement.\n\n",
|
|
29
|
+
"parameters": {
|
|
30
|
+
"type": "object",
|
|
31
|
+
"properties": {
|
|
32
|
+
"description": {
|
|
33
|
+
"description": "A short (3-5 words) description of the task",
|
|
34
|
+
"type": "string"
|
|
35
|
+
},
|
|
36
|
+
"prompt": {
|
|
37
|
+
"description": "The task for the agent to perform",
|
|
38
|
+
"type": "string"
|
|
39
|
+
},
|
|
40
|
+
"subagent_type": {
|
|
41
|
+
"description": "The type of specialized agent to use for this task",
|
|
42
|
+
"type": "string"
|
|
43
|
+
},
|
|
44
|
+
"session_id": {
|
|
45
|
+
"description": "Existing Task session to continue",
|
|
46
|
+
"type": "string"
|
|
47
|
+
}
|
|
48
|
+
},
|
|
49
|
+
"required": [
|
|
50
|
+
"description",
|
|
51
|
+
"prompt",
|
|
52
|
+
"subagent_type"
|
|
53
|
+
]
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
},
|
|
57
|
+
],
|
|
58
|
+
"tool_choice": "auto"
|
|
59
|
+
};
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import
|
|
2
|
-
export declare function test(
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test({ tool_calls }: ChatMessage): void;
|
|
3
3
|
export declare const json: {
|
|
4
4
|
messages: ({
|
|
5
5
|
role: string;
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import
|
|
2
|
-
export declare function test(
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test({ tool_calls }: ChatMessage): void;
|
|
3
3
|
export declare const json: {
|
|
4
4
|
messages: {
|
|
5
5
|
content: string;
|
|
@@ -670,7 +670,4 @@ export declare const json: {
|
|
|
670
670
|
})[];
|
|
671
671
|
tool_choice: string;
|
|
672
672
|
max_tokens: number;
|
|
673
|
-
stream_options: {
|
|
674
|
-
include_usage: boolean;
|
|
675
|
-
};
|
|
676
673
|
};
|
|
@@ -1,15 +1,16 @@
|
|
|
1
1
|
import * as assert from "../../source/asserts.js";
|
|
2
|
-
export function test(
|
|
3
|
-
const { tool_calls } = response.choices[0].message;
|
|
2
|
+
export function test({ tool_calls }) {
|
|
4
3
|
assert.isNotNullish(tool_calls);
|
|
5
4
|
assert.isNotEmptyArray(tool_calls);
|
|
6
|
-
assert.
|
|
5
|
+
assert.gte(tool_calls.length, 1);
|
|
7
6
|
assert.strictEqual(tool_calls[0].type, "function");
|
|
8
7
|
const fn = tool_calls[0].function;
|
|
9
8
|
assert.or(() => {
|
|
10
9
|
assert.strictEqual(fn.name, "ls");
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
if (fn.arguments) {
|
|
11
|
+
const args = JSON.parse(fn.arguments);
|
|
12
|
+
assert.or(() => assert.strictEqual(args.path, "/home/reissbaker/Hack/scratch-scripts"), () => assert.strictEqual(args.path, "."), () => assert.isNullish(args.path));
|
|
13
|
+
}
|
|
13
14
|
}, () => {
|
|
14
15
|
assert.strictEqual(fn.name, "bash");
|
|
15
16
|
const args = JSON.parse(fn.arguments);
|
|
@@ -435,7 +436,4 @@ export const json = {
|
|
|
435
436
|
],
|
|
436
437
|
"tool_choice": "auto",
|
|
437
438
|
"max_tokens": 60000,
|
|
438
|
-
"stream_options": {
|
|
439
|
-
"include_usage": true
|
|
440
|
-
}
|
|
441
439
|
};
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test({ tool_calls }: ChatMessage): void;
|
|
3
|
+
export declare const json: {
|
|
4
|
+
messages: ({
|
|
5
|
+
role: string;
|
|
6
|
+
content: string;
|
|
7
|
+
tool_calls?: undefined;
|
|
8
|
+
tool_call_id?: undefined;
|
|
9
|
+
} | {
|
|
10
|
+
role: string;
|
|
11
|
+
tool_calls: {
|
|
12
|
+
id: string;
|
|
13
|
+
type: string;
|
|
14
|
+
function: {
|
|
15
|
+
name: string;
|
|
16
|
+
arguments: string;
|
|
17
|
+
};
|
|
18
|
+
}[];
|
|
19
|
+
content?: undefined;
|
|
20
|
+
tool_call_id?: undefined;
|
|
21
|
+
} | {
|
|
22
|
+
role: string;
|
|
23
|
+
tool_call_id: string;
|
|
24
|
+
content: string;
|
|
25
|
+
tool_calls?: undefined;
|
|
26
|
+
})[];
|
|
27
|
+
tools: {
|
|
28
|
+
type: string;
|
|
29
|
+
function: {
|
|
30
|
+
name: string;
|
|
31
|
+
description: string;
|
|
32
|
+
parameters: {
|
|
33
|
+
type: string;
|
|
34
|
+
properties: {
|
|
35
|
+
location: {
|
|
36
|
+
type: string;
|
|
37
|
+
description: string;
|
|
38
|
+
};
|
|
39
|
+
};
|
|
40
|
+
required: string[];
|
|
41
|
+
};
|
|
42
|
+
};
|
|
43
|
+
}[];
|
|
44
|
+
parallel_tool_calls: boolean;
|
|
45
|
+
tool_choice: string;
|
|
46
|
+
};
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import * as assert from "../../source/asserts.js";
|
|
2
|
+
export function test({ tool_calls }) {
|
|
3
|
+
assert.isNotNullish(tool_calls);
|
|
4
|
+
assert.isNotEmptyArray(tool_calls);
|
|
5
|
+
assert.gte(tool_calls.length, 1);
|
|
6
|
+
assert.ok(tool_calls.some(tool_call => {
|
|
7
|
+
if (tool_call.type === "function" && tool_call.function.name === "get_weather") {
|
|
8
|
+
const location = JSON.parse(tool_call.function.arguments).location;
|
|
9
|
+
if (typeof location === "string") {
|
|
10
|
+
return location.toLowerCase().match(/las vegas/);
|
|
11
|
+
}
|
|
12
|
+
}
|
|
13
|
+
return false;
|
|
14
|
+
}), "At least one tool call must be get_weather({ location: 'las_vegas' })");
|
|
15
|
+
}
|
|
16
|
+
export const json = {
|
|
17
|
+
"messages": [
|
|
18
|
+
{
|
|
19
|
+
role: "user",
|
|
20
|
+
content: "What's the weather in Paris?"
|
|
21
|
+
},
|
|
22
|
+
{
|
|
23
|
+
role: "assistant",
|
|
24
|
+
tool_calls: [
|
|
25
|
+
{
|
|
26
|
+
id: "gw1",
|
|
27
|
+
type: "function",
|
|
28
|
+
function: {
|
|
29
|
+
name: "get_weather",
|
|
30
|
+
arguments: JSON.stringify({
|
|
31
|
+
location: "Paris, France",
|
|
32
|
+
}),
|
|
33
|
+
},
|
|
34
|
+
},
|
|
35
|
+
],
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
role: "tool",
|
|
39
|
+
tool_call_id: "gw1",
|
|
40
|
+
content: "The weather in Paris is 24 degrees Celsius",
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
role: "assistant",
|
|
44
|
+
content: "I've looked up the weather in Paris, and it's a comfy 24 degrees Celsius today.",
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
role: "user",
|
|
48
|
+
content: "I meant Paris, Texas",
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
role: "assistant",
|
|
52
|
+
tool_calls: [
|
|
53
|
+
{
|
|
54
|
+
id: "gw2",
|
|
55
|
+
type: "function",
|
|
56
|
+
function: {
|
|
57
|
+
name: "get_weather",
|
|
58
|
+
arguments: JSON.stringify({
|
|
59
|
+
location: "Paris, Texas",
|
|
60
|
+
}),
|
|
61
|
+
},
|
|
62
|
+
},
|
|
63
|
+
],
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
role: "tool",
|
|
67
|
+
tool_call_id: "gw2",
|
|
68
|
+
content: "The weather in Paris, Texas is 34 degrees Celsius",
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
role: "assistant",
|
|
72
|
+
content: "I've looked up the weather in Paris, Texas and it's a scorching 24 degrees Celsius today.",
|
|
73
|
+
},
|
|
74
|
+
{
|
|
75
|
+
role: "user",
|
|
76
|
+
content: "How about Las Vegas",
|
|
77
|
+
},
|
|
78
|
+
],
|
|
79
|
+
"tools": [
|
|
80
|
+
{
|
|
81
|
+
"type": "function",
|
|
82
|
+
"function": {
|
|
83
|
+
"name": "get_weather",
|
|
84
|
+
"description": "Get current weather for a location",
|
|
85
|
+
"parameters": {
|
|
86
|
+
"type": "object",
|
|
87
|
+
"properties": {
|
|
88
|
+
"location": {
|
|
89
|
+
"type": "string",
|
|
90
|
+
"description": "City name"
|
|
91
|
+
}
|
|
92
|
+
},
|
|
93
|
+
"required": ["location"]
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
],
|
|
98
|
+
"parallel_tool_calls": true,
|
|
99
|
+
"tool_choice": "auto",
|
|
100
|
+
};
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test({ tool_calls }: ChatMessage): void;
|
|
3
|
+
export declare const json: {
|
|
4
|
+
messages: {
|
|
5
|
+
role: string;
|
|
6
|
+
content: string;
|
|
7
|
+
}[];
|
|
8
|
+
tools: {
|
|
9
|
+
type: string;
|
|
10
|
+
function: {
|
|
11
|
+
name: string;
|
|
12
|
+
description: string;
|
|
13
|
+
parameters: {
|
|
14
|
+
$schema: string;
|
|
15
|
+
type: string;
|
|
16
|
+
properties: {};
|
|
17
|
+
additionalProperties: boolean;
|
|
18
|
+
};
|
|
19
|
+
};
|
|
20
|
+
}[];
|
|
21
|
+
tool_choice: string;
|
|
22
|
+
};
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import * as assert from "../../source/asserts.js";
|
|
2
|
+
export function test({ tool_calls }) {
|
|
3
|
+
assert.isNotNullish(tool_calls);
|
|
4
|
+
assert.isNotEmptyArray(tool_calls);
|
|
5
|
+
assert.strictEqual(tool_calls.length, 1);
|
|
6
|
+
assert.strictEqual(tool_calls[0].type, "function");
|
|
7
|
+
}
|
|
8
|
+
export const json = {
|
|
9
|
+
"messages": [
|
|
10
|
+
{
|
|
11
|
+
"role": "user",
|
|
12
|
+
"content": "read the todos",
|
|
13
|
+
},
|
|
14
|
+
],
|
|
15
|
+
"tools": [
|
|
16
|
+
{
|
|
17
|
+
"type": "function",
|
|
18
|
+
"function": {
|
|
19
|
+
"name": "get_todo_items",
|
|
20
|
+
"description": "Retrieves the current list of todo items, including their names and completion statuses.",
|
|
21
|
+
"parameters": {
|
|
22
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
23
|
+
"type": "object",
|
|
24
|
+
"properties": {},
|
|
25
|
+
"additionalProperties": false
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
},
|
|
29
|
+
],
|
|
30
|
+
"tool_choice": "auto",
|
|
31
|
+
};
|