@syntheticlab/synbad 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +60 -23
- package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +2 -2
- package/dist/evals/reasoning/multiturn-reasoning-parsing.js +2 -2
- package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +2 -2
- package/dist/evals/reasoning/reasoning-claude-tool-call.js +1 -2
- package/dist/evals/reasoning/reasoning-parsing.d.ts +2 -2
- package/dist/evals/reasoning/reasoning-parsing.js +2 -2
- package/dist/evals/reasoning/response-in-reasoning.d.ts +45 -0
- package/dist/evals/reasoning/response-in-reasoning.js +59 -0
- package/dist/evals/tools/claude-dash.d.ts +2 -2
- package/dist/evals/tools/claude-dash.js +1 -2
- package/dist/evals/tools/crush-list-files.d.ts +2 -5
- package/dist/evals/tools/crush-list-files.js +6 -8
- package/dist/evals/tools/multi-turn-tools.d.ts +46 -0
- package/dist/evals/tools/multi-turn-tools.js +100 -0
- package/dist/evals/tools/no-fn-args.d.ts +22 -0
- package/dist/evals/tools/no-fn-args.js +31 -0
- package/dist/evals/tools/octo-list-no-optional-args.d.ts +209 -0
- package/dist/evals/tools/octo-list-no-optional-args.js +73 -0
- package/dist/evals/tools/parallel-tool.d.ts +2 -2
- package/dist/evals/tools/parallel-tool.js +1 -2
- package/dist/evals/tools/simple-tool.d.ts +2 -2
- package/dist/evals/tools/simple-tool.js +3 -2
- package/dist/evals/tools/tool-dash-underscore.d.ts +26 -0
- package/dist/evals/tools/tool-dash-underscore.js +37 -0
- package/dist/evals/tools/tool-path-corruption.d.ts +26 -0
- package/dist/evals/tools/tool-path-corruption.js +41 -0
- package/dist/source/asserts.d.ts +4 -1
- package/dist/source/asserts.js +36 -0
- package/dist/source/chat-completion.d.ts +5 -0
- package/dist/source/chat-completion.js +1 -0
- package/dist/source/evals.d.ts +9 -0
- package/dist/source/evals.js +53 -0
- package/dist/source/evals.test.d.ts +1 -0
- package/dist/source/evals.test.js +12 -0
- package/dist/source/exports.d.ts +2 -0
- package/dist/source/exports.js +1 -0
- package/dist/source/index.js +103 -43
- package/evals/reasoning/multiturn-reasoning-parsing.ts +3 -3
- package/evals/reasoning/reasoning-claude-tool-call.ts +2 -3
- package/evals/reasoning/reasoning-parsing.ts +3 -3
- package/evals/reasoning/response-in-reasoning.ts +65 -0
- package/evals/tools/claude-dash.ts +2 -3
- package/evals/tools/crush-list-files.ts +11 -13
- package/evals/tools/multi-turn-tools.ts +104 -0
- package/evals/tools/no-fn-args.ts +34 -0
- package/evals/tools/octo-list-no-optional-args.ts +81 -0
- package/evals/tools/parallel-tool.ts +2 -3
- package/evals/tools/simple-tool.ts +4 -3
- package/evals/tools/tool-dash-underscore.ts +40 -0
- package/evals/tools/tool-path-corruption.ts +46 -0
- package/package.json +10 -3
- package/source/asserts.ts +37 -1
- package/source/chat-completion.ts +6 -0
- package/source/evals.test.ts +13 -0
- package/source/evals.ts +56 -0
- package/source/exports.ts +2 -0
- package/source/index.ts +121 -44
package/dist/source/index.js
CHANGED
|
@@ -9,10 +9,11 @@ var __rewriteRelativeImportExtension = (this && this.__rewriteRelativeImportExte
|
|
|
9
9
|
};
|
|
10
10
|
import * as http from "http";
|
|
11
11
|
import * as https from "https";
|
|
12
|
-
import fs from "fs/promises";
|
|
13
12
|
import path from "path";
|
|
14
13
|
import { Command } from "@commander-js/extra-typings";
|
|
15
14
|
import OpenAI from "openai";
|
|
15
|
+
import { getReasoning } from "./chat-completion.js";
|
|
16
|
+
import { findTestFiles, evalName } from "./evals.js";
|
|
16
17
|
const cli = new Command()
|
|
17
18
|
.name("synbad")
|
|
18
19
|
.description("A set of evals for LLM inference providers");
|
|
@@ -21,10 +22,12 @@ cli.command("eval")
|
|
|
21
22
|
.requiredOption("--env-var <env var name>", "The env var to use to authenticate with the inference provider")
|
|
22
23
|
.requiredOption("--base-url <base url>", "The base URL for the inference provider")
|
|
23
24
|
.option("--skip-reasoning", "Skip reasoning evals (set this for non-reasoning models)")
|
|
25
|
+
.option("--reasoning-effort <level>", "Set the reasoning effort to high, medium, or low")
|
|
24
26
|
.option("--only <eval path within synbad>", "Specific evals you want to run, e.g. evals/reasoning or evals/tools/claude-dash")
|
|
25
27
|
.option("--count <num times>", "Number of times to run the eval. Any failures count as an overall failure")
|
|
28
|
+
.option("--stream", "Test streaming API calls")
|
|
26
29
|
.requiredOption("--model <model name>", "The model name to test")
|
|
27
|
-
.action(async ({ model, envVar, baseUrl, only, count }) => {
|
|
30
|
+
.action(async ({ model, envVar, baseUrl, only, count, skipReasoning, reasoningEffort, stream }) => {
|
|
28
31
|
if (!process.env[envVar]) {
|
|
29
32
|
console.error(`No env var named ${envVar} exists for the current process`);
|
|
30
33
|
process.exit(1);
|
|
@@ -35,29 +38,111 @@ cli.command("eval")
|
|
|
35
38
|
});
|
|
36
39
|
let found = 0;
|
|
37
40
|
const failures = new Set();
|
|
38
|
-
const evalPath = only ? path.join(import.meta.dirname, "..", only) : path.join(import.meta.dirname, "
|
|
41
|
+
const evalPath = only ? path.join(import.meta.dirname, "..", only) : path.join(import.meta.dirname, "..", "evals");
|
|
39
42
|
const maxRuns = count == null ? 1 : parseInt(count, 10);
|
|
40
|
-
for await (const testFile of findTestFiles(evalPath)) {
|
|
43
|
+
for await (const testFile of findTestFiles(evalPath, !!skipReasoning)) {
|
|
41
44
|
found++;
|
|
42
45
|
const test = await import(__rewriteRelativeImportExtension(testFile));
|
|
43
46
|
const json = test.json;
|
|
44
47
|
const name = evalName(testFile);
|
|
45
48
|
process.stdout.write(`Running ${name}...`);
|
|
49
|
+
async function respond() {
|
|
50
|
+
const reasoning = reasoningEffort == null ? {} : {
|
|
51
|
+
reasoning_effort: reasoningEffort,
|
|
52
|
+
};
|
|
53
|
+
if (!stream) {
|
|
54
|
+
const response = await client.chat.completions.create({
|
|
55
|
+
...json,
|
|
56
|
+
...reasoning,
|
|
57
|
+
stream: false,
|
|
58
|
+
model,
|
|
59
|
+
});
|
|
60
|
+
return response.choices[0].message;
|
|
61
|
+
}
|
|
62
|
+
const msg = {};
|
|
63
|
+
const chunkStream = await client.chat.completions.create({
|
|
64
|
+
...json,
|
|
65
|
+
...reasoning,
|
|
66
|
+
model,
|
|
67
|
+
stream: true,
|
|
68
|
+
});
|
|
69
|
+
let lastIndex = null;
|
|
70
|
+
let toolBuffer = null;
|
|
71
|
+
for await (const chunk of chunkStream) {
|
|
72
|
+
if (!chunk.choices)
|
|
73
|
+
continue;
|
|
74
|
+
const choice = chunk.choices[0];
|
|
75
|
+
if (!choice)
|
|
76
|
+
continue;
|
|
77
|
+
const content = choice.delta.content;
|
|
78
|
+
const tools = choice.delta.tool_calls;
|
|
79
|
+
const reasoning = getReasoning(choice.delta);
|
|
80
|
+
if (content) {
|
|
81
|
+
if (!msg.content)
|
|
82
|
+
msg.content = "";
|
|
83
|
+
msg.content += content;
|
|
84
|
+
}
|
|
85
|
+
if (tools) {
|
|
86
|
+
for (const toolDelta of tools) {
|
|
87
|
+
if (lastIndex == null)
|
|
88
|
+
lastIndex = toolDelta.index;
|
|
89
|
+
if (lastIndex !== toolDelta.index && toolBuffer != null) {
|
|
90
|
+
msg.tool_calls ||= [];
|
|
91
|
+
// @ts-ignore
|
|
92
|
+
msg.tool_calls.push(toolBuffer);
|
|
93
|
+
toolBuffer = {
|
|
94
|
+
index: toolDelta.index,
|
|
95
|
+
type: "function",
|
|
96
|
+
function: {},
|
|
97
|
+
};
|
|
98
|
+
}
|
|
99
|
+
if (!toolBuffer) {
|
|
100
|
+
toolBuffer = {
|
|
101
|
+
index: toolDelta.index,
|
|
102
|
+
type: "function",
|
|
103
|
+
function: {}
|
|
104
|
+
};
|
|
105
|
+
}
|
|
106
|
+
lastIndex = toolDelta.index;
|
|
107
|
+
if (toolDelta.id)
|
|
108
|
+
toolBuffer.id = toolDelta.id;
|
|
109
|
+
if (toolDelta.function) {
|
|
110
|
+
if (toolDelta.function.name) {
|
|
111
|
+
toolBuffer.function.name ||= "";
|
|
112
|
+
toolBuffer.function.name += toolDelta.function.name;
|
|
113
|
+
}
|
|
114
|
+
if (toolDelta.function.arguments) {
|
|
115
|
+
toolBuffer.function.arguments ||= "";
|
|
116
|
+
toolBuffer.function.arguments += toolDelta.function.arguments;
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
if (reasoning) {
|
|
122
|
+
if (!msg.reasoning_content)
|
|
123
|
+
msg.reasoning_content = "";
|
|
124
|
+
msg.reasoning_content += reasoning;
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
if (toolBuffer) {
|
|
128
|
+
msg.tool_calls ||= [];
|
|
129
|
+
// @ts-ignore
|
|
130
|
+
msg.tool_calls.push(toolBuffer);
|
|
131
|
+
}
|
|
132
|
+
return msg;
|
|
133
|
+
}
|
|
46
134
|
try {
|
|
47
135
|
for (let i = 0; i < maxRuns; i++) {
|
|
48
136
|
if (maxRuns > 1) {
|
|
49
137
|
process.stdout.write(` ${i + 1}/${maxRuns}`);
|
|
50
138
|
}
|
|
51
|
-
const response = await
|
|
52
|
-
model,
|
|
53
|
-
...json,
|
|
54
|
-
});
|
|
139
|
+
const response = await respond();
|
|
55
140
|
try {
|
|
56
141
|
test.test(response);
|
|
57
142
|
}
|
|
58
143
|
catch (e) {
|
|
59
144
|
console.error("Response:");
|
|
60
|
-
console.error(JSON.stringify(response
|
|
145
|
+
console.error(JSON.stringify(response, null, 2));
|
|
61
146
|
throw e;
|
|
62
147
|
}
|
|
63
148
|
}
|
|
@@ -84,6 +169,7 @@ ${passed}/${found} evals passed. Failures:
|
|
|
84
169
|
cli.command("proxy")
|
|
85
170
|
.requiredOption("-p, --port <number>", "Port to listen on")
|
|
86
171
|
.requiredOption("-t, --target <url>", "Target URL to proxy to")
|
|
172
|
+
.option("--pretty", "Pretty-print the JSON")
|
|
87
173
|
.action(async (options) => {
|
|
88
174
|
const port = parseInt(options.port, 10);
|
|
89
175
|
const targetUrl = new URL(options.target);
|
|
@@ -108,6 +194,7 @@ cli.command("proxy")
|
|
|
108
194
|
stderrLog(`[${timestamp}] 📦 Writing request data to stdout...`);
|
|
109
195
|
// Choose the right module based on target protocol
|
|
110
196
|
const httpModule = targetUrl.protocol === "https:" ? https : http;
|
|
197
|
+
const buffer = [];
|
|
111
198
|
// Create proxy request
|
|
112
199
|
const proxyReq = httpModule.request({
|
|
113
200
|
hostname: targetUrl.hostname,
|
|
@@ -151,11 +238,16 @@ cli.command("proxy")
|
|
|
151
238
|
}
|
|
152
239
|
});
|
|
153
240
|
req.on("data", (chunk) => {
|
|
154
|
-
|
|
241
|
+
buffer.push(chunk);
|
|
242
|
+
if (!options.pretty)
|
|
243
|
+
process.stdout.write(chunk);
|
|
155
244
|
proxyReq.write(chunk);
|
|
156
245
|
});
|
|
157
246
|
req.on("end", () => {
|
|
158
|
-
|
|
247
|
+
if (options.pretty)
|
|
248
|
+
// Reassemble the request body from the collected chunks before parsing. Array.prototype.join() with no argument inserts "," between chunks, which corrupts any body that arrives in more than one chunk. Concatenating the raw bytes first also keeps multi-byte characters that straddle a chunk boundary intact. NOTE(review): assumes the chunks are Buffers (no setEncoding call is visible here) - confirm.
console.log(JSON.stringify(JSON.parse(Buffer.concat(buffer).toString("utf8")), null, 2));
|
|
249
|
+
else
|
|
250
|
+
process.stdout.write("\n");
|
|
159
251
|
// Status messages go to stderr so that stdout carries only the captured request data.
stderrLog(`[${timestamp}] ✅ Request complete`);
|
|
160
252
|
proxyReq.end();
|
|
161
253
|
});
|
|
@@ -178,38 +270,6 @@ cli.command("proxy")
|
|
|
178
270
|
stderrLog("🤓 Terminal UI messages (such as this one) will be logged to stderr");
|
|
179
271
|
});
|
|
180
272
|
});
|
|
181
|
-
function evalName(file) {
|
|
182
|
-
return `${path.basename(path.dirname(file))}/${path.basename(file).replace(/.js$/, "")}`;
|
|
183
|
-
}
|
|
184
|
-
async function* findTestFiles(dir) {
|
|
185
|
-
try {
|
|
186
|
-
await fs.stat(dir);
|
|
187
|
-
}
|
|
188
|
-
catch (e) {
|
|
189
|
-
const pathname = `${dir}.js`;
|
|
190
|
-
const stat = await fs.stat(pathname);
|
|
191
|
-
if (stat.isFile()) {
|
|
192
|
-
yield pathname;
|
|
193
|
-
return;
|
|
194
|
-
}
|
|
195
|
-
throw e;
|
|
196
|
-
}
|
|
197
|
-
const entryNames = await fs.readdir(dir);
|
|
198
|
-
const entries = await Promise.all(entryNames.map(async (entry) => {
|
|
199
|
-
return {
|
|
200
|
-
path: path.join(dir, entry),
|
|
201
|
-
stat: await fs.stat(path.join(dir, entry)),
|
|
202
|
-
};
|
|
203
|
-
}));
|
|
204
|
-
for (const entry of entries) {
|
|
205
|
-
if (entry.stat.isFile() && entry.path.endsWith(".js")) {
|
|
206
|
-
yield entry.path;
|
|
207
|
-
}
|
|
208
|
-
if (entry.stat.isDirectory()) {
|
|
209
|
-
yield* findTestFiles(entry.path);
|
|
210
|
-
}
|
|
211
|
-
}
|
|
212
|
-
}
|
|
213
273
|
function stderrLog(item, ...items) {
|
|
214
274
|
let formatted = item;
|
|
215
275
|
if (items.length > 0) {
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import * as assert from "../../source/asserts.ts";
|
|
2
|
-
import {
|
|
2
|
+
import { ChatMessage, getReasoning } from "../../source/chat-completion.ts";
|
|
3
3
|
|
|
4
|
-
export function test(
|
|
5
|
-
const reasoning = getReasoning(
|
|
4
|
+
export function test(message: ChatMessage) {
|
|
5
|
+
const reasoning = getReasoning(message);
|
|
6
6
|
assert.isNotNullish(reasoning);
|
|
7
7
|
}
|
|
8
8
|
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
2
|
import * as assert from "../../source/asserts.ts";
|
|
3
3
|
|
|
4
|
-
export function test(
|
|
5
|
-
const { tool_calls } = response.choices[0].message;
|
|
4
|
+
export function test({ tool_calls }: ChatMessage) {
|
|
6
5
|
assert.isNotNullish(tool_calls);
|
|
7
6
|
assert.isNotEmptyArray(tool_calls);
|
|
8
7
|
assert.strictEqual(tool_calls.length, 1);
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import * as assert from "../../source/asserts.ts";
|
|
2
|
-
import {
|
|
2
|
+
import { ChatMessage, getReasoning } from "../../source/chat-completion.ts";
|
|
3
3
|
|
|
4
|
-
export function test(
|
|
5
|
-
const reasoning = getReasoning(
|
|
4
|
+
export function test(message: ChatMessage) {
|
|
5
|
+
const reasoning = getReasoning(message);
|
|
6
6
|
assert.isNotNullish(reasoning);
|
|
7
7
|
}
|
|
8
8
|
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import * as assert from "../../source/asserts.ts";
|
|
2
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
3
|
+
|
|
4
|
+
export function test(message: ChatMessage) {
|
|
5
|
+
const content = message.content;
|
|
6
|
+
assert.or(
|
|
7
|
+
() => assert.isNotNullish(content),
|
|
8
|
+
() => assert.isNotEmptyArray(message.tool_calls),
|
|
9
|
+
);
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
export const json = {
|
|
13
|
+
"messages": [
|
|
14
|
+
{
|
|
15
|
+
"role": "system",
|
|
16
|
+
"content": "When I ask you to add a feature or resolve a problem: ALWAYS start the project explorer sub-agent to build complete understanding"
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"role": "user",
|
|
20
|
+
"content": [
|
|
21
|
+
{
|
|
22
|
+
"type": "text",
|
|
23
|
+
"text": "Hello"
|
|
24
|
+
}
|
|
25
|
+
]
|
|
26
|
+
}
|
|
27
|
+
],
|
|
28
|
+
"temperature": 1,
|
|
29
|
+
"tools": [
|
|
30
|
+
{
|
|
31
|
+
"type": "function",
|
|
32
|
+
"function": {
|
|
33
|
+
"name": "task",
|
|
34
|
+
"description": "Launch a new agent to handle complex, multi-step tasks autonomously.\n\nAvailable agent types and the tools they have access to:\n- general: General-purpose agent for researching complex questions and executing multi-step tasks. Use this agent to execute multiple units of work in parallel.\n- explore: Fast agent specialized for exploring codebases. Use this when you need to quickly find files by patterns (eg. \"src/components/**/*.tsx\"), search code for keywords (eg. \"API endpoints\"), or answer questions about the codebase (eg. \"how do API endpoints work?\"). When calling this agent, specify the desired thoroughness level: \"quick\" for basic searches, \"medium\" for moderate exploration, or \"very thorough\" for comprehensive analysis across multiple locations and naming conventions.\n- code-reviewer: Expert code review specialist. Proactively reviews code for quality, security, and maintainability. Use immediately after writing or modifying code.\n\nWhen using the Task tool, you must specify a subagent_type parameter to select which agent type to use.\n\nWhen to use the Task tool:\n- When you are instructed to execute custom slash commands. Use the Task tool with the slash command invocation as the entire prompt. The slash command can take arguments. For example: Task(description=\"Check the file\", prompt=\"/check-file path/to/file.py\")\n\nWhen NOT to use the Task tool:\n- If you want to read a specific file path, use the Read or Glob tool instead of the Task tool, to find the match more quickly\n- If you are searching for a specific class definition like \"class Foo\", use the Glob tool instead, to find the match more quickly\n- If you are searching for code within a specific file or set of 2-3 files, use the Read tool instead of the Task tool, to find the match more quickly\n- Other tasks that are not related to the agent descriptions above\n\n\nUsage notes:\n1. 
Launch multiple agents concurrently whenever possible, to maximize performance; to do that, use a single message with multiple tool uses\n2. When the agent is done, it will return a single message back to you. The result returned by the agent is not visible to the user. To show the user the result, you should send a text message back to the user with a concise summary of the result.\n3. Each agent invocation is stateless unless you provide a session_id. Your prompt should contain a highly detailed task description for the agent to perform autonomously and you should specify exactly what information the agent should return back to you in its final and only message to you.\n4. The agent's outputs should generally be trusted\n5. Clearly tell the agent whether you expect it to write code or just to do research (search, file reads, web fetches, etc.), since it is not aware of the user's intent\n6. If the agent description mentions that it should be used proactively, then you should try your best to use it without the user having to ask for it first. Use your judgement.\n\n",
|
|
35
|
+
"parameters": {
|
|
36
|
+
"type": "object",
|
|
37
|
+
"properties": {
|
|
38
|
+
"description": {
|
|
39
|
+
"description": "A short (3-5 words) description of the task",
|
|
40
|
+
"type": "string"
|
|
41
|
+
},
|
|
42
|
+
"prompt": {
|
|
43
|
+
"description": "The task for the agent to perform",
|
|
44
|
+
"type": "string"
|
|
45
|
+
},
|
|
46
|
+
"subagent_type": {
|
|
47
|
+
"description": "The type of specialized agent to use for this task",
|
|
48
|
+
"type": "string"
|
|
49
|
+
},
|
|
50
|
+
"session_id": {
|
|
51
|
+
"description": "Existing Task session to continue",
|
|
52
|
+
"type": "string"
|
|
53
|
+
}
|
|
54
|
+
},
|
|
55
|
+
"required": [
|
|
56
|
+
"description",
|
|
57
|
+
"prompt",
|
|
58
|
+
"subagent_type"
|
|
59
|
+
]
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
},
|
|
63
|
+
],
|
|
64
|
+
"tool_choice": "auto"
|
|
65
|
+
}
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
import OpenAI from "openai";
|
|
2
1
|
import * as assert from "../../source/asserts.ts";
|
|
2
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
3
3
|
|
|
4
|
-
export function test(
|
|
5
|
-
const { tool_calls } = response.choices[0].message;
|
|
4
|
+
export function test({ tool_calls }: ChatMessage) {
|
|
6
5
|
assert.isNotNullish(tool_calls);
|
|
7
6
|
assert.isNotEmptyArray(tool_calls);
|
|
8
7
|
}
|
|
@@ -1,22 +1,23 @@
|
|
|
1
|
-
import OpenAI from "openai";
|
|
2
1
|
import * as assert from "../../source/asserts.ts";
|
|
2
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
3
3
|
|
|
4
|
-
export function test(
|
|
5
|
-
const { tool_calls } = response.choices[0].message;
|
|
4
|
+
export function test({ tool_calls }: ChatMessage) {
|
|
6
5
|
assert.isNotNullish(tool_calls);
|
|
7
6
|
assert.isNotEmptyArray(tool_calls);
|
|
8
|
-
assert.
|
|
7
|
+
assert.gte(tool_calls.length, 1);
|
|
9
8
|
assert.strictEqual(tool_calls[0].type, "function");
|
|
10
9
|
const fn = tool_calls[0].function;
|
|
11
10
|
assert.or(
|
|
12
11
|
() => {
|
|
13
12
|
assert.strictEqual(fn.name, "ls");
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
13
|
+
if(fn.arguments) {
|
|
14
|
+
const args = JSON.parse(fn.arguments);
|
|
15
|
+
assert.or(
|
|
16
|
+
() => assert.strictEqual(args.path, "/home/reissbaker/Hack/scratch-scripts"),
|
|
17
|
+
() => assert.strictEqual(args.path, "."),
|
|
18
|
+
() => assert.isNullish(args.path),
|
|
19
|
+
);
|
|
20
|
+
}
|
|
20
21
|
},
|
|
21
22
|
() => {
|
|
22
23
|
assert.strictEqual(fn.name, "bash");
|
|
@@ -448,7 +449,4 @@ export const json = {
|
|
|
448
449
|
],
|
|
449
450
|
"tool_choice": "auto",
|
|
450
451
|
"max_tokens": 60000,
|
|
451
|
-
"stream_options": {
|
|
452
|
-
"include_usage": true
|
|
453
|
-
}
|
|
454
452
|
}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
import * as assert from "../../source/asserts.ts";
|
|
3
|
+
|
|
4
|
+
export function test({ tool_calls }: ChatMessage) {
|
|
5
|
+
assert.isNotNullish(tool_calls);
|
|
6
|
+
assert.isNotEmptyArray(tool_calls);
|
|
7
|
+
assert.gte(tool_calls.length, 1);
|
|
8
|
+
|
|
9
|
+
assert.ok(tool_calls.some(tool_call => {
|
|
10
|
+
if (tool_call.type === "function" && tool_call.function.name === "get_weather") {
|
|
11
|
+
const location = JSON.parse(tool_call.function.arguments).location;
|
|
12
|
+
if (typeof location === "string") {
|
|
13
|
+
return location.toLowerCase().match(/las vegas/);
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
return false;
|
|
17
|
+
}), "At least one tool call must be get_weather with a location containing 'las vegas' (case-insensitive)");
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
export const json = {
|
|
21
|
+
"messages": [
|
|
22
|
+
{
|
|
23
|
+
role: "user",
|
|
24
|
+
content: "What's the weather in Paris?"
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
role: "assistant",
|
|
28
|
+
tool_calls: [
|
|
29
|
+
{
|
|
30
|
+
id: "gw1",
|
|
31
|
+
type: "function",
|
|
32
|
+
function: {
|
|
33
|
+
name: "get_weather",
|
|
34
|
+
arguments: JSON.stringify({
|
|
35
|
+
location: "Paris, France",
|
|
36
|
+
}),
|
|
37
|
+
},
|
|
38
|
+
},
|
|
39
|
+
],
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
role: "tool",
|
|
43
|
+
tool_call_id: "gw1",
|
|
44
|
+
content: "The weather in Paris is 24 degrees Celsius",
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
role: "assistant",
|
|
48
|
+
content: "I've looked up the weather in Paris, and it's a comfy 24 degrees Celsius today.",
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
role: "user",
|
|
52
|
+
content: "I meant Paris, Texas",
|
|
53
|
+
},
|
|
54
|
+
{
|
|
55
|
+
role: "assistant",
|
|
56
|
+
tool_calls: [
|
|
57
|
+
{
|
|
58
|
+
id: "gw2",
|
|
59
|
+
type: "function",
|
|
60
|
+
function: {
|
|
61
|
+
name: "get_weather",
|
|
62
|
+
arguments: JSON.stringify({
|
|
63
|
+
location: "Paris, Texas",
|
|
64
|
+
}),
|
|
65
|
+
},
|
|
66
|
+
},
|
|
67
|
+
],
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
role: "tool",
|
|
71
|
+
tool_call_id: "gw2",
|
|
72
|
+
content: "The weather in Paris, Texas is 34 degrees Celsius",
|
|
73
|
+
},
|
|
74
|
+
{
|
|
75
|
+
role: "assistant",
|
|
76
|
+
content: "I've looked up the weather in Paris, Texas and it's a scorching 34 degrees Celsius today.",
|
|
77
|
+
},
|
|
78
|
+
{
|
|
79
|
+
role: "user",
|
|
80
|
+
content: "How about Las Vegas",
|
|
81
|
+
},
|
|
82
|
+
],
|
|
83
|
+
"tools": [
|
|
84
|
+
{
|
|
85
|
+
"type": "function",
|
|
86
|
+
"function": {
|
|
87
|
+
"name": "get_weather",
|
|
88
|
+
"description": "Get current weather for a location",
|
|
89
|
+
"parameters": {
|
|
90
|
+
"type": "object",
|
|
91
|
+
"properties": {
|
|
92
|
+
"location": {
|
|
93
|
+
"type": "string",
|
|
94
|
+
"description": "City name"
|
|
95
|
+
}
|
|
96
|
+
},
|
|
97
|
+
"required": ["location"]
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
],
|
|
102
|
+
"parallel_tool_calls": true,
|
|
103
|
+
"tool_choice": "auto",
|
|
104
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
import * as assert from "../../source/asserts.ts";
|
|
3
|
+
|
|
4
|
+
export function test({ tool_calls }: ChatMessage) {
|
|
5
|
+
assert.isNotNullish(tool_calls);
|
|
6
|
+
assert.isNotEmptyArray(tool_calls);
|
|
7
|
+
assert.strictEqual(tool_calls.length, 1);
|
|
8
|
+
assert.strictEqual(tool_calls[0].type, "function");
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
export const json = {
|
|
12
|
+
"messages": [
|
|
13
|
+
{
|
|
14
|
+
"role": "user",
|
|
15
|
+
"content": "read the todos",
|
|
16
|
+
},
|
|
17
|
+
],
|
|
18
|
+
"tools": [
|
|
19
|
+
{
|
|
20
|
+
"type": "function",
|
|
21
|
+
"function": {
|
|
22
|
+
"name": "get_todo_items",
|
|
23
|
+
"description": "Retrieves the current list of todo items, including their names and completion statuses.",
|
|
24
|
+
"parameters": {
|
|
25
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
26
|
+
"type": "object",
|
|
27
|
+
"properties": {},
|
|
28
|
+
"additionalProperties": false
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
},
|
|
32
|
+
],
|
|
33
|
+
"tool_choice": "auto",
|
|
34
|
+
}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
import * as assert from "../../source/asserts.ts";
|
|
3
|
+
|
|
4
|
+
export function test({ tool_calls }: ChatMessage) {
|
|
5
|
+
assert.isNotNullish(tool_calls);
|
|
6
|
+
assert.isNotEmptyArray(tool_calls);
|
|
7
|
+
assert.strictEqual(tool_calls.length, 1);
|
|
8
|
+
assert.strictEqual(tool_calls[0].type, "function");
|
|
9
|
+
const parsed = JSON.parse(tool_calls[0].function.arguments);
|
|
10
|
+
assert.or(
|
|
11
|
+
() => assert.isNullish(parsed),
|
|
12
|
+
() => assert.deepEqual(parsed, {}),
|
|
13
|
+
() => assert.isNullish(parsed.dirPath),
|
|
14
|
+
() => assert.strictEqual(parsed.dirPath, ""),
|
|
15
|
+
);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
export const json = {
|
|
19
|
+
"messages":[
|
|
20
|
+
{
|
|
21
|
+
"role":"system",
|
|
22
|
+
"content":"You are a coding assistant called Octo."
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"role":"user",
|
|
26
|
+
"content":"call the list tool with no args"
|
|
27
|
+
}
|
|
28
|
+
],
|
|
29
|
+
tools: [
|
|
30
|
+
{
|
|
31
|
+
"type":"function",
|
|
32
|
+
"function":{
|
|
33
|
+
"name":"append",
|
|
34
|
+
"description":"The append tool",
|
|
35
|
+
"parameters":{
|
|
36
|
+
"type":"object",
|
|
37
|
+
"required":["filePath","text"],
|
|
38
|
+
"properties":{
|
|
39
|
+
"filePath":{
|
|
40
|
+
"description":"The path to the file",
|
|
41
|
+
"type":"string"
|
|
42
|
+
},
|
|
43
|
+
"text":{
|
|
44
|
+
"description":"The text to append",
|
|
45
|
+
"type":"string"
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
},
|
|
49
|
+
"strict":true
|
|
50
|
+
}
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
"type":"function",
|
|
54
|
+
"function":{
|
|
55
|
+
"name":"create",
|
|
56
|
+
"description":"The create tool",
|
|
57
|
+
"parameters":{
|
|
58
|
+
"type":"object",
|
|
59
|
+
"required":["filePath","content"],
|
|
60
|
+
"properties":{
|
|
61
|
+
"filePath":{
|
|
62
|
+
"description":"Path where the file should be created",
|
|
63
|
+
"type":"string"
|
|
64
|
+
},"content":{
|
|
65
|
+
"description":"Content to write to the file",
|
|
66
|
+
"type":"string"
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
},
|
|
70
|
+
"strict":true
|
|
71
|
+
}
|
|
72
|
+
},
|
|
73
|
+
{"type":"function","function":{"name":"edit","description":"The edit tool","parameters":{"type":"object","required":["filePath","search","replace"],"properties":{"filePath":{"description":"The path to the file","type":"string"},"search":{"description":"The search string to replace. Must EXACTLY match the text you intend to replace, including\nwhitespace, punctuation, etc. Make sure to give a few lines of context above and below so you\ndon't accidentally replace a different matching substring in the same file.","type":"string"},"replace":{"description":"The string you want to insert into the file","type":"string"}}},"strict":true}},
|
|
74
|
+
{"type":"function","function":{"name":"fetch","description":"The fetch tool","parameters":{"type":"object","required":["url"],"properties":{"url":{"description":"Full url to fetch, e.g. https://...","type":"string"},"includeMarkup":{"description":"Include the HTML markup? Defaults to false. By default or when set to false, markup will be\nstripped and converted to plain text. Prefer markup stripping, and only set this to true if the\noutput is confusing: otherwise you may download a massive amount of data","type":"boolean"}}},"strict":true}},
|
|
75
|
+
{"type":"function","function":{"name":"list","description":"The list tool","parameters":{"type":"object","required":[],"properties":{"dirPath":{"description":"Path to the directory","type":"string"}}},"strict":true}},
|
|
76
|
+
{"type":"function","function":{"name":"prepend","description":"The prepend tool","parameters":{"type":"object","required":["filePath","text"],"properties":{"filePath":{"description":"The path to the file","type":"string"},"text":{"description":"The text to prepend","type":"string"}}},"strict":true}},
|
|
77
|
+
{"type":"function","function":{"name":"read","description":"The read tool","parameters":{"type":"object","required":["filePath"],"properties":{"filePath":{"description":"Path to file to read","type":"string"}}},"strict":true}},
|
|
78
|
+
{"type":"function","function":{"name":"rewrite","description":"The rewrite tool","parameters":{"type":"object","required":["filePath","text"],"properties":{"filePath":{"description":"The path to the file","type":"string"},"text":{"description":"The replaced file contents. This will rewrite and replace the entire file","type":"string"}}},"strict":true}},
|
|
79
|
+
{"type":"function","function":{"name":"shell","description":"The shell tool","parameters":{"type":"object","required":["timeout","cmd"],"properties":{"timeout":{"description":"A timeout for the command, in milliseconds. Be generous. You MUST specify this.","type":"number"},"cmd":{"description":"The command to run","type":"string"}}},"strict":true}}
|
|
80
|
+
],
|
|
81
|
+
};
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
import OpenAI from "openai";
|
|
2
1
|
import * as assert from "../../source/asserts.ts";
|
|
2
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
3
3
|
|
|
4
|
-
export function test(
|
|
5
|
-
const { tool_calls } = response.choices[0].message;
|
|
4
|
+
export function test({ tool_calls }: ChatMessage) {
|
|
6
5
|
assert.isNotNullish(tool_calls);
|
|
7
6
|
assert.isNotEmptyArray(tool_calls);
|
|
8
7
|
assert.strictEqual(tool_calls.length, 2);
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
2
|
import * as assert from "../../source/asserts.ts";
|
|
3
3
|
|
|
4
|
-
export function test(
|
|
5
|
-
const { tool_calls } = response.choices[0].message;
|
|
4
|
+
export function test({ content, tool_calls }: ChatMessage) {
|
|
6
5
|
assert.isNotNullish(tool_calls);
|
|
7
6
|
assert.isNotEmptyArray(tool_calls);
|
|
8
7
|
assert.strictEqual(tool_calls.length, 1);
|
|
@@ -10,6 +9,8 @@ export function test(response: ChatResponse) {
|
|
|
10
9
|
assert.strictEqual(tool_calls[0].function.name, "get_weather");
|
|
11
10
|
const args = JSON.parse(tool_calls[0].function.arguments);
|
|
12
11
|
assert.match(args.location.toLowerCase(), /paris/);
|
|
12
|
+
// Assert the tool call didn't leak into the content
|
|
13
|
+
assert.doesNotMatch(content || "", /get_weather/);
|
|
13
14
|
}
|
|
14
15
|
|
|
15
16
|
export const json = {
|