@syntheticlab/synbad 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +60 -23
- package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +2 -2
- package/dist/evals/reasoning/multiturn-reasoning-parsing.js +2 -2
- package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +2 -2
- package/dist/evals/reasoning/reasoning-claude-tool-call.js +1 -2
- package/dist/evals/reasoning/reasoning-parsing.d.ts +2 -2
- package/dist/evals/reasoning/reasoning-parsing.js +2 -2
- package/dist/evals/reasoning/response-in-reasoning.d.ts +45 -0
- package/dist/evals/reasoning/response-in-reasoning.js +59 -0
- package/dist/evals/tools/claude-dash.d.ts +2 -2
- package/dist/evals/tools/claude-dash.js +1 -2
- package/dist/evals/tools/crush-list-files.d.ts +2 -5
- package/dist/evals/tools/crush-list-files.js +6 -8
- package/dist/evals/tools/multi-turn-tools.d.ts +46 -0
- package/dist/evals/tools/multi-turn-tools.js +100 -0
- package/dist/evals/tools/no-fn-args.d.ts +22 -0
- package/dist/evals/tools/no-fn-args.js +31 -0
- package/dist/evals/tools/octo-list-no-optional-args.d.ts +209 -0
- package/dist/evals/tools/octo-list-no-optional-args.js +73 -0
- package/dist/evals/tools/parallel-tool.d.ts +2 -2
- package/dist/evals/tools/parallel-tool.js +1 -2
- package/dist/evals/tools/simple-tool.d.ts +2 -2
- package/dist/evals/tools/simple-tool.js +3 -2
- package/dist/evals/tools/tool-dash-underscore.d.ts +26 -0
- package/dist/evals/tools/tool-dash-underscore.js +37 -0
- package/dist/evals/tools/tool-path-corruption.d.ts +26 -0
- package/dist/evals/tools/tool-path-corruption.js +41 -0
- package/dist/source/asserts.d.ts +4 -1
- package/dist/source/asserts.js +36 -0
- package/dist/source/chat-completion.d.ts +5 -0
- package/dist/source/chat-completion.js +1 -0
- package/dist/source/evals.d.ts +9 -0
- package/dist/source/evals.js +53 -0
- package/dist/source/evals.test.d.ts +1 -0
- package/dist/source/evals.test.js +12 -0
- package/dist/source/exports.d.ts +2 -0
- package/dist/source/exports.js +1 -0
- package/dist/source/index.js +103 -43
- package/evals/reasoning/multiturn-reasoning-parsing.ts +3 -3
- package/evals/reasoning/reasoning-claude-tool-call.ts +2 -3
- package/evals/reasoning/reasoning-parsing.ts +3 -3
- package/evals/reasoning/response-in-reasoning.ts +65 -0
- package/evals/tools/claude-dash.ts +2 -3
- package/evals/tools/crush-list-files.ts +11 -13
- package/evals/tools/multi-turn-tools.ts +104 -0
- package/evals/tools/no-fn-args.ts +34 -0
- package/evals/tools/octo-list-no-optional-args.ts +81 -0
- package/evals/tools/parallel-tool.ts +2 -3
- package/evals/tools/simple-tool.ts +4 -3
- package/evals/tools/tool-dash-underscore.ts +40 -0
- package/evals/tools/tool-path-corruption.ts +46 -0
- package/package.json +10 -3
- package/source/asserts.ts +37 -1
- package/source/chat-completion.ts +6 -0
- package/source/evals.test.ts +13 -0
- package/source/evals.ts +56 -0
- package/source/exports.ts +2 -0
- package/source/index.ts +121 -44
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test({ tool_calls }: ChatMessage): void;
|
|
3
|
+
export declare const json: {
|
|
4
|
+
messages: {
|
|
5
|
+
role: string;
|
|
6
|
+
content: string;
|
|
7
|
+
}[];
|
|
8
|
+
tools: ({
|
|
9
|
+
type: string;
|
|
10
|
+
function: {
|
|
11
|
+
name: string;
|
|
12
|
+
description: string;
|
|
13
|
+
parameters: {
|
|
14
|
+
type: string;
|
|
15
|
+
required: string[];
|
|
16
|
+
properties: {
|
|
17
|
+
filePath: {
|
|
18
|
+
description: string;
|
|
19
|
+
type: string;
|
|
20
|
+
};
|
|
21
|
+
text: {
|
|
22
|
+
description: string;
|
|
23
|
+
type: string;
|
|
24
|
+
};
|
|
25
|
+
content?: undefined;
|
|
26
|
+
search?: undefined;
|
|
27
|
+
replace?: undefined;
|
|
28
|
+
url?: undefined;
|
|
29
|
+
includeMarkup?: undefined;
|
|
30
|
+
dirPath?: undefined;
|
|
31
|
+
timeout?: undefined;
|
|
32
|
+
cmd?: undefined;
|
|
33
|
+
};
|
|
34
|
+
};
|
|
35
|
+
strict: boolean;
|
|
36
|
+
};
|
|
37
|
+
} | {
|
|
38
|
+
type: string;
|
|
39
|
+
function: {
|
|
40
|
+
name: string;
|
|
41
|
+
description: string;
|
|
42
|
+
parameters: {
|
|
43
|
+
type: string;
|
|
44
|
+
required: string[];
|
|
45
|
+
properties: {
|
|
46
|
+
filePath: {
|
|
47
|
+
description: string;
|
|
48
|
+
type: string;
|
|
49
|
+
};
|
|
50
|
+
content: {
|
|
51
|
+
description: string;
|
|
52
|
+
type: string;
|
|
53
|
+
};
|
|
54
|
+
text?: undefined;
|
|
55
|
+
search?: undefined;
|
|
56
|
+
replace?: undefined;
|
|
57
|
+
url?: undefined;
|
|
58
|
+
includeMarkup?: undefined;
|
|
59
|
+
dirPath?: undefined;
|
|
60
|
+
timeout?: undefined;
|
|
61
|
+
cmd?: undefined;
|
|
62
|
+
};
|
|
63
|
+
};
|
|
64
|
+
strict: boolean;
|
|
65
|
+
};
|
|
66
|
+
} | {
|
|
67
|
+
type: string;
|
|
68
|
+
function: {
|
|
69
|
+
name: string;
|
|
70
|
+
description: string;
|
|
71
|
+
parameters: {
|
|
72
|
+
type: string;
|
|
73
|
+
required: string[];
|
|
74
|
+
properties: {
|
|
75
|
+
filePath: {
|
|
76
|
+
description: string;
|
|
77
|
+
type: string;
|
|
78
|
+
};
|
|
79
|
+
search: {
|
|
80
|
+
description: string;
|
|
81
|
+
type: string;
|
|
82
|
+
};
|
|
83
|
+
replace: {
|
|
84
|
+
description: string;
|
|
85
|
+
type: string;
|
|
86
|
+
};
|
|
87
|
+
text?: undefined;
|
|
88
|
+
content?: undefined;
|
|
89
|
+
url?: undefined;
|
|
90
|
+
includeMarkup?: undefined;
|
|
91
|
+
dirPath?: undefined;
|
|
92
|
+
timeout?: undefined;
|
|
93
|
+
cmd?: undefined;
|
|
94
|
+
};
|
|
95
|
+
};
|
|
96
|
+
strict: boolean;
|
|
97
|
+
};
|
|
98
|
+
} | {
|
|
99
|
+
type: string;
|
|
100
|
+
function: {
|
|
101
|
+
name: string;
|
|
102
|
+
description: string;
|
|
103
|
+
parameters: {
|
|
104
|
+
type: string;
|
|
105
|
+
required: string[];
|
|
106
|
+
properties: {
|
|
107
|
+
url: {
|
|
108
|
+
description: string;
|
|
109
|
+
type: string;
|
|
110
|
+
};
|
|
111
|
+
includeMarkup: {
|
|
112
|
+
description: string;
|
|
113
|
+
type: string;
|
|
114
|
+
};
|
|
115
|
+
filePath?: undefined;
|
|
116
|
+
text?: undefined;
|
|
117
|
+
content?: undefined;
|
|
118
|
+
search?: undefined;
|
|
119
|
+
replace?: undefined;
|
|
120
|
+
dirPath?: undefined;
|
|
121
|
+
timeout?: undefined;
|
|
122
|
+
cmd?: undefined;
|
|
123
|
+
};
|
|
124
|
+
};
|
|
125
|
+
strict: boolean;
|
|
126
|
+
};
|
|
127
|
+
} | {
|
|
128
|
+
type: string;
|
|
129
|
+
function: {
|
|
130
|
+
name: string;
|
|
131
|
+
description: string;
|
|
132
|
+
parameters: {
|
|
133
|
+
type: string;
|
|
134
|
+
required: never[];
|
|
135
|
+
properties: {
|
|
136
|
+
dirPath: {
|
|
137
|
+
description: string;
|
|
138
|
+
type: string;
|
|
139
|
+
};
|
|
140
|
+
filePath?: undefined;
|
|
141
|
+
text?: undefined;
|
|
142
|
+
content?: undefined;
|
|
143
|
+
search?: undefined;
|
|
144
|
+
replace?: undefined;
|
|
145
|
+
url?: undefined;
|
|
146
|
+
includeMarkup?: undefined;
|
|
147
|
+
timeout?: undefined;
|
|
148
|
+
cmd?: undefined;
|
|
149
|
+
};
|
|
150
|
+
};
|
|
151
|
+
strict: boolean;
|
|
152
|
+
};
|
|
153
|
+
} | {
|
|
154
|
+
type: string;
|
|
155
|
+
function: {
|
|
156
|
+
name: string;
|
|
157
|
+
description: string;
|
|
158
|
+
parameters: {
|
|
159
|
+
type: string;
|
|
160
|
+
required: string[];
|
|
161
|
+
properties: {
|
|
162
|
+
filePath: {
|
|
163
|
+
description: string;
|
|
164
|
+
type: string;
|
|
165
|
+
};
|
|
166
|
+
text?: undefined;
|
|
167
|
+
content?: undefined;
|
|
168
|
+
search?: undefined;
|
|
169
|
+
replace?: undefined;
|
|
170
|
+
url?: undefined;
|
|
171
|
+
includeMarkup?: undefined;
|
|
172
|
+
dirPath?: undefined;
|
|
173
|
+
timeout?: undefined;
|
|
174
|
+
cmd?: undefined;
|
|
175
|
+
};
|
|
176
|
+
};
|
|
177
|
+
strict: boolean;
|
|
178
|
+
};
|
|
179
|
+
} | {
|
|
180
|
+
type: string;
|
|
181
|
+
function: {
|
|
182
|
+
name: string;
|
|
183
|
+
description: string;
|
|
184
|
+
parameters: {
|
|
185
|
+
type: string;
|
|
186
|
+
required: string[];
|
|
187
|
+
properties: {
|
|
188
|
+
timeout: {
|
|
189
|
+
description: string;
|
|
190
|
+
type: string;
|
|
191
|
+
};
|
|
192
|
+
cmd: {
|
|
193
|
+
description: string;
|
|
194
|
+
type: string;
|
|
195
|
+
};
|
|
196
|
+
filePath?: undefined;
|
|
197
|
+
text?: undefined;
|
|
198
|
+
content?: undefined;
|
|
199
|
+
search?: undefined;
|
|
200
|
+
replace?: undefined;
|
|
201
|
+
url?: undefined;
|
|
202
|
+
includeMarkup?: undefined;
|
|
203
|
+
dirPath?: undefined;
|
|
204
|
+
};
|
|
205
|
+
};
|
|
206
|
+
strict: boolean;
|
|
207
|
+
};
|
|
208
|
+
})[];
|
|
209
|
+
};
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import * as assert from "../../source/asserts.js";
|
|
2
|
+
/**
 * Eval check for the prompt "call the list tool with no args".
 *
 * Requires exactly one tool call of type "function", targeting the `list`
 * tool, whose JSON-encoded arguments leave `dirPath` unset. The accepted
 * argument shapes are listed in the `assert.or` call below (presumably `or`
 * passes when any one alternative passes - confirm against asserts.js).
 *
 * Throws when an assertion fails; `JSON.parse` throws if the arguments string
 * is not valid JSON.
 */
export function test({ tool_calls }) {
    assert.isNotNullish(tool_calls);
    assert.isNotEmptyArray(tool_calls);
    assert.strictEqual(tool_calls.length, 1);
    const [call] = tool_calls;
    assert.strictEqual(call.type, "function");
    // The prompt names the tool explicitly. Without this check, a call to any
    // other tool with empty arguments would be accepted as a pass.
    assert.strictEqual(call.function.name, "list");
    const parsed = JSON.parse(call.function.arguments);
    assert.or(
        () => assert.isNullish(parsed),
        () => assert.deepEqual(parsed, {}),
        () => assert.isNullish(parsed.dirPath),
        () => assert.strictEqual(parsed.dirPath, ""),
    );
}
|
|
10
|
+
// Builds one property schema entry; key order (description, type) matches the fixture.
const describeParam = (type, description) => ({ description, type });
// Shorthand for the common string-typed parameter.
const stringParam = (description) => describeParam("string", description);
// Builds one strict function-tool definition; every tool is described as "The <name> tool".
const defineTool = (name, required, properties) => ({
    type: "function",
    function: {
        name,
        description: `The ${name} tool`,
        parameters: { type: "object", required, properties },
        strict: true,
    },
});
/**
 * Request body for the eval: a system + user message pair and the tool list
 * (append, create, edit, fetch, list, prepend, read, rewrite, shell). `list`
 * is the only tool with an empty `required` array.
 */
export const json = {
    messages: [
        { role: "system", content: "You are a coding assistant called Octo." },
        { role: "user", content: "call the list tool with no args" },
    ],
    tools: [
        defineTool("append", ["filePath", "text"], {
            filePath: stringParam("The path to the file"),
            text: stringParam("The text to append"),
        }),
        defineTool("create", ["filePath", "content"], {
            filePath: stringParam("Path where the file should be created"),
            content: stringParam("Content to write to the file"),
        }),
        defineTool("edit", ["filePath", "search", "replace"], {
            filePath: stringParam("The path to the file"),
            search: stringParam("The search string to replace. Must EXACTLY match the text you intend to replace, including\nwhitespace, punctuation, etc. Make sure to give a few lines of context above and below so you\ndon't accidentally replace a different matching substring in the same file."),
            replace: stringParam("The string you want to insert into the file"),
        }),
        defineTool("fetch", ["url"], {
            url: stringParam("Full url to fetch, e.g. https://..."),
            includeMarkup: describeParam("boolean", "Include the HTML markup? Defaults to false. By default or when set to false, markup will be\nstripped and converted to plain text. Prefer markup stripping, and only set this to true if the\noutput is confusing: otherwise you may download a massive amount of data"),
        }),
        defineTool("list", [], {
            dirPath: stringParam("Path to the directory"),
        }),
        defineTool("prepend", ["filePath", "text"], {
            filePath: stringParam("The path to the file"),
            text: stringParam("The text to prepend"),
        }),
        defineTool("read", ["filePath"], {
            filePath: stringParam("Path to file to read"),
        }),
        defineTool("rewrite", ["filePath", "text"], {
            filePath: stringParam("The path to the file"),
            text: stringParam("The replaced file contents. This will rewrite and replace the entire file"),
        }),
        defineTool("shell", ["timeout", "cmd"], {
            timeout: describeParam("number", "A timeout for the command, in milliseconds. Be generous. You MUST specify this."),
            cmd: stringParam("The command to run"),
        }),
    ],
};
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import
|
|
2
|
-
export declare function test(
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test({ tool_calls }: ChatMessage): void;
|
|
3
3
|
export declare const json: {
|
|
4
4
|
messages: {
|
|
5
5
|
role: string;
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import * as assert from "../../source/asserts.js";
|
|
2
|
-
export function test(
|
|
3
|
-
const { tool_calls } = response.choices[0].message;
|
|
2
|
+
export function test({ tool_calls }) {
|
|
4
3
|
assert.isNotNullish(tool_calls);
|
|
5
4
|
assert.isNotEmptyArray(tool_calls);
|
|
6
5
|
assert.strictEqual(tool_calls.length, 2);
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
export declare function test(
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test({ content, tool_calls }: ChatMessage): void;
|
|
3
3
|
export declare const json: {
|
|
4
4
|
messages: {
|
|
5
5
|
role: string;
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import * as assert from "../../source/asserts.js";
|
|
2
|
-
export function test(
|
|
3
|
-
const { tool_calls } = response.choices[0].message;
|
|
2
|
+
export function test({ content, tool_calls }) {
|
|
4
3
|
assert.isNotNullish(tool_calls);
|
|
5
4
|
assert.isNotEmptyArray(tool_calls);
|
|
6
5
|
assert.strictEqual(tool_calls.length, 1);
|
|
@@ -8,6 +7,8 @@ export function test(response) {
|
|
|
8
7
|
assert.strictEqual(tool_calls[0].function.name, "get_weather");
|
|
9
8
|
const args = JSON.parse(tool_calls[0].function.arguments);
|
|
10
9
|
assert.match(args.location.toLowerCase(), /paris/);
|
|
10
|
+
// Assert the tool call didn't leak into the content
|
|
11
|
+
assert.doesNotMatch(content || "", /get_weather/);
|
|
11
12
|
}
|
|
12
13
|
export const json = {
|
|
13
14
|
"messages": [
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test({ content, tool_calls }: ChatMessage): void;
|
|
3
|
+
export declare const json: {
|
|
4
|
+
messages: {
|
|
5
|
+
role: string;
|
|
6
|
+
content: string;
|
|
7
|
+
}[];
|
|
8
|
+
tools: {
|
|
9
|
+
type: string;
|
|
10
|
+
function: {
|
|
11
|
+
name: string;
|
|
12
|
+
description: string;
|
|
13
|
+
parameters: {
|
|
14
|
+
type: string;
|
|
15
|
+
properties: {
|
|
16
|
+
location: {
|
|
17
|
+
type: string;
|
|
18
|
+
description: string;
|
|
19
|
+
};
|
|
20
|
+
};
|
|
21
|
+
required: string[];
|
|
22
|
+
};
|
|
23
|
+
};
|
|
24
|
+
}[];
|
|
25
|
+
tool_choice: string;
|
|
26
|
+
};
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import * as assert from "../../source/asserts.js";
|
|
2
|
+
/**
 * Eval check for a tool whose name mixes dashes and underscores.
 *
 * Requires exactly one tool call of type "function" named exactly
 * "get-weather__v1", whose JSON arguments carry a `location` containing
 * "paris" (case-insensitive), and requires that the tool name did not leak
 * into the message content. Throws when an assertion fails.
 */
export function test({ content, tool_calls }) {
    assert.isNotNullish(tool_calls);
    assert.isNotEmptyArray(tool_calls);
    assert.strictEqual(tool_calls.length, 1);
    assert.strictEqual(tool_calls[0].type, "function");
    assert.strictEqual(tool_calls[0].function.name, "get-weather__v1");
    const args = JSON.parse(tool_calls[0].function.arguments);
    assert.match(args.location.toLowerCase(), /paris/);
    // Assert the tool call didn't leak into the content. The tool is named
    // "get-weather__v1", so match both the real dash spelling and an
    // underscore-mangled one; the previous /get_weather/ could never match
    // the real name.
    assert.doesNotMatch(content || "", /get[-_]weather/);
}
|
|
13
|
+
// The single tool offered to the model; its name deliberately mixes "-" and "__".
const weatherTool = {
    type: "function",
    function: {
        name: "get-weather__v1",
        description: "Get current weather for a location",
        parameters: {
            type: "object",
            properties: {
                location: { type: "string", description: "City name" },
            },
            required: ["location"],
        },
    },
};
/** Request body for the eval: one user question, one tool, automatic tool choice. */
export const json = {
    messages: [{ role: "user", content: "What's the weather in Paris?" }],
    tools: [weatherTool],
    tool_choice: "auto",
};
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { ChatMessage } from "../../source/chat-completion.ts";
|
|
2
|
+
export declare function test({ tool_calls }: ChatMessage): void;
|
|
3
|
+
export declare const json: {
|
|
4
|
+
messages: {
|
|
5
|
+
role: string;
|
|
6
|
+
content: string;
|
|
7
|
+
}[];
|
|
8
|
+
tools: {
|
|
9
|
+
type: string;
|
|
10
|
+
function: {
|
|
11
|
+
name: string;
|
|
12
|
+
description: string;
|
|
13
|
+
parameters: {
|
|
14
|
+
type: string;
|
|
15
|
+
required: string[];
|
|
16
|
+
properties: {
|
|
17
|
+
filePath: {
|
|
18
|
+
description: string;
|
|
19
|
+
type: string;
|
|
20
|
+
};
|
|
21
|
+
};
|
|
22
|
+
};
|
|
23
|
+
strict: boolean;
|
|
24
|
+
};
|
|
25
|
+
}[];
|
|
26
|
+
};
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import * as assert from "../../source/asserts.js";
|
|
2
|
+
const PATH = "/development/evals/reasoning/Scratch/reasoning-claude-tool-call.ts";
|
|
3
|
+
/**
 * Eval check: the model must issue exactly one function call to `read`
 * whose `filePath` argument contains PATH unchanged.
 * Throws when an assertion fails.
 */
export function test({ tool_calls }) {
    assert.isNotNullish(tool_calls);
    assert.isNotEmptyArray(tool_calls);
    assert.strictEqual(tool_calls.length, 1);
    const [call] = tool_calls;
    assert.strictEqual(call.type, "function");
    const { name, arguments: rawArguments } = call.function;
    assert.strictEqual(name, "read");
    const { filePath } = JSON.parse(rawArguments);
    assert.stringContains(filePath, PATH);
}
|
|
12
|
+
// The only tool offered: a strict `read` function taking a required `filePath`.
const readTool = {
    type: "function",
    function: {
        name: "read",
        description: "The read tool",
        parameters: {
            type: "object",
            required: ["filePath"],
            properties: {
                filePath: { description: "Path to file to read", type: "string" },
            },
        },
        strict: true,
    },
};
/** Request body for the eval: a single user message naming the file to read. */
export const json = {
    messages: [
        {
            role: "user",
            content: "Read and summarize the file /development/evals/reasoning/Scratch/reasoning-claude-tool-call.ts",
        },
    ],
    tools: [readTool],
};
|
package/dist/source/asserts.d.ts
CHANGED
|
@@ -3,5 +3,8 @@ export declare function or(a: () => void, ...rest: Array<() => void>): void;
|
|
|
3
3
|
export declare function isNullish(a: unknown): asserts a is null | undefined;
|
|
4
4
|
export declare function isNotNullish<T extends any>(a: T): asserts a is Exclude<T, null | undefined>;
|
|
5
5
|
export declare function isEmptyArray(a: any[]): boolean;
|
|
6
|
-
export declare function isNotEmptyArray(a: any[]): boolean;
|
|
6
|
+
export declare function isNotEmptyArray(a: any[] | undefined): boolean;
|
|
7
7
|
export declare function startsWith(a: string, prefix: string): boolean;
|
|
8
|
+
export declare function gt(num: number, target: number): boolean;
|
|
9
|
+
export declare function gte(num: number, target: number): boolean;
|
|
10
|
+
export declare function stringContains(str: string, expected: string): boolean;
|
package/dist/source/asserts.js
CHANGED
|
@@ -45,6 +45,12 @@ export function isEmptyArray(a) {
|
|
|
45
45
|
});
|
|
46
46
|
}
|
|
47
47
|
export function isNotEmptyArray(a) {
|
|
48
|
+
if (a == null) {
|
|
49
|
+
throw new assert.AssertionError({
|
|
50
|
+
message: "Expected a non-empty array",
|
|
51
|
+
actual: a,
|
|
52
|
+
});
|
|
53
|
+
}
|
|
48
54
|
if (a.length !== 0)
|
|
49
55
|
return true;
|
|
50
56
|
throw new assert.AssertionError({
|
|
@@ -60,3 +66,33 @@ export function startsWith(a, prefix) {
|
|
|
60
66
|
actual: a,
|
|
61
67
|
});
|
|
62
68
|
}
|
|
69
|
+
/**
 * Asserts that `num` is strictly greater than `target`.
 *
 * @param num - The value under test.
 * @param target - The exclusive lower bound.
 * @returns `true` when `num > target`.
 * @throws assert.AssertionError otherwise; the error carries `actual`,
 *   `expected` and `operator` so reporters can show the failed comparison.
 */
export function gt(num, target) {
    if (num > target)
        return true;
    throw new assert.AssertionError({
        message: `Expected ${num} > ${target}`,
        actual: num,
        expected: target,
        operator: ">",
    });
}
|
|
77
|
+
/**
 * Asserts that `num` is greater than or equal to `target`.
 *
 * @param num - The value under test.
 * @param target - The inclusive lower bound.
 * @returns `true` when `num >= target`.
 * @throws assert.AssertionError otherwise; the error carries `actual`,
 *   `expected` and `operator` so reporters can show the failed comparison.
 */
export function gte(num, target) {
    if (num >= target)
        return true;
    throw new assert.AssertionError({
        message: `Expected ${num} >= ${target}`,
        actual: num,
        expected: target,
        operator: ">=",
    });
}
|
|
85
|
+
/**
 * Asserts that `str` is a string containing the substring `expected`.
 *
 * @param str - The value under test; anything that is not a string fails.
 * @param expected - The substring that must occur in `str`.
 * @returns `true` when `str.includes(expected)`.
 * @throws assert.AssertionError when `str` is not a string or does not
 *   contain `expected`; both errors carry `expected` and `operator` so
 *   reporters can show what was being checked.
 */
export function stringContains(str, expected) {
    if (typeof str !== "string") {
        throw new assert.AssertionError({
            message: "Expected input to be of type string.",
            actual: typeof str,
            expected: "string",
            operator: "typeof",
        });
    }
    if (str.includes(expected))
        return true;
    throw new assert.AssertionError({
        message: `Expected string to contain: "${expected}"`,
        actual: str,
        expected,
        operator: "includes",
    });
}
|
|
@@ -9,9 +9,13 @@ export type ChatResponse = OpenAI.ChatCompletion & {
|
|
|
9
9
|
message: {
|
|
10
10
|
reasoning_content?: string;
|
|
11
11
|
reasoning?: string;
|
|
12
|
+
tool_calls?: Array<{
|
|
13
|
+
index: number;
|
|
14
|
+
}>;
|
|
12
15
|
};
|
|
13
16
|
}>;
|
|
14
17
|
};
|
|
18
|
+
export type ChatMessage = ChatResponse["choices"][number]["message"];
|
|
15
19
|
export declare const ChatCompletion: t.Struct<{
|
|
16
20
|
messages: t.Arr<t.UnwrappedTypeStruct<{
|
|
17
21
|
content: t.Type<string | t.UnwrappedTypeStruct<{
|
|
@@ -54,6 +58,7 @@ export declare const ChatCompletion: t.Struct<{
|
|
|
54
58
|
name: t.TypeOf<string>;
|
|
55
59
|
}>>;
|
|
56
60
|
reasoning_content: t.OptionalKey<t.Type<string | null>>;
|
|
61
|
+
reasoning: t.OptionalKey<t.Type<string | null>>;
|
|
57
62
|
}> | t.UnwrappedTypeStruct<{
|
|
58
63
|
role: t.Value<"tool">;
|
|
59
64
|
content: t.Type<string | t.UnwrappedTypeStruct<{
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { ChatMessage } from "./chat-completion.ts";
|
|
2
|
+
export type Eval = {
|
|
3
|
+
test: (response: ChatMessage) => any;
|
|
4
|
+
json: any;
|
|
5
|
+
name: string;
|
|
6
|
+
};
|
|
7
|
+
export declare function getEvals(): Promise<Eval[]>;
|
|
8
|
+
export declare function evalName(file: string): string;
|
|
9
|
+
export declare function findTestFiles(dir: string, skipReasoning: boolean): AsyncGenerator<string>;
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
// Compiler-emitted helper: maps a relative import specifier's TypeScript
// extension to its JavaScript counterpart. Anything that is not a string
// starting with "./" or "../" is returned untouched.
var __rewriteRelativeImportExtension = (this && this.__rewriteRelativeImportExtension) || function (path, preserveJsx) {
    const isRelativeSpecifier = typeof path === "string" && /^\.\.?\//.test(path);
    if (!isRelativeSpecifier) {
        return path;
    }
    return path.replace(/\.(tsx)$|((?:\.d)?)((?:\.[^./]+?)?)\.([cm]?)ts$/i, function (m, tsx, d, ext, cm) {
        // ".tsx" becomes ".jsx" only when JSX is preserved, otherwise ".js".
        if (tsx) {
            return preserveJsx ? ".jsx" : ".js";
        }
        // A ".d" segment without both an inner extension and a c/m marker is left as matched.
        if (d && (!ext || !cm)) {
            return m;
        }
        return d + ext + "." + cm.toLowerCase() + "js";
    });
};
|
|
9
|
+
import fs from "fs/promises";
|
|
10
|
+
import path from "path";
|
|
11
|
+
/**
 * Loads every eval module found under the `evals` directory that sits next
 * to this module's parent directory.
 *
 * Each discovered file is dynamically imported and its `test` and `json`
 * exports are collected together with a display name from `evalName`.
 *
 * @returns The loaded evals, in the order `findTestFiles` yields them.
 */
export async function getEvals() {
    // Imported locally so this function does not depend on a new file-level import.
    const { pathToFileURL } = await import("node:url");
    const evals = [];
    const evalsPath = path.join(import.meta.dirname, "..", "evals");
    for await (const testFile of findTestFiles(evalsPath, false)) {
        // `import()` takes URL-like specifiers. A bare absolute filesystem path
        // is rejected on Windows, so convert it to a file:// URL first. The
        // relative-extension rewrite helper is dropped because it only acts on
        // "./" or "../" specifiers, which a file:// URL never is.
        const { test, json } = await import(pathToFileURL(testFile).href);
        evals.push({ test, json, name: evalName(testFile) });
    }
    return evals;
}
|
|
20
|
+
/**
 * Derives an eval's display name from its file path: the name of the
 * containing directory, a slash, and the file name without a trailing ".js".
 *
 * @param file - Path to the eval module.
 * @returns e.g. "tools/simple-tool" for ".../tools/simple-tool.js".
 */
export function evalName(file) {
    // The dot is escaped so only a literal ".js" suffix is stripped; the
    // unescaped form also ate the last three characters of names like "nodejs".
    return `${path.basename(path.dirname(file))}/${path.basename(file).replace(/\.js$/, "")}`;
}
|
|
23
|
+
/**
 * Recursively yields the paths of all ".js" files under `dir`.
 *
 * If `dir` itself cannot be stat'ed, `${dir}.js` is tried instead, so a
 * caller may name a single eval without its extension; when that file exists
 * it is the only path yielded.
 *
 * @param dir - Directory to walk, or an eval path without the ".js" suffix.
 * @param skipReasoning - When true, subdirectories named "reasoning" are not
 *   descended into.
 * @throws The error from stat'ing `dir` when neither `dir` nor `${dir}.js`
 *   is usable; errors from `readdir` and per-entry `stat` propagate.
 */
export async function* findTestFiles(dir, skipReasoning) {
    try {
        await fs.stat(dir);
    }
    catch (e) {
        const pathname = `${dir}.js`;
        // A failure of the fallback lookup must not replace the original
        // error, which names the path the caller actually asked for.
        const fallback = await fs.stat(pathname).catch(() => null);
        if (fallback?.isFile()) {
            yield pathname;
            return;
        }
        throw e;
    }
    const entryNames = await fs.readdir(dir);
    const entries = await Promise.all(entryNames.map(async (entry) => {
        const entryPath = path.join(dir, entry);
        return { path: entryPath, stat: await fs.stat(entryPath) };
    }));
    for (const entry of entries) {
        if (entry.stat.isFile()) {
            if (entry.path.endsWith(".js")) {
                yield entry.path;
            }
        }
        else if (entry.stat.isDirectory()) {
            if (skipReasoning && path.basename(entry.path) === "reasoning")
                continue;
            yield* findTestFiles(entry.path, skipReasoning);
        }
    }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
|
|
2
|
+
import { getEvals } from "./evals.js";
|
|
3
|
+
// Smoke test for eval discovery: every loaded eval must expose a string
// name, a truthy request body and a test function.
describe("get-evals", () => {
    it("works", async () => {
        const evals = await getEvals();
        // Without this guard the per-eval checks below never run when
        // discovery finds nothing, and the test passes vacuously.
        expect(evals.length).toBeGreaterThan(0);
        for (const { test, json, name } of evals) {
            expect(name).toBeTypeOf("string");
            expect(json).toBeTruthy();
            expect(test).toBeTypeOf("function");
        }
    });
});
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { getEvals } from "./evals.js";
|