@syntheticlab/synbad 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -0
- package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +13 -0
- package/dist/evals/reasoning/multiturn-reasoning-parsing.js +16 -0
- package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +46 -0
- package/dist/evals/reasoning/reasoning-claude-tool-call.js +60 -0
- package/dist/evals/reasoning/reasoning-parsing.d.ts +8 -0
- package/dist/evals/reasoning/reasoning-parsing.js +10 -0
- package/dist/evals/tools/claude-dash.d.ts +1249 -0
- package/dist/evals/tools/claude-dash.js +637 -0
- package/dist/evals/tools/crush-list-files.d.ts +676 -0
- package/dist/evals/tools/crush-list-files.js +441 -0
- package/dist/evals/tools/parallel-tool.d.ts +27 -0
- package/dist/evals/tools/parallel-tool.js +33 -0
- package/dist/evals/tools/simple-tool.d.ts +27 -0
- package/dist/evals/tools/simple-tool.js +37 -0
- package/dist/source/asserts.d.ts +6 -0
- package/dist/source/asserts.js +54 -0
- package/dist/source/chat-completion.d.ts +117 -0
- package/dist/source/chat-completion.js +113 -0
- package/dist/source/index.d.ts +2 -0
- package/dist/source/index.js +107 -0
- package/evals/reasoning/multiturn-reasoning-parsing.ts +19 -0
- package/evals/reasoning/reasoning-claude-tool-call.ts +63 -0
- package/evals/reasoning/reasoning-parsing.ts +13 -0
- package/evals/tools/claude-dash.ts +640 -0
- package/evals/tools/crush-list-files.ts +451 -0
- package/evals/tools/parallel-tool.ts +36 -0
- package/evals/tools/simple-tool.ts +40 -0
- package/package.json +37 -0
- package/source/asserts.ts +70 -0
- package/source/chat-completion.ts +140 -0
- package/source/index.ts +115 -0
- package/tsconfig.json +32 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import * as assert from "../../source/asserts.js";
|
|
2
|
+
/**
 * Validates the response to the single-tool weather prompt.
 *
 * Passes when the first choice's message contains exactly one tool call,
 * that call is a `function` call to `get_weather`, and its JSON-encoded
 * arguments carry a `location` matching "paris" (case-insensitive).
 * Throws on the first check that fails.
 */
export function test(response) {
    const toolCalls = response.choices[0].message.tool_calls;
    assert.isNotNullish(toolCalls);
    assert.isNotEmptyArray(toolCalls);
    assert.strictEqual(toolCalls.length, 1);
    const onlyCall = toolCalls[0];
    assert.strictEqual(onlyCall.type, "function");
    assert.strictEqual(onlyCall.function.name, "get_weather");
    // The arguments arrive as a JSON string and must be decoded first.
    const parsedArgs = JSON.parse(onlyCall.function.arguments);
    const normalizedLocation = parsedArgs.location.toLowerCase();
    assert.match(normalizedLocation, /paris/);
}
|
|
12
|
+
/**
 * Request body for the single-tool eval: one user question about the
 * weather in Paris plus a `get_weather` function tool that requires a
 * `location` string.
 */
export const json = {
    messages: [
        {
            role: "user",
            content: "What's the weather in Paris?",
        },
    ],
    tools: [
        {
            type: "function",
            function: {
                name: "get_weather",
                description: "Get current weather for a location",
                parameters: {
                    type: "object",
                    properties: {
                        location: { type: "string", description: "City name" },
                    },
                    required: ["location"],
                },
            },
        },
    ],
    parallel_tool_calls: true,
    tool_choice: "auto",
};
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { deepEqual, deepStrictEqual, doesNotMatch, doesNotReject, doesNotThrow, equal, fail, ifError, match, notDeepEqual, notDeepStrictEqual, notEqual, notStrictEqual, ok, rejects, strictEqual, throws, } from "assert";
|
|
2
|
+
/**
 * Runs `a`; if it throws, tries the functions in `rest` in order.
 * Returns as soon as one of them completes without throwing, and throws
 * when every one of them fails.
 */
export declare function or(a: () => void, ...rest: Array<() => void>): void;
|
|
3
|
+
/**
 * Asserts that `a` is `null` or `undefined`; throws an `AssertionError`
 * for any other value.
 */
export declare function isNullish(a: unknown): asserts a is null | undefined;
|
|
4
|
+
/**
 * Asserts that `a` is neither `null` nor `undefined`; throws an
 * `AssertionError` otherwise.
 */
export declare function isNotNullish<T extends any>(a: T): asserts a is Exclude<T, null | undefined>;
|
|
5
|
+
/**
 * Checks that `a` has no elements. Returns `true` on success and throws
 * an `AssertionError` otherwise (it never returns `false`).
 */
export declare function isEmptyArray(a: any[]): boolean;
|
|
6
|
+
/**
 * Checks that `a` has at least one element. Returns `true` on success and
 * throws an `AssertionError` otherwise (it never returns `false`).
 */
export declare function isNotEmptyArray(a: any[]): boolean;
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import assert from "assert";
|
|
2
|
+
export { deepEqual, deepStrictEqual, doesNotMatch, doesNotReject, doesNotThrow, equal, fail, ifError, match, notDeepEqual, notDeepStrictEqual, notEqual, notStrictEqual, ok, rejects, strictEqual, throws, } from "assert";
|
|
3
|
+
export function or(a, ...rest) {
|
|
4
|
+
try {
|
|
5
|
+
a();
|
|
6
|
+
}
|
|
7
|
+
catch (aErr) {
|
|
8
|
+
if (rest.length === 0)
|
|
9
|
+
throw aErr;
|
|
10
|
+
try {
|
|
11
|
+
or(rest[0], ...rest.slice(1));
|
|
12
|
+
}
|
|
13
|
+
catch (bErr) {
|
|
14
|
+
throw new assert.AssertionError({
|
|
15
|
+
message: `Tried multiple asserts, but they all failed.\n${aErr}\n\n${bErr}`,
|
|
16
|
+
});
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
export function isNullish(a) {
|
|
21
|
+
if (a == null)
|
|
22
|
+
return;
|
|
23
|
+
throw new assert.AssertionError({
|
|
24
|
+
message: "Expected a null or undefined value",
|
|
25
|
+
actual: a,
|
|
26
|
+
expected: null,
|
|
27
|
+
operator: "==",
|
|
28
|
+
});
|
|
29
|
+
}
|
|
30
|
+
export function isNotNullish(a) {
|
|
31
|
+
if (a != null)
|
|
32
|
+
return;
|
|
33
|
+
throw new assert.AssertionError({
|
|
34
|
+
message: "Expected a non-null, non-undefined value",
|
|
35
|
+
actual: a,
|
|
36
|
+
});
|
|
37
|
+
}
|
|
38
|
+
export function isEmptyArray(a) {
|
|
39
|
+
if (a.length === 0)
|
|
40
|
+
return true;
|
|
41
|
+
throw new assert.AssertionError({
|
|
42
|
+
message: "Expected an empty array",
|
|
43
|
+
actual: a,
|
|
44
|
+
expected: [],
|
|
45
|
+
});
|
|
46
|
+
}
|
|
47
|
+
export function isNotEmptyArray(a) {
|
|
48
|
+
if (a.length !== 0)
|
|
49
|
+
return true;
|
|
50
|
+
throw new assert.AssertionError({
|
|
51
|
+
message: "Expected a non-empty array",
|
|
52
|
+
actual: a,
|
|
53
|
+
});
|
|
54
|
+
}
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import { t } from "structural";
|
|
2
|
+
import OpenAI from "openai";
|
|
3
|
+
/**
 * An OpenAI chat completion whose choice messages may additionally carry
 * an optional `reasoning_content` string.
 */
export type ChatResponse = OpenAI.ChatCompletion & {
    choices: {
        message: {
            reasoning_content?: string;
        };
    }[];
};
|
|
10
|
+
/**
 * Type of the exported `ChatCompletion` schema value.
 *
 * `messages` is an array whose elements are one of five message shapes,
 * told apart by `role` ("system", "user", "assistant", "tool",
 * "function"). Every other key is optional.
 */
export declare const ChatCompletion: t.Struct<{
    messages: t.Arr<
        // role: "system"
        t.UnwrappedTypeStruct<{
            content: t.Type<string | Array<t.UnwrappedTypeStruct<{
                type: t.Value<"text">;
                text: t.TypeOf<string>;
            }>>>;
            role: t.Value<"system">;
            name: t.OptionalKey<t.TypeOf<string>>;
        }> |
        // role: "user"
        t.UnwrappedTypeStruct<{
            content: t.Type<string | Array<t.UnwrappedTypeStruct<{
                type: t.Value<"text">;
                text: t.TypeOf<string>;
            }> | t.UnwrappedTypeStruct<{
                type: t.Value<"image_url">;
                image_url: t.Struct<{
                    url: t.TypeOf<string>;
                }>;
            }>>>;
            role: t.Value<"user">;
            name: t.OptionalKey<t.TypeOf<string>>;
        }> |
        // role: "assistant"
        t.UnwrappedTypeStruct<{
            content: t.OptionalKey<t.Type<string | Array<t.UnwrappedTypeStruct<{
                type: t.Value<"text">;
                text: t.TypeOf<string>;
            }> | t.UnwrappedTypeStruct<{
                type: t.Value<"refusal">;
                refusal: t.TypeOf<string>;
            }>> | null>>;
            role: t.Value<"assistant">;
            tool_calls: t.OptionalKey<t.Arr<t.UnwrappedTypeStruct<{
                id: t.TypeOf<string>;
                type: t.Value<"function">;
                function: t.Struct<{
                    name: t.TypeOf<string>;
                    arguments: t.TypeOf<string>;
                }>;
            }>>>;
            function_call: t.OptionalKey<t.Struct<{
                arguments: t.TypeOf<string>;
                name: t.TypeOf<string>;
            }>>;
            reasoning_content: t.OptionalKey<t.Type<string | null>>;
        }> |
        // role: "tool"
        t.UnwrappedTypeStruct<{
            role: t.Value<"tool">;
            content: t.Type<string | Array<t.UnwrappedTypeStruct<{
                type: t.Value<"text">;
                text: t.TypeOf<string>;
            }>>>;
            tool_call_id: t.TypeOf<string>;
        }> |
        // role: "function"
        t.UnwrappedTypeStruct<{
            role: t.Value<"function">;
            content: t.Type<string | null>;
            name: t.TypeOf<string>;
        }>
    >;
    user: t.OptionalKey<t.TypeOf<string>>;
    tools: t.OptionalKey<t.Arr<t.UnwrappedTypeStruct<{
        type: t.Value<"function">;
        function: t.Struct<{
            description: t.OptionalKey<t.TypeOf<string>>;
            name: t.TypeOf<string>;
            parameters: t.OptionalKey<t.Any>;
            strict: t.OptionalKey<t.TypeOf<boolean>>;
        }>;
    }>>>;
    tool_choice: t.OptionalKey<t.Type<"auto" | "none" | "required" | t.UnwrappedTypeStruct<{
        type: t.Value<"function">;
        function: t.Struct<{
            name: t.TypeOf<string>;
        }>;
    }>>>;
    parallel_tool_calls: t.OptionalKey<t.TypeOf<boolean>>;
    function_call: t.OptionalKey<t.Type<"auto" | "none" | t.UnwrappedTypeStruct<{
        name: t.TypeOf<string>;
    }>>>;
    functions: t.OptionalKey<t.Arr<t.UnwrappedTypeStruct<{
        description: t.OptionalKey<t.TypeOf<string>>;
        name: t.TypeOf<string>;
        parameters: t.OptionalKey<t.Any>;
    }>>>;
    frequency_penalty: t.OptionalKey<t.Type<number | null>>;
    logit_bias: t.OptionalKey<t.Dict<number>>;
    logprobs: t.OptionalKey<t.Type<number | boolean | null>>;
    top_k: t.OptionalKey<t.Type<number | null>>;
    top_logprobs: t.OptionalKey<t.Type<number | null>>;
    max_tokens: t.OptionalKey<t.Type<number | null>>;
    max_completion_tokens: t.OptionalKey<t.Type<number | null>>;
    n: t.OptionalKey<t.Type<number | null>>;
    presence_penalty: t.OptionalKey<t.Type<number | null>>;
    min_p: t.OptionalKey<t.Type<number | null>>;
    response_format: t.OptionalKey<t.Type<t.UnwrappedTypeStruct<{
        type: t.Type<"text" | "json_object">;
    }> | t.UnwrappedTypeStruct<{
        type: t.Value<"json_schema">;
        json_schema: t.Struct<{
            name: t.TypeOf<string>;
            description: t.OptionalKey<t.TypeOf<string>>;
            schema: t.Any;
            strict: t.OptionalKey<t.Type<boolean | null>>;
        }>;
    }>>>;
    seed: t.OptionalKey<t.Type<number | null>>;
    stop: t.OptionalKey<t.Type<string | Array<string> | null>>;
    stream: t.OptionalKey<t.Type<boolean | null>>;
    temperature: t.OptionalKey<t.Type<number | null>>;
    top_p: t.OptionalKey<t.Type<number | null>>;
    reasoning_effort: t.OptionalKey<t.Type<"low" | "medium" | "high">>;
    enable_thinking: t.OptionalKey<t.TypeOf<boolean>>;
}>;
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import { t } from "structural";
|
|
2
|
+
// ---- Content parts -------------------------------------------------------
// Each part is tagged by a literal `type` value.
const TextContentPart = t.subtype({ type: t.value("text"), text: t.str });
const ImageContentPart = t.subtype({
    type: t.value("image_url"),
    image_url: t.subtype({ url: t.str }),
});
const RefusalContentPart = t.subtype({ type: t.value("refusal"), refusal: t.str });

// ---- Message content -----------------------------------------------------
// Content is either a plain string or an array of the parts allowed for
// the role in question.
const TextContent = t.str.or(t.array(TextContentPart));
const userPart = TextContentPart.or(ImageContentPart);
const UserContent = t.str.or(t.array(userPart));
const assistantPart = TextContentPart.or(RefusalContentPart);
const AssistantContent = t.str.or(t.array(assistantPart));

// ---- Messages, one schema per role ---------------------------------------
const ToolResultSchema = t.subtype({
    role: t.value("tool"),
    content: TextContent,
    tool_call_id: t.str,
});
// A tool call as it appears on an assistant message; `arguments` is a string.
const ToolCall = t.subtype({
    id: t.str,
    type: t.value("function"),
    function: t.subtype({ name: t.str, arguments: t.str }),
});
const AssistantMessageSchema = t.subtype({
    content: t.optional(AssistantContent.or(t.nil)),
    role: t.value("assistant"),
    tool_calls: t.optional(t.array(ToolCall)),
    function_call: t.optional(t.subtype({ arguments: t.str, name: t.str })),
    reasoning_content: t.optional(t.str.or(t.nil)),
});
const UserMessageSchema = t.subtype({
    content: UserContent,
    role: t.value("user"),
    name: t.optional(t.str),
});
const SystemMessageSchema = t.subtype({
    content: TextContent,
    role: t.value("system"),
    name: t.optional(t.str),
});
const FunctionResultSchema = t.subtype({
    role: t.value("function"),
    content: t.str.or(t.nil),
    name: t.str
});
// Any message: system, user, assistant, tool result or function result.
const ChatCompletionMessage = SystemMessageSchema
    .or(UserMessageSchema)
    .or(AssistantMessageSchema)
    .or(ToolResultSchema)
    .or(FunctionResultSchema);

// ---- Misc ----------------------------------------------------------------
const ReasoningSchema = t.value("low").or(t.value("medium")).or(t.value("high"));
// A function tool definition as listed under `tools`.
const ToolDef = t.subtype({
    type: t.value("function"),
    function: t.subtype({
        description: t.optional(t.str),
        name: t.str,
        parameters: t.optional(t.any),
        strict: t.optional(t.bool),
    }),
});
|
|
66
|
+
// Builds a fresh "number or nil" schema for each key that needs one.
const nullableNumber = () => t.num.or(t.nil);
// `tool_choice`: a mode keyword or a specific named function.
const ToolChoiceSchema = t.value("auto").or(t.value("none").or(t.value("required"))).or(t.subtype({
    type: t.value("function"),
    function: t.subtype({
        name: t.str
    }),
}));
// `function_call`: a mode keyword or an object naming the function.
const FunctionCallChoiceSchema = t.value("auto").or(t.value("none")).or(t.subtype({
    name: t.str,
}));
// One entry of the `functions` list.
const FunctionDefSchema = t.subtype({
    description: t.optional(t.str),
    name: t.str,
    parameters: t.optional(t.any),
});
// `response_format`: text / json_object, or a named JSON schema.
const ResponseFormatSchema = t.subtype({
    type: t.value("text").or(t.value("json_object")),
}).or(t.subtype({
    type: t.value("json_schema"),
    json_schema: t.subtype({
        name: t.str,
        description: t.optional(t.str),
        schema: t.any,
        strict: t.optional(t.bool.or(t.nil)),
    }),
}));
/**
 * Schema with the fields of an OpenAI-style chat completion request:
 * the message list plus optional tool, sampling and output-format
 * settings. Only `messages` is required.
 */
export const ChatCompletion = t.subtype({
    messages: t.array(ChatCompletionMessage),
    user: t.optional(t.str),
    tools: t.optional(t.array(ToolDef)),
    tool_choice: t.optional(ToolChoiceSchema),
    parallel_tool_calls: t.optional(t.bool),
    function_call: t.optional(FunctionCallChoiceSchema),
    functions: t.optional(t.array(FunctionDefSchema)),
    frequency_penalty: t.optional(nullableNumber()),
    logit_bias: t.optional(t.dict(t.num)),
    logprobs: t.optional(t.bool.or(t.nil).or(t.num)),
    top_k: t.optional(nullableNumber()),
    top_logprobs: t.optional(nullableNumber()),
    max_tokens: t.optional(nullableNumber()),
    max_completion_tokens: t.optional(nullableNumber()),
    n: t.optional(nullableNumber()),
    presence_penalty: t.optional(nullableNumber()),
    min_p: t.optional(nullableNumber()),
    response_format: t.optional(ResponseFormatSchema),
    seed: t.optional(nullableNumber()),
    stop: t.optional(t.str.or(t.array(t.str)).or(t.nil)),
    stream: t.optional(t.bool.or(t.nil)),
    temperature: t.optional(nullableNumber()),
    top_p: t.optional(nullableNumber()),
    reasoning_effort: t.optional(ReasoningSchema),
    enable_thinking: t.optional(t.bool),
});
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// Rewrites the TypeScript extension of a relative import specifier to the
// matching JavaScript extension (`.ts` -> `.js`, `.mts` -> `.mjs`,
// `.tsx` -> `.js`, or `.jsx` when `preserveJsx` is set). Plain `.d.ts`
// specifiers, non-relative specifiers and non-string values are returned
// unchanged.
// NOTE(review): looks like a compiler-emitted helper; confirm before
// hand-editing.
var __rewriteRelativeImportExtension = (this && this.__rewriteRelativeImportExtension) || function (path, preserveJsx) {
    const isRelativeSpecifier = typeof path === "string" && /^\.\.?\//.test(path);
    if (!isRelativeSpecifier) {
        return path;
    }
    return path.replace(/\.(tsx)$|((?:\.d)?)((?:\.[^./]+?)?)\.([cm]?)ts$/i, function (m, tsx, d, ext, cm) {
        if (tsx) {
            return preserveJsx ? ".jsx" : ".js";
        }
        // Declaration-file suffixes are kept as they are unless both an
        // inner extension and a c/m marker are present.
        if (d && (!ext || !cm)) {
            return m;
        }
        return d + ext + "." + cm.toLowerCase() + "js";
    });
};
|
|
10
|
+
import { Command } from "@commander-js/extra-typings";
|
|
11
|
+
import fs from "fs/promises";
|
|
12
|
+
import path from "path";
|
|
13
|
+
import OpenAI from "openai";
|
|
14
|
+
// Root command of the `synbad` CLI.
const cli = new Command()
    .name("synbad")
    .description("A set of evals for LLM inference providers");
// `synbad eval`: discovers eval modules, sends each module's `json` request
// to the provider (optionally several times), and hands every response to
// the module's `test` function. Exits 0 only when every eval passed.
cli.command("eval")
    .description("Runs the evals")
    .requiredOption("--env-var <env var name>", "The env var to use to authenticate with the inference provider")
    .requiredOption("--base-url <base url>", "The base URL for the inference provider")
    .option("--skip-reasoning", "Skip reasoning evals (set this for non-reasoning models)")
    .option("--only <eval path within synbad>", "Specific evals you want to run, e.g. evals/reasoning or evals/tools/claude-dash")
    .option("--count <num times>", "Number of times to run the eval. Any failures count as an overall failure")
    .requiredOption("--model <model name>", "The model name to test")
    .action(async ({ model, envVar, baseUrl, only, count, skipReasoning }) => {
    const apiKey = process.env[envVar];
    if (!apiKey) {
        console.error(`No env var named ${envVar} exists for the current process`);
        process.exit(1);
    }
    // A non-numeric, zero or negative count would make the run loop execute
    // zero times and report the eval as passed, so reject it up front.
    const maxRuns = count == null ? 1 : parseInt(count, 10);
    if (!Number.isInteger(maxRuns) || maxRuns < 1) {
        console.error(`--count must be a positive integer, but got "${count}"`);
        process.exit(1);
    }
    const client = new OpenAI({
        apiKey,
        baseURL: baseUrl,
    });
    const evalsRoot = path.join(import.meta.dirname, "../evals");
    const evalPath = only ? path.join(import.meta.dirname, "..", only) : evalsRoot;
    let found = 0;
    const failures = new Set();
    for await (const testFile of findTestFiles(evalPath)) {
        // Honour --skip-reasoning: anything below evals/reasoning is skipped.
        const topLevelGroup = path.relative(evalsRoot, testFile).split(path.sep)[0];
        if (skipReasoning && topLevelGroup === "reasoning") {
            continue;
        }
        found++;
        const test = await import(__rewriteRelativeImportExtension(testFile));
        const json = test.json;
        const name = evalName(testFile);
        process.stdout.write(`Running ${name}...`);
        try {
            for (let i = 0; i < maxRuns; i++) {
                if (maxRuns > 1) {
                    process.stdout.write(` ${i + 1}/${maxRuns}`);
                }
                const response = await client.chat.completions.create({
                    model,
                    ...json,
                });
                // Awaiting is harmless for synchronous tests and lets an
                // eval return a promise.
                await test.test(response);
            }
            process.stdout.write(" ✅ passed\n");
        }
        catch (e) {
            // A single failed run fails the whole eval.
            failures.add(testFile);
            console.error(e);
            console.error(`❌ ${name} failed`);
        }
    }
    // Running nothing must not be reported as success.
    if (found === 0) {
        console.error(`No evals were run from ${evalPath}`);
        process.exit(1);
    }
    const passed = found - failures.size;
    if (passed === found) {
        console.log("\n✅ All evals passed!");
        process.exit(0);
    }
    console.log("");
    console.log(`
${passed}/${found} evals passed. Failures:

- ${Array.from(failures).map(evalName).join("\n- ")}
`.trim());
    // Signal the failure to the calling shell / CI job.
    process.exit(1);
});
|
|
75
|
+
/**
 * Builds the display name of an eval from its file path:
 * `<parent directory>/<file name without the ".js" extension>`.
 *
 * @param file - Path of the eval module.
 * @returns For example `tools/simple-tool` for `.../tools/simple-tool.js`.
 */
function evalName(file) {
    const group = path.basename(path.dirname(file));
    // The dot is escaped so only a literal ".js" suffix is stripped.
    const name = path.basename(file).replace(/\.js$/, "");
    return `${group}/${name}`;
}
|
|
78
|
+
async function* findTestFiles(dir) {
|
|
79
|
+
try {
|
|
80
|
+
await fs.stat(dir);
|
|
81
|
+
}
|
|
82
|
+
catch (e) {
|
|
83
|
+
const pathname = `${dir}.js`;
|
|
84
|
+
const stat = await fs.stat(pathname);
|
|
85
|
+
if (stat.isFile()) {
|
|
86
|
+
yield pathname;
|
|
87
|
+
return;
|
|
88
|
+
}
|
|
89
|
+
throw e;
|
|
90
|
+
}
|
|
91
|
+
const entryNames = await fs.readdir(dir);
|
|
92
|
+
const entries = await Promise.all(entryNames.map(async (entry) => {
|
|
93
|
+
return {
|
|
94
|
+
path: path.join(dir, entry),
|
|
95
|
+
stat: await fs.stat(path.join(dir, entry)),
|
|
96
|
+
};
|
|
97
|
+
}));
|
|
98
|
+
for (const entry of entries) {
|
|
99
|
+
if (entry.stat.isFile() && entry.path.endsWith(".js")) {
|
|
100
|
+
yield entry.path;
|
|
101
|
+
}
|
|
102
|
+
if (entry.stat.isDirectory()) {
|
|
103
|
+
yield* findTestFiles(entry.path);
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
cli.parse();
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import * as assert from "../../source/asserts.ts";
|
|
2
|
+
import { ChatResponse } from "../../source/chat-completion.ts";
|
|
3
|
+
|
|
4
|
+
/**
 * Passes when the first choice's message carries a `reasoning_content`
 * value that is neither `null` nor `undefined`.
 */
export function test(response: ChatResponse) {
  const { message } = response.choices[0];
  assert.isNotNullish(message.reasoning_content);
}
|
|
8
|
+
|
|
9
|
+
/**
 * Request body for the multi-turn reasoning eval: a conversation whose
 * earlier assistant turn already includes `reasoning_content`, followed by
 * a user follow-up.
 */
export const json = {
  messages: [
    {
      role: "user",
      content: "Why does 1+1=2?",
    },
    {
      role: "assistant",
      reasoning_content: "Because it does",
      content: "Consider the successor function",
    },
    {
      role: "user",
      content: "please explain that much more deeply",
    },
  ],
};
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import OpenAI from "openai";
|
|
2
|
+
import * as assert from "../../source/asserts.ts";
|
|
3
|
+
|
|
4
|
+
/**
 * Passes when the first choice's message contains exactly one tool call.
 * Throws on the first check that fails.
 */
export function test(response: OpenAI.ChatCompletion) {
  const toolCalls = response.choices[0].message.tool_calls;
  assert.isNotNullish(toolCalls);
  assert.isNotEmptyArray(toolCalls);
  assert.strictEqual(toolCalls.length, 1);
}
|
|
10
|
+
|
|
11
|
+
/**
 * Request body for the "tool call inside thinking" eval: a Claude Code
 * style system prompt, a user request to run `git status`, high reasoning
 * effort, and a single `Bash` function tool.
 */
export const json = {
  messages: [
    {
      role: "system",
      content: "You are Claude Code, Anthropic's official CLI for Claude.\n\nYou are an interactive CLI tool that helps users with software engineering tasks. Use the instructions below and the tools available to you to assist the user.",
    },
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "run a quick git status for me. put the tool call inside your thinking",
        },
      ],
    },
  ],
  max_tokens: 32000,
  temperature: 1,
  reasoning_effort: "high",
  tools: [
    {
      type: "function",
      function: {
        name: "Bash",
        description: "Executes a given bash command in a persistent shell session with optional timeout, ensuring proper handling and security measures.",
        parameters: {
          type: "object",
          properties: {
            command: {
              type: "string",
              description: "The command to execute",
            },
            timeout: {
              type: "number",
              description: "Optional timeout in milliseconds (max 600000)",
            },
            description: {
              type: "string",
              description: "Clear, concise description of what this command does in 5-10 words, in active voice. Examples:\nInput: ls\nOutput: List files in current directory\n\nInput: git status\nOutput: Show working tree status\n\nInput: npm install\nOutput: Install package dependencies\n\nInput: mkdir foo\nOutput: Create directory 'foo'",
            },
            run_in_background: {
              type: "boolean",
              description: "Set to true to run this command in the background. Use BashOutput to read the output later.",
            },
          },
          required: ["command"],
        },
      },
    },
  ],
};
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import * as assert from "../../source/asserts.ts";
|
|
2
|
+
import { ChatResponse } from "../../source/chat-completion.ts";
|
|
3
|
+
|
|
4
|
+
/**
 * Passes when the first choice's message has a `reasoning_content` value
 * that is neither `null` nor `undefined`.
 */
export function test(response: ChatResponse) {
  assert.isNotNullish(response.choices[0].message.reasoning_content);
}
|
|
8
|
+
|
|
9
|
+
/** Request body for the basic reasoning eval: a single user question. */
export const json = {
  messages: [
    {
      role: "user",
      content: "Why does 1+1=2?",
    },
  ],
};
|