@syntheticlab/synbad 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/README.md +77 -0
  2. package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +13 -0
  3. package/dist/evals/reasoning/multiturn-reasoning-parsing.js +16 -0
  4. package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +46 -0
  5. package/dist/evals/reasoning/reasoning-claude-tool-call.js +60 -0
  6. package/dist/evals/reasoning/reasoning-parsing.d.ts +8 -0
  7. package/dist/evals/reasoning/reasoning-parsing.js +10 -0
  8. package/dist/evals/tools/claude-dash.d.ts +1249 -0
  9. package/dist/evals/tools/claude-dash.js +637 -0
  10. package/dist/evals/tools/crush-list-files.d.ts +676 -0
  11. package/dist/evals/tools/crush-list-files.js +441 -0
  12. package/dist/evals/tools/parallel-tool.d.ts +27 -0
  13. package/dist/evals/tools/parallel-tool.js +33 -0
  14. package/dist/evals/tools/simple-tool.d.ts +27 -0
  15. package/dist/evals/tools/simple-tool.js +37 -0
  16. package/dist/source/asserts.d.ts +6 -0
  17. package/dist/source/asserts.js +54 -0
  18. package/dist/source/chat-completion.d.ts +117 -0
  19. package/dist/source/chat-completion.js +113 -0
  20. package/dist/source/index.d.ts +2 -0
  21. package/dist/source/index.js +107 -0
  22. package/evals/reasoning/multiturn-reasoning-parsing.ts +19 -0
  23. package/evals/reasoning/reasoning-claude-tool-call.ts +63 -0
  24. package/evals/reasoning/reasoning-parsing.ts +13 -0
  25. package/evals/tools/claude-dash.ts +640 -0
  26. package/evals/tools/crush-list-files.ts +451 -0
  27. package/evals/tools/parallel-tool.ts +36 -0
  28. package/evals/tools/simple-tool.ts +40 -0
  29. package/package.json +37 -0
  30. package/source/asserts.ts +70 -0
  31. package/source/chat-completion.ts +140 -0
  32. package/source/index.ts +115 -0
  33. package/tsconfig.json +32 -0
@@ -0,0 +1,70 @@
1
+ import assert from "assert";
2
+ export {
3
+ deepEqual,
4
+ deepStrictEqual,
5
+ doesNotMatch,
6
+ doesNotReject,
7
+ doesNotThrow,
8
+ equal,
9
+ fail,
10
+ ifError,
11
+ match,
12
+ notDeepEqual,
13
+ notDeepStrictEqual,
14
+ notEqual,
15
+ notStrictEqual,
16
+ ok,
17
+ rejects,
18
+ strictEqual,
19
+ throws,
20
+ } from "assert";
21
+
22
/**
 * Passes if at least one of the given assertion callbacks completes without
 * throwing. Callbacks run in order and evaluation stops at the first success.
 *
 * @param a - First assertion to try.
 * @param rest - Fallback assertions, tried in order while earlier ones throw.
 * @throws The original error, unchanged, when a single assertion was supplied
 *   and it failed; otherwise an `AssertionError` whose message lists every
 *   failure once, in the order the assertions were tried.
 */
export function or(a: () => void, ...rest: Array<() => void>) {
  const errors: unknown[] = [];
  for(const attempt of [a, ...rest]) {
    try {
      attempt();
      return; // first success wins; later alternatives are never invoked
    } catch(err) {
      errors.push(err);
    }
  }
  // A lone assertion keeps its own error so callers see the real failure type.
  if(errors.length === 1) throw errors[0];
  // One flat report: no nested AssertionErrors, no repeated header.
  throw new assert.AssertionError({
    message: `Tried multiple asserts, but they all failed.\n${errors.map(e => String(e)).join("\n\n")}`,
  });
}
36
+
37
/**
 * Asserts that a value is `null` or `undefined`.
 *
 * @param a - Value to check.
 * @throws AssertionError (operator `==`, expected `null`) for any other value,
 *   including falsy ones such as `0`, `""` and `false`.
 */
export function isNullish(a: unknown): asserts a is null | undefined {
  const present = a !== null && a !== undefined;
  if(present) {
    throw new assert.AssertionError({
      message: "Expected a null or undefined value",
      actual: a,
      expected: null,
      operator: "==",
    });
  }
}
46
+
47
/**
 * Asserts that a value is neither `null` nor `undefined`, narrowing its type
 * for the code that follows the call.
 *
 * @param a - Value to check. Falsy values such as `0`, `""` and `false` pass.
 * @throws AssertionError (operator `!=`, expected `null`) when `a` is `null`
 *   or `undefined`.
 */
export function isNotNullish<T>(a: T): asserts a is NonNullable<T> {
  if(a != null) return;
  throw new assert.AssertionError({
    message: "Expected a non-null, non-undefined value",
    actual: a,
    // Mirror isNullish so failure reports show what was compared and how.
    expected: null,
    operator: "!=",
  });
}
54
+
55
/**
 * Asserts that an array has no elements.
 *
 * @param a - Array to check; readonly arrays and tuples are accepted.
 * @returns `true` when the array is empty.
 * @throws AssertionError (with `actual` set to the array) when it has at
 *   least one element.
 */
export function isEmptyArray(a: readonly unknown[]): true {
  if(a.length === 0) return true;
  throw new assert.AssertionError({
    message: "Expected an empty array",
    actual: a,
    expected: [],
  });
}
63
+
64
/**
 * Asserts that an array has at least one element.
 *
 * @param a - Array to check; readonly arrays and tuples are accepted.
 * @returns `true` when the array is non-empty.
 * @throws AssertionError (with `actual` set to the array) when it is empty.
 */
export function isNotEmptyArray(a: readonly unknown[]): true {
  if(a.length !== 0) return true;
  throw new assert.AssertionError({
    message: "Expected a non-empty array",
    actual: a,
  });
}
@@ -0,0 +1,140 @@
1
+ import { t } from "structural";
2
+ import OpenAI from "openai";
3
+
4
+ export type ChatResponse = OpenAI.ChatCompletion & { // the openai package's ChatCompletion type, intersected with an extra per-choice message field
5
+ choices: Array<{
6
+ message: {
7
+ reasoning_content?: string, // optional string carried on each choice's message
8
+ },
9
+ }>
10
+ };
11
+
12
+ const TextContentPart = t.subtype({ // content part whose `type` is the literal "text"
13
+ type: t.value("text"),
14
+ text: t.str,
15
+ });
16
+ const ImageContentPart = t.subtype({ // content part whose `type` is the literal "image_url"
17
+ type: t.value("image_url"),
18
+ image_url: t.subtype({
19
+ url: t.str,
20
+ }),
21
+ });
22
+ const RefusalContentPart = t.subtype({ // content part whose `type` is the literal "refusal"
23
+ type: t.value("refusal"),
24
+ refusal: t.str,
25
+ });
26
+
27
+ const TextContent = t.str.or(t.array(TextContentPart)); // a plain string, or an array of text parts
28
+ const UserContent = t.str.or(t.array(TextContentPart.or(ImageContentPart))); // a plain string, or an array of text/image parts
29
+ const AssistantContent = t.str.or(t.array(TextContentPart.or(RefusalContentPart))); // a plain string, or an array of text/refusal parts
30
+
31
+ const ToolResultSchema = t.subtype({ // message with role "tool"
32
+ role: t.value("tool"),
33
+ content: TextContent,
34
+ tool_call_id: t.str, // NOTE(review): presumably the `id` of the ToolCall being answered; nothing here enforces that link
35
+ });
36
+
37
+ const ToolCall = t.subtype({ // one tool-call entry; `type` is fixed to "function"
38
+ id: t.str,
39
+ type: t.value("function"),
40
+ function: t.subtype({
41
+ name: t.str,
42
+ arguments: t.str, // validated only as a string here; its contents are not parsed or checked
43
+ }),
44
+ });
45
+ const AssistantMessageSchema = t.subtype({ // message with role "assistant"
46
+ content: t.optional(AssistantContent.or(t.nil)), // may be omitted or nil
47
+ role: t.value("assistant"),
48
+ tool_calls: t.optional(t.array(ToolCall)),
49
+ function_call: t.optional(t.subtype({ // separate optional single-call field with its own name/arguments pair
50
+ arguments: t.str,
51
+ name: t.str,
52
+ })),
53
+ reasoning_content: t.optional(t.str.or(t.nil)), // may be omitted or nil, unlike ChatResponse's `reasoning_content?: string`
54
+ });
55
+
56
+ const UserMessageSchema = t.subtype({ // message with role "user"
57
+ content: UserContent, // string, or an array of text/image parts
58
+ role: t.value("user"),
59
+ name: t.optional(t.str),
60
+ });
61
+
62
+ const ChatCompletionMessage = t.subtype({ // union of every message shape, starting with the inline "system" message
63
+ content: TextContent,
64
+ role: t.value("system"),
65
+ name: t.optional(t.str),
66
+ }).or(
67
+ UserMessageSchema
68
+ ).or(
69
+ AssistantMessageSchema
70
+ ).or(
71
+ ToolResultSchema
72
+ ).or(t.subtype({ // inline "function"-role message: string-or-nil content plus a required name
73
+ role: t.value("function"),
74
+ content: t.str.or(t.nil),
75
+ name: t.str
76
+ }));
77
+
78
+ const ReasoningSchema = t.value("low").or(t.value("medium")).or(t.value("high")); // one of the literals "low", "medium" or "high"
79
+
80
+ const ToolDef = t.subtype({ // tool definition entry; `type` is fixed to "function"
81
+ type: t.value("function"),
82
+ function: t.subtype({
83
+ description: t.optional(t.str),
84
+ name: t.str,
85
+ parameters: t.optional(t.any), // accepted as-is (t.any); no structure is validated here
86
+ strict: t.optional(t.bool),
87
+ }),
88
+ });
89
+
90
+ export const ChatCompletion = t.subtype({ // exported schema: `messages` is the only required key, every other key is wrapped in t.optional
91
+ messages: t.array(ChatCompletionMessage),
92
+ user: t.optional(t.str),
93
+
94
+ tools: t.optional(t.array(ToolDef)),
95
+ tool_choice: t.optional(t.value("auto").or(t.value("none").or(t.value("required"))).or(t.subtype({ // "auto" | "none" | "required", or an object naming one function
96
+ type: t.value("function"),
97
+ function: t.subtype({
98
+ name: t.str
99
+ }),
100
+ }))),
101
+ parallel_tool_calls: t.optional(t.bool),
102
+ function_call: t.optional(t.value("auto").or(t.value("none")).or(t.subtype({ // "auto" | "none", or an object naming one function
103
+ name: t.str,
104
+ }))),
105
+ functions: t.optional(t.array(t.subtype({ // same fields as ToolDef's inner `function` object, minus `strict`
106
+ description: t.optional(t.str),
107
+ name: t.str,
108
+ parameters: t.optional(t.any),
109
+ }))),
110
+
111
+ frequency_penalty: t.optional(t.num.or(t.nil)),
112
+ logit_bias: t.optional(t.dict(t.num)),
113
+ logprobs: t.optional(t.bool.or(t.nil).or(t.num)), // boolean, nil, or a number are all accepted
114
+ top_k: t.optional(t.num.or(t.nil)),
115
+ top_logprobs: t.optional(t.num.or(t.nil)),
116
+ max_tokens: t.optional(t.num.or(t.nil)),
117
+ max_completion_tokens: t.optional(t.num.or(t.nil)),
118
+ n: t.optional(t.num.or(t.nil)),
119
+ presence_penalty: t.optional(t.num.or(t.nil)),
120
+ min_p: t.optional(t.num.or(t.nil)),
121
+ response_format: t.optional(t.subtype({ // either a bare { type: "text" | "json_object" } ...
122
+ type: t.value("text").or(t.value("json_object")),
123
+ }).or(t.subtype({ // ... or { type: "json_schema" } with a named schema attached
124
+ type: t.value("json_schema"),
125
+ json_schema: t.subtype({
126
+ name: t.str,
127
+ description: t.optional(t.str),
128
+ schema: t.any,
129
+ strict: t.optional(t.bool.or(t.nil)),
130
+ }),
131
+ }))),
132
+ seed: t.optional(t.num.or(t.nil)),
133
+ stop: t.optional(t.str.or(t.array(t.str)).or(t.nil)), // a single string, an array of strings, or nil
134
+ stream: t.optional(t.bool.or(t.nil)),
135
+ temperature: t.optional(t.num.or(t.nil)),
136
+ top_p: t.optional(t.num.or(t.nil)),
137
+
138
+ reasoning_effort: t.optional(ReasoningSchema), // "low" | "medium" | "high"
139
+ enable_thinking: t.optional(t.bool),
140
+ });
@@ -0,0 +1,115 @@
1
+ #!/usr/bin/env node
2
+ import { Command } from "@commander-js/extra-typings";
3
+ import fs from "fs/promises";
4
+ import path from "path";
5
+ import OpenAI from "openai";
6
+
7
/**
 * Command-line definition for synbad.
 *
 * The `eval` subcommand discovers compiled eval modules with `findTestFiles`,
 * sends each module's exported `json` request (with `--model` added) to the
 * provider at `--base-url`, and passes the response to the module's exported
 * `test` function. The process exits 0 only when at least one eval ran and
 * every eval passed; it exits 1 on bad options, when no evals were found, or
 * when any eval failed.
 */
const cli = new Command()
  .name("synbad")
  .description("A set of evals for LLM inference providers");

cli.command("eval")
  .description("Runs the evals")
  .requiredOption(
    "--env-var <env var name>", "The env var to use to authenticate with the inference provider"
  )
  .requiredOption(
    "--base-url <base url>", "The base URL for the inference provider"
  )
  .option(
    "--skip-reasoning", "Skip reasoning evals (set this for non-reasoning models)"
  )
  .option(
    "--only <eval path within synbad>", "Specific evals you want to run, e.g. evals/reasoning or evals/tools/claude-dash"
  )
  .option(
    "--count <num times>", "Number of times to run the eval. Any failures count as an overall failure",
  )
  .requiredOption("--model <model name>", "The model name to test")
  .action(async ({ model, envVar, baseUrl, only, count, skipReasoning }) => {
    const apiKey = process.env[envVar];
    if(!apiKey) {
      console.error(`No env var named ${envVar} exists for the current process`);
      process.exit(1);
    }

    // Reject NaN / zero / negative counts: they would run nothing and then
    // report the eval as passed.
    const maxRuns = count == null ? 1 : Number.parseInt(count, 10);
    if(!Number.isInteger(maxRuns) || maxRuns < 1) {
      console.error(`--count must be a positive integer, got "${count}"`);
      process.exit(1);
    }

    const client = new OpenAI({
      apiKey,
      baseURL: baseUrl,
    });

    const evalsRoot = path.join(import.meta.dirname, "../evals");
    const evalPath = only ? path.join(import.meta.dirname, "..", only) : evalsRoot;

    let found = 0;
    const failures = new Set<string>();
    for await(const testFile of findTestFiles(evalPath)) {
      // --skip-reasoning drops everything under <evals root>/reasoning.
      if(skipReasoning && path.relative(evalsRoot, testFile).split(path.sep)[0] === "reasoning") {
        continue;
      }
      found++;
      const test = await import(testFile);
      const json = test.json;
      const name = evalName(testFile);
      process.stdout.write(`Running ${name}...`);
      try {
        for(let i = 0; i < maxRuns; i++) {
          if(maxRuns > 1) {
            process.stdout.write(` ${i + 1}/${maxRuns}`);
          }
          const response = await client.chat.completions.create({
            model,
            ...json,
          });
          // Awaited so that an async test's rejection is caught below instead
          // of escaping as an unhandled rejection.
          await test.test(response);
        }
        process.stdout.write(" ✅ passed\n");
      } catch(e) {
        failures.add(testFile);
        console.error(e);
        console.error(`❌ ${name} failed`);
      }
    }

    // Running nothing is not a pass.
    if(found === 0) {
      console.error(`No evals found under ${evalPath}`);
      process.exit(1);
    }

    const passed = found - failures.size;
    if(passed === found) {
      console.log("\n✅ All evals passed!");
      process.exit(0);
    }

    console.log("");
    console.log([
      `${passed}/${found} evals passed. Failures:`,
      "",
      `- ${Array.from(failures).map(evalName).join("\n- ")}`,
    ].join("\n"));
    // Signal the failure to shells and CI.
    process.exit(1);
  });
81
+
82
/**
 * Builds the short display name of an eval file: the name of its parent
 * directory, a slash, and the file name with a trailing `.js` extension
 * removed (e.g. `/x/evals/tools/simple-tool.js` -> `tools/simple-tool`).
 *
 * @param file - Path to the eval module.
 * @returns `<parent directory>/<file name without .js>`.
 */
function evalName(file: string) {
  const group = path.basename(path.dirname(file));
  // The dot is escaped so only a literal ".js" suffix is stripped; an
  // unescaped dot would also eat the last character of names ending in "js".
  const name = path.basename(file).replace(/\.js$/, "");
  return `${group}/${name}`;
}
85
+
86
/**
 * Recursively yields the paths of `.js` files for an eval location.
 *
 * Resolution rules:
 * - `dir` is a directory: yields every `.js` file beneath it, recursing into
 *   subdirectories, in sorted name order at each level.
 * - `dir` is itself a `.js` file: yields just that file.
 * - `dir` does not exist but `<dir>.js` is a file: yields `<dir>.js`, so an
 *   eval can be named without its extension.
 *
 * @param dir - Directory, `.js` file, or extension-less eval path.
 * @throws The error from stat-ing `dir` when neither `dir` nor `<dir>.js`
 *   resolves to something usable.
 */
async function* findTestFiles(dir: string): AsyncGenerator<string> {
  let dirStat;
  try {
    dirStat = await fs.stat(dir);
  } catch(e) {
    const pathname = `${dir}.js`;
    let fallbackIsFile = false;
    try {
      fallbackIsFile = (await fs.stat(pathname)).isFile();
    } catch {
      // Ignored on purpose: the error for `dir` (the path the caller actually
      // asked for) is rethrown below.
    }
    if(!fallbackIsFile) throw e;
    yield pathname;
    return;
  }

  // A caller may point directly at a compiled eval file.
  if(dirStat.isFile() && dir.endsWith(".js")) {
    yield dir;
    return;
  }

  // Sorted so evals run in the same order on every platform.
  const entryNames = (await fs.readdir(dir)).sort();
  const entries = await Promise.all(entryNames.map(async (entry) => {
    const entryPath = path.join(dir, entry);
    return {
      path: entryPath,
      stat: await fs.stat(entryPath),
    };
  }));
  for(const entry of entries) {
    if(entry.stat.isFile() && entry.path.endsWith(".js")) {
      yield entry.path;
    }
    if(entry.stat.isDirectory()) {
      yield* findTestFiles(entry.path);
    }
  }
}
114
+
115
// The `eval` action handler is async, so commander must be driven through
// parseAsync(); plain parse() does not wait for the handler. Any rejection
// that escapes the handler is reported and turned into a non-zero exit.
cli.parseAsync().catch((err: unknown) => {
  console.error(err);
  process.exit(1);
});
package/tsconfig.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "include": [ "source", "evals" ],
3
+ "exclude": [ "node_modules" ],
4
+ "compilerOptions": {
5
+ "skipLibCheck": true,
6
+ "outDir": "dist",
7
+ "module": "node16",
8
+ "moduleResolution": "node16",
9
+ "moduleDetection": "force",
10
+ "target": "esnext",
11
+ "lib": [
12
+ "DOM",
13
+ "DOM.Iterable",
14
+ "ES2022" // Node.js 18
15
+ ],
16
+ "resolveJsonModule": false, // ESM doesn't yet support JSON modules.
17
+ "jsx": "react",
18
+ "declaration": true,
19
+ "newLine": "lf",
20
+ "stripInternal": true,
21
+ "strict": true,
22
+ "noErrorTruncation": true,
23
+ "noImplicitReturns": true,
24
+ "noImplicitOverride": true,
25
+ "noFallthroughCasesInSwitch": true,
26
+ "noPropertyAccessFromIndexSignature": true,
27
+ "noUncheckedSideEffectImports": true,
28
+ "forceConsistentCasingInFileNames": true,
29
+ "rewriteRelativeImportExtensions": true,
30
+ "allowImportingTsExtensions": true
31
+ }
32
+ }