npm - @syntheticlab/synbad - Versions diffs - 0.0.5 → 0.0.6 - Mend

@syntheticlab/synbad 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

package/README.md +60 -23
package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +2 -2
package/dist/evals/reasoning/multiturn-reasoning-parsing.js +2 -2
package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +2 -2
package/dist/evals/reasoning/reasoning-claude-tool-call.js +1 -2
package/dist/evals/reasoning/reasoning-parsing.d.ts +2 -2
package/dist/evals/reasoning/reasoning-parsing.js +2 -2
package/dist/evals/reasoning/response-in-reasoning.d.ts +45 -0
package/dist/evals/reasoning/response-in-reasoning.js +59 -0
package/dist/evals/tools/claude-dash.d.ts +2 -2
package/dist/evals/tools/claude-dash.js +1 -2
package/dist/evals/tools/crush-list-files.d.ts +2 -5
package/dist/evals/tools/crush-list-files.js +6 -8
package/dist/evals/tools/multi-turn-tools.d.ts +46 -0
package/dist/evals/tools/multi-turn-tools.js +100 -0
package/dist/evals/tools/no-fn-args.d.ts +22 -0
package/dist/evals/tools/no-fn-args.js +31 -0
package/dist/evals/tools/octo-list-no-optional-args.d.ts +209 -0
package/dist/evals/tools/octo-list-no-optional-args.js +73 -0
package/dist/evals/tools/parallel-tool.d.ts +2 -2
package/dist/evals/tools/parallel-tool.js +1 -2
package/dist/evals/tools/simple-tool.d.ts +2 -2
package/dist/evals/tools/simple-tool.js +3 -2
package/dist/evals/tools/tool-dash-underscore.d.ts +26 -0
package/dist/evals/tools/tool-dash-underscore.js +37 -0
package/dist/evals/tools/tool-path-corruption.d.ts +26 -0
package/dist/evals/tools/tool-path-corruption.js +41 -0
package/dist/source/asserts.d.ts +4 -1
package/dist/source/asserts.js +36 -0
package/dist/source/chat-completion.d.ts +5 -0
package/dist/source/chat-completion.js +1 -0
package/dist/source/evals.d.ts +9 -0
package/dist/source/evals.js +53 -0
package/dist/source/evals.test.d.ts +1 -0
package/dist/source/evals.test.js +12 -0
package/dist/source/exports.d.ts +2 -0
package/dist/source/exports.js +1 -0
package/dist/source/index.js +103 -43
package/evals/reasoning/multiturn-reasoning-parsing.ts +3 -3
package/evals/reasoning/reasoning-claude-tool-call.ts +2 -3
package/evals/reasoning/reasoning-parsing.ts +3 -3
package/evals/reasoning/response-in-reasoning.ts +65 -0
package/evals/tools/claude-dash.ts +2 -3
package/evals/tools/crush-list-files.ts +11 -13
package/evals/tools/multi-turn-tools.ts +104 -0
package/evals/tools/no-fn-args.ts +34 -0
package/evals/tools/octo-list-no-optional-args.ts +81 -0
package/evals/tools/parallel-tool.ts +2 -3
package/evals/tools/simple-tool.ts +4 -3
package/evals/tools/tool-dash-underscore.ts +40 -0
package/evals/tools/tool-path-corruption.ts +46 -0
package/package.json +10 -3
package/source/asserts.ts +37 -1
package/source/chat-completion.ts +6 -0
package/source/evals.test.ts +13 -0
package/source/evals.ts +56 -0
package/source/exports.ts +2 -0
package/source/index.ts +121 -44

package/evals/tools/tool-dash-underscore.ts ADDED Viewed

@@ -0,0 +1,40 @@
+import { ChatMessage } from "../../source/chat-completion.ts";
+import * as assert from "../../source/asserts.ts";
+/**
+ * Checks the assistant message for the dash/underscore tool-name eval.
+ *
+ * The message must contain exactly one function tool call whose name is the
+ * declared tool name, unchanged (`get-weather__v1`), with a `location`
+ * argument that mentions Paris, and the tool name must not show up in the
+ * plain-text `content`.
+ *
+ * @param content - Text content of the assistant message (may be empty).
+ * @param tool_calls - Tool calls returned by the model.
+ */
+export function test({ content, tool_calls }: ChatMessage) {
+  assert.isNotNullish(tool_calls);
+  assert.isNotEmptyArray(tool_calls);
+  assert.strictEqual(tool_calls.length, 1);
+  assert.strictEqual(tool_calls[0].type, "function");
+  assert.strictEqual(tool_calls[0].function.name, "get-weather__v1");
+  const args = JSON.parse(tool_calls[0].function.arguments);
+  assert.match(args.location.toLowerCase(), /paris/);
+  // Assert the tool call didn't leak into the content. The declared tool is
+  // spelled with a dash, so match both the real name and an
+  // underscore-mangled variant of it.
+  assert.doesNotMatch(content || "", /get[-_]weather/);
+}
+/**
+ * Request payload for this eval: one user question plus a single function
+ * tool whose name mixes a dash and a double underscore.
+ */
+export const json = {
+  messages: [
+    { role: "user", content: "What's the weather in Paris?" },
+  ],
+  tools: [
+    {
+      type: "function",
+      function: {
+        name: "get-weather__v1",
+        description: "Get current weather for a location",
+        parameters: {
+          type: "object",
+          properties: {
+            location: { type: "string", description: "City name" },
+          },
+          required: ["location"],
+        },
+      },
+    },
+  ],
+  tool_choice: "auto",
+};

package/evals/tools/tool-path-corruption.ts ADDED Viewed

@@ -0,0 +1,46 @@
+import { ChatMessage } from "../../source/chat-completion.ts";
+import * as assert from "../../source/asserts.ts";
+// Path the user message asks the model to read; the tool call must carry it.
+const PATH = "/development/evals/reasoning/Scratch/reasoning-claude-tool-call.ts";
+/**
+ * Checks that the model answered with exactly one `read` function call whose
+ * `filePath` argument contains PATH.
+ *
+ * @param tool_calls - Tool calls returned by the model.
+ */
+export function test({ tool_calls }: ChatMessage) {
+  assert.isNotNullish(tool_calls);
+  assert.isNotEmptyArray(tool_calls);
+  assert.strictEqual(tool_calls.length, 1);
+  const call = tool_calls[0];
+  assert.strictEqual(call.type, "function");
+  assert.strictEqual(call.function.name, "read");
+  const parsed = JSON.parse(call.function.arguments);
+  assert.stringContains(parsed.filePath, PATH);
+}
+/**
+ * Request payload for this eval: the user asks for a file to be read and a
+ * single strict `read` tool taking a required `filePath` is offered.
+ */
+export const json = {
+  messages: [
+    {
+      role: "user",
+      content: "Read and summarize the file /development/evals/reasoning/Scratch/reasoning-claude-tool-call.ts",
+    },
+  ],
+  tools: [
+    {
+      type: "function",
+      function: {
+        name: "read",
+        description: "The read tool",
+        parameters: {
+          type: "object",
+          required: ["filePath"],
+          properties: {
+            filePath: { description: "Path to file to read", type: "string" },
+          },
+        },
+        strict: true,
+      },
+    },
+  ],
+};

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@syntheticlab/synbad",
-  "version": "0.0.5",
+  "version": "0.0.6",
   "description": "LLM inference provider evals",
   "main": "dist/source/index.js",
   "bin": {
@@ -9,7 +9,7 @@
   "preferGlobal": true,
   "type": "module",
   "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1",
+    "test": "vitest",
     "build": "tsc",
     "prepublishOnly": "tsc"
   },
@@ -23,6 +23,12 @@
     "package-lock.json",
     "tsconfig.json"
   ],
+  "exports": {
+    ".": {
+      "types": "./dist/source/exports.d.ts",
+      "import": "./dist/source/exports.js"
+    }
+  },
   "dependencies": {
     "@commander-js/extra-typings": "^14.0.0",
     "commander": "^14.0.2",
@@ -32,6 +38,7 @@
   },
   "devDependencies": {
     "@types/node": "^24.10.1",
-    "tsx": "^4.20.6"
+    "tsx": "^4.20.6",
+    "vitest": "^4.0.17"
   }
 }

package/source/asserts.ts CHANGED Viewed

@@ -61,7 +61,13 @@ export function isEmptyArray(a: any[]) {
   });
 }
-export function isNotEmptyArray(a: any[]) {
+export function isNotEmptyArray(a: any[] | undefined) {
+  if(a == null) {
+    throw new assert.AssertionError({
+      message: "Expected a non-empty array",
+      actual: a,
+    });
+  }
   if(a.length !== 0) return true;
   throw new assert.AssertionError({
     message: "Expected a non-empty array",
@@ -76,3 +82,33 @@ export function startsWith(a: string, prefix: string) {
     actual: a,
   });
 }
+/**
+ * Asserts that `num` is strictly greater than `target`.
+ *
+ * @returns `true` when the comparison holds.
+ * @throws assert.AssertionError when it does not.
+ */
+export function gt(num: number, target: number) {
+  if(!(num > target)) {
+    throw new assert.AssertionError({
+      message: `Expected ${num} > ${target}`,
+      actual: num,
+    });
+  }
+  return true;
+}
+/**
+ * Asserts that `num` is greater than or equal to `target`.
+ *
+ * @returns `true` when the comparison holds.
+ * @throws assert.AssertionError when it does not.
+ */
+export function gte(num: number, target: number) {
+  if(!(num >= target)) {
+    throw new assert.AssertionError({
+      message: `Expected ${num} >= ${target}`,
+      actual: num,
+    });
+  }
+  return true;
+}
+/**
+ * Asserts that `str` is a string and that it includes `expected`.
+ *
+ * @returns `true` when `str` contains `expected`.
+ * @throws assert.AssertionError when `str` is not a string or lacks `expected`.
+ */
+export function stringContains(str: string, expected: string) {
+  // Callers pass parsed JSON values, so the runtime type is checked even
+  // though the parameter is declared as string.
+  if(typeof str === "string") {
+    if(str.includes(expected)) return true;
+    throw new assert.AssertionError({
+      message: `Expected string to contain: "${expected}"`,
+      actual: str,
+    });
+  }
+  throw new assert.AssertionError({
+    message: "Expected input to be of type string.",
+    actual: typeof str,
+  });
+}

package/source/chat-completion.ts CHANGED Viewed

@@ -10,10 +10,15 @@ export type ChatResponse = OpenAI.ChatCompletion & {
     message: {
       reasoning_content?: string,
       reasoning?: string,
+      tool_calls?: Array<{
+        index: number,
+      }>
     },
   }>
 };
+export type ChatMessage = ChatResponse["choices"][number]["message"];
 const TextContentPart =  t.subtype({
   type: t.value("text"),
   text: t.str,
@@ -56,6 +61,7 @@ const AssistantMessageSchema = t.subtype({
     name: t.str,
   })),
   reasoning_content: t.optional(t.str.or(t.nil)),
+  reasoning: t.optional(t.str.or(t.nil)),
 });
 const UserMessageSchema = t.subtype({

package/source/evals.test.ts ADDED Viewed

@@ -0,0 +1,13 @@
+import { describe, expect, it } from "vitest";
+import { getEvals } from "./evals.ts";
+// Smoke test: every eval returned by getEvals exposes a string name, a truthy
+// request payload and a callable checker.
+describe("get-evals", () => {
+  it("works", async () => {
+    const evals = await getEvals();
+    // for...of instead of .map: the loop runs only for its assertions and
+    // no mapped array is needed.
+    // NOTE(review): an empty `evals` array makes this test pass without
+    // checking anything; consider asserting a minimum count once the
+    // expected number of evals in the test environment is confirmed.
+    for(const { test, json, name } of evals) {
+      expect(name).toBeTypeOf("string");
+      expect(json).toBeTruthy();
+      expect(test).toBeTypeOf("function");
+    }
+  });
+});

package/source/evals.ts ADDED Viewed

@@ -0,0 +1,56 @@
+import fs from "fs/promises";
+import path from "path";
+import { pathToFileURL } from "url";
+import { ChatMessage } from "./chat-completion.ts";
+/** One loaded eval module, as collected by getEvals. */
+export type Eval = {
+  // The module's exported `test` function; it receives the assistant message.
+  // NOTE(review): presumably signals failure by throwing — confirm against the eval modules.
+  test: (response: ChatMessage) => any;
+  // The module's exported `json` value (the request payload for the eval).
+  json: any;
+  // Display name built by evalName from the module's file path.
+  name: string;
+};
+/**
+ * Loads every eval module found under the `evals` directory that sits next
+ * to this module's parent directory, reasoning evals included.
+ *
+ * @returns One entry per module with its exported `test` and `json` plus a
+ *   display name derived from the file path.
+ */
+export async function getEvals(): Promise<Eval[]> {
+  const evals: Eval[] = [];
+  const evalsPath = path.join(import.meta.dirname, "..", "evals");
+  for await (const testFile of findTestFiles(evalsPath, false)) {
+    // Import through a file:// URL: a raw absolute path such as "C:\..." is
+    // parsed as a URL with an unsupported scheme by the ESM loader.
+    const { test, json } = await import(pathToFileURL(testFile).href);
+    evals.push({ test, json, name: evalName(testFile) });
+  }
+  return evals;
+}
+/**
+ * Builds an eval's display name as `<parent directory>/<file name>`, with a
+ * trailing `.js` extension removed from the file name.
+ *
+ * @param file - Path of the eval module.
+ */
+export function evalName(file: string) {
+  // The dot is escaped so only a literal ".js" suffix is removed; an
+  // unescaped dot would also eat the last character of names like "nodejs".
+  return `${path.basename(path.dirname(file))}/${path.basename(file).replace(/\.js$/, "")}`
+}
+/**
+ * Recursively yields the paths of `.js` files under `dir`.
+ *
+ * `dir` may be a directory, a path to a `.js` file, or a path with the `.js`
+ * extension left off (`<dir>.js` is tried when `dir` itself does not exist).
+ *
+ * @param dir - Directory or eval file to search.
+ * @param skipReasoning - When true, sub-directories named `reasoning` are not
+ *   descended into.
+ * @throws The original stat error when neither `dir` nor a regular file
+ *   `<dir>.js` exists.
+ */
+export async function* findTestFiles(dir: string, skipReasoning: boolean): AsyncGenerator<string> {
+  try {
+    await fs.stat(dir);
+  } catch(e) {
+    // `dir` does not exist as given: try it as an eval name without extension.
+    const pathname = `${dir}.js`;
+    const stat = await fs.stat(pathname);
+    if(stat.isFile()) {
+      yield pathname;
+      return;
+    }
+    throw e;
+  }
+  // A path that already names a .js file is yielded as-is instead of being
+  // handed to readdir, which would fail with ENOTDIR.
+  if(dir.endsWith(".js") && (await fs.stat(dir)).isFile()) {
+    yield dir;
+    return;
+  }
+  const entryNames = await fs.readdir(dir);
+  // Stat all entries in parallel; stat follows symlinks.
+  const entries = await Promise.all(entryNames.map(async (entry) => {
+    return {
+      path: path.join(dir, entry),
+      stat: await fs.stat(path.join(dir, entry)),
+    };
+  }));
+  for(const entry of entries) {
+    if(entry.stat.isFile() && entry.path.endsWith(".js")) {
+      yield entry.path;
+    }
+    if(entry.stat.isDirectory()) {
+      if(skipReasoning && path.basename(entry.path) === "reasoning") continue;
+      yield* findTestFiles(entry.path, skipReasoning);
+    }
+  }
+}

package/source/exports.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ export { getEvals, type Eval as SynbadEval } from "./evals.ts";
2	+ export { type ChatMessage as SynbadChatMessage } from "./chat-completion.ts"; // type-only: ChatMessage is a type alias with no runtime binding

package/source/index.ts CHANGED Viewed

@@ -1,10 +1,11 @@
 #!/usr/bin/env node
 import * as http from "http";
 import * as https from "https";
-import fs from "fs/promises";
 import path from "path";
 import { Command } from "@commander-js/extra-typings";
 import OpenAI from "openai";
+import { ChatMessage, getReasoning } from "./chat-completion.ts";
+import { findTestFiles, evalName } from "./evals.ts";
 const cli = new Command()
 .name("synbad")
@@ -21,14 +22,20 @@ cli.command("eval")
 .option(
   "--skip-reasoning", "Skip reasoning evals (set this for non-reasoning models)"
 )
+.option(
+  "--reasoning-effort <level>", "Set the reasoning effort to high, medium, or low"
+)
 .option(
   "--only <eval path within synbad>", "Specific evals you want to run, e.g. evals/reasoning or evals/tools/claude-dash"
 )
 .option(
   "--count <num times>", "Number of times to run the eval. Any failures count as an overall failure",
 )
+.option(
+  "--stream", "Test streaming API calls",
+)
 .requiredOption("--model <model name>", "The model name to test")
-.action(async ({ model, envVar, baseUrl, only, count }) => {
+.action(async ({ model, envVar, baseUrl, only, count, skipReasoning, reasoningEffort, stream }) => {
   if(!process.env[envVar]) {
     console.error(`No env var named ${envVar} exists for the current process`);
     process.exit(1);
@@ -41,28 +48,126 @@ cli.command("eval")
   const failures = new Set<string>();
   const evalPath = only ? path.join(
     import.meta.dirname, "..", only
-  ) : path.join(import.meta.dirname, "../evals");
+  ) : path.join(import.meta.dirname, "..", "evals");
   const maxRuns = count == null ? 1 : parseInt(count, 10);
-  for await(const testFile of findTestFiles(evalPath)) {
+  for await(const testFile of findTestFiles(evalPath, !!skipReasoning)) {
     found++;
     const test = await import(testFile);
     const json = test.json;
     const name = evalName(testFile);
     process.stdout.write(`Running ${name}...`);
+    // Sends the eval's request once and returns the assistant message.
+    // Without --stream the first choice's message is returned as-is; with
+    // --stream the message is rebuilt from the streamed deltas (content,
+    // tool calls and reasoning).
+    async function respond(): Promise<ChatMessage> {
+      // Only add reasoning_effort to the request when the option was given.
+      const reasoning = reasoningEffort == null ? {} : {
+        reasoning_effort: reasoningEffort,
+      };
+      if(!stream) {
+        const response = await client.chat.completions.create({
+          ...json,
+          ...reasoning,
+          stream: false,
+          model,
+        });
+        return response.choices[0].message as ChatMessage;
+      }
+      // Streaming path: accumulate the deltas into a single message.
+      const msg: Partial<ChatMessage> = {};
+      // The cast widens the chunk type with the optional `reasoning` /
+      // `reasoning_content` delta fields read below.
+      const chunkStream = await (client.chat.completions.create({
+        ...json,
+        ...reasoning,
+        model,
+        stream: true,
+      }) as unknown as Promise<AsyncIterable<OpenAI.ChatCompletionChunk & {
+        choices: Array<{
+          delta: {
+            reasoning?: string,
+            reasoning_content?: string,
+          },
+        }>
+      }>>);
+      // Index of the tool call seen in the previous delta, and the tool call
+      // currently being assembled from fragments.
+      let lastIndex: number | null = null;
+      let toolBuffer: {
+        id?: string,
+        type: "function",
+        index: number,
+        function: {
+          name?: string,
+          arguments?: string,
+        },
+      } | null = null;
+      for await(const chunk of chunkStream) {
+        // Skip chunks that carry no choices; only the first choice is read.
+        if(!chunk.choices) continue;
+        const choice = chunk.choices[0];
+        if(!choice) continue;
+        const content = choice.delta.content;
+        const tools = choice.delta.tool_calls;
+        // NOTE(review): getReasoning is defined in chat-completion.ts;
+        // presumably it returns whichever reasoning field the delta has — confirm.
+        const reasoning = getReasoning(choice.delta);
+        if(content) {
+          if(!msg.content) msg.content = "";
+          msg.content += content;
+        }
+        if(tools) {
+          for(const toolDelta of tools) {
+            if(lastIndex == null) lastIndex = toolDelta.index;
+            // A different index means the previous tool call is finished:
+            // store it and start a fresh buffer for the new index.
+            if(lastIndex !== toolDelta.index && toolBuffer != null) {
+              msg.tool_calls ||= [];
+              // @ts-ignore
+              msg.tool_calls.push(toolBuffer);
+              toolBuffer = {
+                index: toolDelta.index,
+                type: "function",
+                function: {},
+              };
+            }
+            // First tool delta of the stream: create the initial buffer.
+            if(!toolBuffer) {
+              toolBuffer = {
+                index: toolDelta.index,
+                type: "function",
+                function: {}
+              };
+            }
+            lastIndex = toolDelta.index;
+            if(toolDelta.id) toolBuffer.id = toolDelta.id;
+            // Name and arguments arrive as fragments and are concatenated.
+            if(toolDelta.function) {
+              if(toolDelta.function.name) {
+                toolBuffer.function.name ||= "";
+                toolBuffer.function.name += toolDelta.function.name;
+              }
+              if(toolDelta.function.arguments) {
+                toolBuffer.function.arguments ||= "";
+                toolBuffer.function.arguments += toolDelta.function.arguments;
+              }
+            }
+          }
+        }
+        // Reasoning text is always accumulated under reasoning_content.
+        if(reasoning) {
+          if(!msg.reasoning_content) msg.reasoning_content = "";
+          msg.reasoning_content += reasoning;
+        }
+      }
+      // Store the tool call that was still being assembled when the stream ended.
+      if(toolBuffer) {
+        msg.tool_calls ||= [];
+        // @ts-ignore
+        msg.tool_calls.push(toolBuffer);
+      }
+      // Only the fields seen in the stream are set on the returned message.
+      return msg as ChatMessage;
+    }
     try {
       for(let i = 0; i < maxRuns; i++) {
         if(maxRuns > 1) {
           process.stdout.write(` ${i + 1}/${maxRuns}`);
         }
-        const response = await client.chat.completions.create({
-          model,
-          ...json,
-        });
+        const response = await respond();
         try {
           test.test(response);
         } catch(e) {
           console.error("Response:");
-          console.error(JSON.stringify(response.choices[0], null, 2));
+          console.error(JSON.stringify(response, null, 2));
           throw e;
         }
       }
@@ -90,6 +195,7 @@ ${passed}/${found} evals passed. Failures:
 cli.command("proxy")
 .requiredOption("-p, --port <number>", "Port to listen on")
 .requiredOption("-t, --target <url>", "Target URL to proxy to")
+.option("--pretty", "Pretty-print the JSON")
 .action(async (options) => {
   const port = parseInt(options.port, 10);
   const targetUrl = new URL(options.target);
@@ -122,6 +228,8 @@ cli.command("proxy")
       // Choose the right module based on target protocol
       const httpModule = targetUrl.protocol === "https:" ? https : http;
+      const buffer: string[] = [];
       // Create proxy request
       const proxyReq = httpModule.request(
         {
@@ -177,12 +285,14 @@ cli.command("proxy")
       });
       req.on("data", (chunk) => {
-        process.stdout.write(chunk);
+        buffer.push(chunk);
+        if(!options.pretty) process.stdout.write(chunk);
         proxyReq.write(chunk);
       });
       req.on("end", () => {
-        process.stdout.write("\n");
+        if(options.pretty) {
+          // Join with an explicit empty separator: the default join()
+          // separator is "," and would insert commas between body chunks.
+          const body = buffer.join("");
+          try {
+            console.log(JSON.stringify(JSON.parse(body), null, 2));
+          } catch {
+            // Empty or non-JSON body: print it unformatted rather than let
+            // the parse error escape the event handler and stop the proxy.
+            console.log(body);
+          }
+        }
+        else process.stdout.write("\n");
         console.log(`[${timestamp}] ✅ Request complete`);
         proxyReq.end();
       });
@@ -208,39 +318,6 @@ cli.command("proxy")
   });
 });
-function evalName(file: string) {
-  return `${path.basename(path.dirname(file))}/${path.basename(file).replace(/.js$/, "")}`
-}
-async function* findTestFiles(dir: string): AsyncGenerator<string> {
-  try {
-    await fs.stat(dir);
-  } catch(e) {
-    const pathname = `${dir}.js`;
-    const stat = await fs.stat(pathname);
-    if(stat.isFile()) {
-      yield pathname;
-      return;
-    }
-    throw e;
-  }
-  const entryNames = await fs.readdir(dir);
-  const entries = await Promise.all(entryNames.map(async (entry) => {
-    return {
-      path: path.join(dir, entry),
-      stat: await fs.stat(path.join(dir, entry)),
-    };
-  }));
-  for(const entry of entries) {
-    if(entry.stat.isFile() && entry.path.endsWith(".js")) {
-      yield entry.path;
-    }
-    if(entry.stat.isDirectory()) {
-      yield* findTestFiles(entry.path);
-    }
-  }
-}
 function stderrLog(item: string, ...items: string[]) {
   let formatted = item;
   if(items.length > 0) {