npm - @syntheticlab/synbad - Versions diffs - 0.0.5 → 0.0.7 - Mend

@syntheticlab/synbad 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

package/README.md +60 -23
package/dist/evals/reasoning/custom-harness.d.ts +679 -0
package/dist/evals/reasoning/custom-harness.js +847 -0
package/dist/evals/reasoning/multiturn-reasoning-parsing.d.ts +2 -2
package/dist/evals/reasoning/multiturn-reasoning-parsing.js +2 -2
package/dist/evals/reasoning/reasoning-claude-tool-call.d.ts +2 -2
package/dist/evals/reasoning/reasoning-claude-tool-call.js +1 -2
package/dist/evals/reasoning/reasoning-parsing.d.ts +2 -2
package/dist/evals/reasoning/reasoning-parsing.js +2 -2
package/dist/evals/reasoning/response-in-reasoning.d.ts +45 -0
package/dist/evals/reasoning/response-in-reasoning.js +59 -0
package/dist/evals/tools/claude-dash.d.ts +2 -2
package/dist/evals/tools/claude-dash.js +1 -2
package/dist/evals/tools/crush-list-files.d.ts +2 -5
package/dist/evals/tools/crush-list-files.js +6 -8
package/dist/evals/tools/multi-turn-tools.d.ts +46 -0
package/dist/evals/tools/multi-turn-tools.js +100 -0
package/dist/evals/tools/no-fn-args.d.ts +22 -0
package/dist/evals/tools/no-fn-args.js +31 -0
package/dist/evals/tools/octo-list-no-optional-args.d.ts +209 -0
package/dist/evals/tools/octo-list-no-optional-args.js +73 -0
package/dist/evals/tools/parallel-tool.d.ts +2 -2
package/dist/evals/tools/parallel-tool.js +1 -2
package/dist/evals/tools/simple-tool.d.ts +2 -2
package/dist/evals/tools/simple-tool.js +3 -2
package/dist/evals/tools/tool-dash-underscore.d.ts +26 -0
package/dist/evals/tools/tool-dash-underscore.js +37 -0
package/dist/evals/tools/tool-path-corruption.d.ts +26 -0
package/dist/evals/tools/tool-path-corruption.js +41 -0
package/dist/source/asserts.d.ts +4 -1
package/dist/source/asserts.js +36 -0
package/dist/source/chat-completion.d.ts +5 -0
package/dist/source/chat-completion.js +1 -0
package/dist/source/evals.d.ts +9 -0
package/dist/source/evals.js +53 -0
package/dist/source/evals.test.d.ts +1 -0
package/dist/source/evals.test.js +12 -0
package/dist/source/exports.d.ts +2 -0
package/dist/source/exports.js +1 -0
package/dist/source/index.js +103 -43
package/evals/reasoning/multiturn-reasoning-parsing.ts +3 -3
package/evals/reasoning/reasoning-claude-tool-call.ts +2 -3
package/evals/reasoning/reasoning-parsing.ts +3 -3
package/evals/reasoning/response-in-reasoning.ts +65 -0
package/evals/tools/claude-dash.ts +2 -3
package/evals/tools/crush-list-files.ts +11 -13
package/evals/tools/multi-turn-tools.ts +104 -0
package/evals/tools/no-fn-args.ts +34 -0
package/evals/tools/octo-list-no-optional-args.ts +81 -0
package/evals/tools/parallel-tool.ts +2 -3
package/evals/tools/simple-tool.ts +4 -3
package/evals/tools/tool-dash-underscore.ts +40 -0
package/evals/tools/tool-path-corruption.ts +46 -0
package/package.json +10 -3
package/source/asserts.ts +37 -1
package/source/chat-completion.ts +6 -0
package/source/evals.test.ts +13 -0
package/source/evals.ts +56 -0
package/source/exports.ts +2 -0
package/source/index.ts +121 -44

package/README.md CHANGED Viewed

@@ -8,32 +8,48 @@ inference quality as high as possible.
 If you find bugs in Synthetic's model hosting, please contribute the bugs here!
 We will fix them.
+## Install
+Synbad is distributed through npm. Install it with:
+```bash
+npm install -g @syntheticlab/synbad
+```
 ## Results
-We keep a running tally of provider+model results for GLM-4.6, Kimi K2
-Thinking, and MiniMax M2. Feel free to add more provider results!
+We keep a running tally of provider+model results for tool calling and
+reasoning parsing for GLM-4.7, Kimi K2 Thinking, and MiniMax M2. Feel free to
+add more provider results!
 |Provider |Model           |Success Rate|
 |---------|----------------|------------|
-|Synthetic|GLM-4.6         |:white_check_mark: 100%|
-|Synthetic|Kimi K2 Thinking|:white_check_mark: 100%|
-|Synthetic|MiniMax M2      |:white_check_mark: 100%|
+|Synthetic.new|GLM-4.7         |:white_check_mark: 100%|
+|Synthetic.new|Kimi K2 Thinking|:white_check_mark: 100%|
+|Synthetic.new|MiniMax M2      |:white_check_mark: 100%|
 |Provider |Model           |Success Rate|
 |---------|----------------|------------|
-|Fireworks|GLM-4.6         |:white_check_mark: 100%|
-|Fireworks|Kimi K2 Thinking|:x: 86%|
-|Fireworks|MiniMax M2      |:x: 29%|
+|Fireworks|GLM-4.7         |:x: 83%|
+|Fireworks|Kimi K2 Thinking|:x: 92%|
+|Fireworks|MiniMax M2      |:white_check_mark: 100%|
 |Provider |Model           |Success Rate|
 |---------|----------------|------------|
-|Together |GLM-4.6         |:white_check_mark: 100%|
-|Together |Kimi K2 Thinking|:x: 71%|
+|Together |Kimi K2 Thinking|:x: 66%|
 |Provider |Model           |Success Rate|
 |---------|----------------|------------|
-|Parasail |GLM-4.6         |:x: 71%|
-|Parasail |Kimi K2 Thinking|:x: 57%|
+|Parasail |GLM-4.7         |:x: 83%|
+|Parasail |Kimi K2 Thinking|:x: 75%|
+Note for attempting reproductions: generally, all tests are reproducible with
+`--count 1` and `--count 1 --stream`, but the response-in-reasoning eval
+generally needs a high count to reproduce the bug: `--count 40` and
+`--count 40 --stream` are typically sufficient.
+All evals must pass both with and without Synbad's `--stream` parameter (which
+tests streaming APIs) to be considered a pass.
 ## How do I contribute inference bugs?
@@ -71,26 +87,47 @@ TypeScript. You need to export two things from an eval:
 1. The JSON that reproduces the problem, as the const `json`. It doesn't have to
    reproduce it 100% of the time; if the bug appears even 5% of the time,
    that's fine.
-2. A `test` function that runs some asserts on the output of the response,
+2. A `test` function that runs some asserts on the returned assistant message,
    which detect the error.
-For example, we can test reasoning parsing very simply (as we do in the
-`evals/reasoning/reasoning-parsing.ts` file):
+For example, we can test parallel tool call support very simply (as we do in the
+`evals/tools/parallel-tool.ts` file):
 ```typescript
 import * as assert from "../../source/asserts.ts";
-import { ChatResponse, getReasoning } from "../../source/chat-completion.ts";
+import { ChatMessage } from "../../source/chat-completion.ts";
-export function test(response: ChatResponse) {
-  const reasoning = getReasoning(response.choices[0].message);
-  assert.isNotNullish(reasoning);
+export function test({ tool_calls }: ChatMessage) {
+  assert.isNotNullish(tool_calls);
+  assert.isNotEmptyArray(tool_calls);
+  assert.strictEqual(tool_calls.length, 2);
 }
-// Insert your JSON. You can paste your results from the Synbad proxy here.
 export const json = {
-  messages: [
-    { role: "user", content: "Why does 1+1=2?" }
+  "messages": [
+    {"role": "user", "content": "What's the weather in Paris and London?"}
+  ],
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_weather",
+        "description": "Get current weather for a location",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "location": {
+              "type": "string",
+              "description": "City name"
+            }
+          },
+          "required": ["location"]
+        }
+      }
+    }
   ],
+  "parallel_tool_calls": true,
+  "tool_choice": "auto",
 }
 ```
@@ -129,7 +166,7 @@ import { getReasoning } from "../../source/chat-completion.ts";
 // In your test:
-const reasoning = getReasoning(response.choices[0].message);
+const reasoning = getReasoning(message);
 ```
 This ensures your test will use the correct reasoning content data regardless