npm - @botbotgo/better-call - Version diff - 0.1.12 → 0.1.14 - Mend

@botbotgo/better-call 0.1.12 → 0.1.14

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (4)

package/README.md +21 -17
package/benchmarks/bfcl-real-remote-completed-summary.json +119 -1
package/docs/banner.svg +2 -5
package/package.json +1 -1

package/README.md CHANGED Viewed

@@ -4,7 +4,7 @@
 # BetterCall
-**One-line wrapper. Seven full BFCL remote runs completed. Best: 73.4% → 83.8%.**
+**One-line wrapper. Eight full BFCL remote runs completed. Best: 73.4% → 83.8%.**
 ```ts
 const tools = betterTools([searchTool, calculatorTool]);
@@ -142,6 +142,9 @@ qwen3.5:2b
 lfm2.5-thinking:latest
   Raw         50.8% | ####################....................
   BetterCall  54.8% | ######################..................
+qwen3.5:4b
+  Raw         43.6% | #################.......................
+  BetterCall  43.4% | #################.......................
 gemma4:e2b
   Raw         24.3% | ##########..............................
   BetterCall  24.7% | ##########..............................
@@ -155,26 +158,27 @@ gemma4:e2b
 | 4 | `qwen3.5:0.8b` | 3,625 | 54.6% | 56.9% | +2.3pp | 901 |
 | 5 | `qwen3.5:2b` | 3,625 | 53.9% | 54.9% | +1.0pp | 1,308 |
 | 6 | `lfm2.5-thinking:latest` | 3,625 | 50.8% | 54.8% | +4.0pp | 1,142 |
-| 7 | `gemma4:e2b` | 3,625 | 24.3% | 24.7% | +0.4pp | 2,641 |
+| 7 | `qwen3.5:4b` | 3,625 | 43.6% | 43.4% | -0.2pp | 1,847 |
+| 8 | `gemma4:e2b` | 3,625 | 24.3% | 24.7% | +0.4pp | 2,641 |
-Latest completed model category detail: `qwen2.5:7b-instruct`.
+Latest completed model category detail: `qwen3.5:4b`.
 | Category | Cases | Raw | BetterCall repair | Lift | Request errors |
 | --- | ---: | ---: | ---: | ---: | ---: |
-| `simple_python` | 400 | 91.3% | 93.0% | +1.8pp | 9 |
-| `simple_java` | 100 | 55.0% | 59.0% | +4.0pp | 3 |
-| `simple_javascript` | 50 | 60.0% | 64.0% | +4.0pp | 0 |
-| `multiple` | 200 | 89.5% | 91.0% | +1.5pp | 3 |
-| `parallel` | 200 | 81.0% | 84.5% | +3.5pp | 2 |
-| `parallel_multiple` | 200 | 77.0% | 78.0% | +1.0pp | 6 |
-| `irrelevance` | 240 | 64.6% | 89.6% | +25.0pp | 0 |
-| `live_simple` | 258 | 69.0% | 72.1% | +3.1pp | 1 |
-| `live_multiple` | 1,053 | 69.0% | 73.4% | +4.4pp | 18 |
-| `live_parallel` | 16 | 25.0% | 37.5% | +12.5pp | 2 |
-| `live_parallel_multiple` | 24 | 50.0% | 50.0% | +0.0pp | 2 |
-| `live_irrelevance` | 884 | 67.6% | 75.9% | +8.3pp | 34 |
-The strongest `qwen2.5:7b-instruct` category lift was `irrelevance`, improving from 64.6% to 89.6%. The largest absolute live gain was `live_irrelevance`, improving by 73 cases.
+| `simple_python` | 400 | 81.3% | 81.3% | +0.0pp | 54 |
+| `simple_java` | 100 | 56.0% | 56.0% | +0.0pp | 32 |
+| `simple_javascript` | 50 | 48.0% | 48.0% | +0.0pp | 18 |
+| `multiple` | 200 | 83.5% | 83.5% | +0.0pp | 20 |
+| `parallel` | 200 | 70.0% | 70.0% | +0.0pp | 45 |
+| `parallel_multiple` | 200 | 47.0% | 47.0% | +0.0pp | 96 |
+| `irrelevance` | 240 | 68.8% | 68.8% | +0.0pp | 75 |
+| `live_simple` | 258 | 66.7% | 66.3% | -0.4pp | 45 |
+| `live_multiple` | 1,053 | 41.6% | 41.0% | -0.6pp | 538 |
+| `live_parallel` | 16 | 0.0% | 0.0% | +0.0pp | 16 |
+| `live_parallel_multiple` | 24 | 0.0% | 0.0% | +0.0pp | 24 |
+| `live_irrelevance` | 884 | 0.0% | 0.0% | +0.0pp | 884 |
+This `qwen3.5:4b` run hit sustained remote request failures in the live categories; those failures are counted as incorrect by the benchmark.
 Historical targeted wrapper benchmark:

package/benchmarks/bfcl-real-remote-completed-summary.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "name": "BFCL v4 full remote Ollama runs",
   "source": "Real model calls against all supported BFCL v4 single-turn tool-call categories completed in this repository",
   "note": "Endpoint redacted. Scores count request errors/timeouts as incorrect. This is not an official BFCL leaderboard submission.",
-  "generatedAt": "2026-05-08T22:24:15.073Z",
+  "generatedAt": "2026-05-09T01:31:12.909Z",
   "results": [
     {
       "model": "granite4.1:3b",
@@ -712,6 +712,124 @@
         }
       ]
     },
+    {
+      "model": "qwen3.5:4b",
+      "total": 3625,
+      "rawCorrect": 1581,
+      "betterCorrect": 1574,
+      "errors": 1847,
+      "repaired": 0,
+      "categories": [
+        {
+          "model": "qwen3.5:4b",
+          "category": "simple_python",
+          "total": 400,
+          "rawCorrect": 325,
+          "betterCorrect": 325,
+          "errors": 54,
+          "repaired": 0
+        },
+        {
+          "model": "qwen3.5:4b",
+          "category": "simple_java",
+          "total": 100,
+          "rawCorrect": 56,
+          "betterCorrect": 56,
+          "errors": 32,
+          "repaired": 0
+        },
+        {
+          "model": "qwen3.5:4b",
+          "category": "simple_javascript",
+          "total": 50,
+          "rawCorrect": 24,
+          "betterCorrect": 24,
+          "errors": 18,
+          "repaired": 0
+        },
+        {
+          "model": "qwen3.5:4b",
+          "category": "multiple",
+          "total": 200,
+          "rawCorrect": 167,
+          "betterCorrect": 167,
+          "errors": 20,
+          "repaired": 0
+        },
+        {
+          "model": "qwen3.5:4b",
+          "category": "parallel",
+          "total": 200,
+          "rawCorrect": 140,
+          "betterCorrect": 140,
+          "errors": 45,
+          "repaired": 0
+        },
+        {
+          "model": "qwen3.5:4b",
+          "category": "parallel_multiple",
+          "total": 200,
+          "rawCorrect": 94,
+          "betterCorrect": 94,
+          "errors": 96,
+          "repaired": 0
+        },
+        {
+          "model": "qwen3.5:4b",
+          "category": "irrelevance",
+          "total": 240,
+          "rawCorrect": 165,
+          "betterCorrect": 165,
+          "errors": 75,
+          "repaired": 0
+        },
+        {
+          "model": "qwen3.5:4b",
+          "category": "live_simple",
+          "total": 258,
+          "rawCorrect": 172,
+          "betterCorrect": 171,
+          "errors": 45,
+          "repaired": 0
+        },
+        {
+          "model": "qwen3.5:4b",
+          "category": "live_multiple",
+          "total": 1053,
+          "rawCorrect": 438,
+          "betterCorrect": 432,
+          "errors": 538,
+          "repaired": 0
+        },
+        {
+          "model": "qwen3.5:4b",
+          "category": "live_parallel",
+          "total": 16,
+          "rawCorrect": 0,
+          "betterCorrect": 0,
+          "errors": 16,
+          "repaired": 0
+        },
+        {
+          "model": "qwen3.5:4b",
+          "category": "live_parallel_multiple",
+          "total": 24,
+          "rawCorrect": 0,
+          "betterCorrect": 0,
+          "errors": 24,
+          "repaired": 0
+        },
+        {
+          "model": "qwen3.5:4b",
+          "category": "live_irrelevance",
+          "total": 884,
+          "rawCorrect": 0,
+          "betterCorrect": 0,
+          "errors": 884,
+          "repaired": 0
+        }
+      ]
+    },
     {
       "model": "gemma4:e2b",
       "total": 3625,

package/docs/banner.svg CHANGED Viewed

@@ -28,9 +28,6 @@
         <feMergeNode in="SourceGraphic"/>
       </feMerge>
     </filter>
-    <marker id="growth-arrow" viewBox="0 0 16 16" refX="13" refY="8" markerWidth="18" markerHeight="18" orient="auto">
-      <path d="M1 1 L15 8 L1 15 Z" fill="#35ff87"/>
-    </marker>
   </defs>
   <rect width="1400" height="520" fill="url(#background)"/>
@@ -58,8 +55,8 @@
   <rect x="0" y="0" width="870" height="520" fill="url(#left-fade)"/>
   <path d="M858 326 C 928 302, 992 312, 1060 292 S 1168 276, 1268 238" fill="none" stroke="#67d4ff" stroke-width="8" stroke-linecap="round" stroke-linejoin="round" opacity="0.52"/>
-  <path d="M858 246 C 926 206, 994 214, 1060 182 S 1168 154, 1276 84" fill="none" stroke="#35ff87" stroke-width="15" stroke-linecap="round" stroke-linejoin="round" marker-end="url(#growth-arrow)" opacity="0.92"/>
-  <polygon points="1256,56 1324,36 1296,106" fill="#35ff87" opacity="0.92"/>
+  <path d="M858 246 C 926 206, 994 214, 1060 182 S 1168 154, 1288 72" fill="none" stroke="#35ff87" stroke-width="15" stroke-linecap="round" stroke-linejoin="round" opacity="0.92"/>
+  <polygon points="1270,58 1312,44 1292,90" fill="#35ff87" opacity="0.92"/>
   <g transform="translate(104 166)">
     <text x="0" y="0" font-family="Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="90" font-weight="850" fill="#f8fbff">BetterCall</text>

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@botbotgo/better-call",
-  "version": "0.1.12",
+  "version": "0.1.14",
   "description": "LLM tool-call reliability layer.",
   "type": "module",
   "license": "Apache-2.0",