@botbotgo/better-call 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -16
- package/benchmarks/bfcl-real-remote-completed-summary.json +119 -1
- package/docs/banner.svg +2 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
# BetterCall
|
|
6
6
|
|
|
7
|
-
**One-line wrapper.
|
|
7
|
+
**One-line wrapper. Four full BFCL remote runs completed. Best: 55.5% → 63.6%.**
|
|
8
8
|
|
|
9
9
|
```ts
|
|
10
10
|
const tools = betterTools([searchTool, calculatorTool]);
|
|
@@ -125,26 +125,27 @@ Latest completed remote run artifact: `benchmarks/bfcl-real-remote-completed-sum
|
|
|
125
125
|
| --- | ---: | ---: | ---: | ---: | ---: |
|
|
126
126
|
| `qwen3:0.6b` | 3,625 | 55.5% | 63.6% | +8.2pp | 217 |
|
|
127
127
|
| `qwen3.5:0.8b` | 3,625 | 54.6% | 56.9% | +2.3pp | 901 |
|
|
128
|
+
| `qwen3.5:2b` | 3,625 | 53.9% | 54.9% | +1.0pp | 1,308 |
|
|
128
129
|
| `lfm2.5-thinking:latest` | 3,625 | 50.8% | 54.8% | +4.0pp | 1,142 |
|
|
129
130
|
|
|
130
|
-
Latest completed model category detail: `
|
|
131
|
+
Latest completed model category detail: `qwen3.5:2b`.
|
|
131
132
|
|
|
132
133
|
| Category | Cases | Raw | BetterCall repair | Lift | Request errors |
|
|
133
134
|
| --- | ---: | ---: | ---: | ---: | ---: |
|
|
134
|
-
| `simple_python` | 400 |
|
|
135
|
-
| `simple_java` | 100 |
|
|
136
|
-
| `simple_javascript` | 50 |
|
|
137
|
-
| `multiple` | 200 |
|
|
138
|
-
| `parallel` | 200 |
|
|
139
|
-
| `parallel_multiple` | 200 |
|
|
140
|
-
| `irrelevance` | 240 |
|
|
141
|
-
| `live_simple` | 258 |
|
|
142
|
-
| `live_multiple` | 1,053 |
|
|
143
|
-
| `live_parallel` | 16 |
|
|
144
|
-
| `live_parallel_multiple` | 24 |
|
|
145
|
-
| `live_irrelevance` | 884 |
|
|
146
|
-
|
|
147
|
-
The strongest `
|
|
135
|
+
| `simple_python` | 400 | 29.5% | 32.5% | +3.0pp | 256 |
|
|
136
|
+
| `simple_java` | 100 | 14.0% | 17.0% | +3.0pp | 81 |
|
|
137
|
+
| `simple_javascript` | 50 | 2.0% | 2.0% | +0.0pp | 48 |
|
|
138
|
+
| `multiple` | 200 | 23.0% | 24.0% | +1.0pp | 151 |
|
|
139
|
+
| `parallel` | 200 | 18.0% | 18.0% | +0.0pp | 157 |
|
|
140
|
+
| `parallel_multiple` | 200 | 17.5% | 17.5% | +0.0pp | 159 |
|
|
141
|
+
| `irrelevance` | 240 | 69.6% | 72.5% | +2.9pp | 66 |
|
|
142
|
+
| `live_simple` | 258 | 60.1% | 62.4% | +2.3pp | 41 |
|
|
143
|
+
| `live_multiple` | 1,053 | 68.4% | 68.5% | +0.1pp | 110 |
|
|
144
|
+
| `live_parallel` | 16 | 62.5% | 62.5% | +0.0pp | 2 |
|
|
145
|
+
| `live_parallel_multiple` | 24 | 50.0% | 45.8% | -4.2pp | 6 |
|
|
146
|
+
| `live_irrelevance` | 884 | 72.4% | 73.1% | +0.7pp | 231 |
|
|
147
|
+
|
|
148
|
+
The strongest `qwen3.5:2b` categories were `simple_python` and `simple_java`, both improving by +3.0pp after BetterCall repair. `live_parallel_multiple` regressed by 4.2pp (50.0% → 45.8%) and is tracked in the full artifact.
|
|
148
149
|
|
|
149
150
|
Historical targeted wrapper benchmark:
|
|
150
151
|
|
|
package/benchmarks/bfcl-real-remote-completed-summary.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "BFCL v4 full remote Ollama runs",
|
|
3
3
|
"source": "Real model calls against all supported BFCL v4 single-turn tool-call categories completed in this repository",
|
|
4
4
|
"note": "Endpoint redacted. Scores count request errors/timeouts as incorrect. This is not an official BFCL leaderboard submission.",
|
|
5
|
-
"generatedAt": "2026-05-
|
|
5
|
+
"generatedAt": "2026-05-08T08:12:17.258Z",
|
|
6
6
|
"results": [
|
|
7
7
|
{
|
|
8
8
|
"model": "qwen3:0.6b",
|
|
@@ -240,6 +240,124 @@
|
|
|
240
240
|
}
|
|
241
241
|
]
|
|
242
242
|
},
|
|
243
|
+
{
|
|
244
|
+
"model": "qwen3.5:2b",
|
|
245
|
+
"total": 3625,
|
|
246
|
+
"rawCorrect": 1954,
|
|
247
|
+
"betterCorrect": 1990,
|
|
248
|
+
"errors": 1308,
|
|
249
|
+
"repaired": 55,
|
|
250
|
+
"categories": [
|
|
251
|
+
{
|
|
252
|
+
"model": "qwen3.5:2b",
|
|
253
|
+
"category": "simple_python",
|
|
254
|
+
"total": 400,
|
|
255
|
+
"rawCorrect": 118,
|
|
256
|
+
"betterCorrect": 130,
|
|
257
|
+
"errors": 256,
|
|
258
|
+
"repaired": 15
|
|
259
|
+
},
|
|
260
|
+
{
|
|
261
|
+
"model": "qwen3.5:2b",
|
|
262
|
+
"category": "simple_java",
|
|
263
|
+
"total": 100,
|
|
264
|
+
"rawCorrect": 14,
|
|
265
|
+
"betterCorrect": 17,
|
|
266
|
+
"errors": 81,
|
|
267
|
+
"repaired": 6
|
|
268
|
+
},
|
|
269
|
+
{
|
|
270
|
+
"model": "qwen3.5:2b",
|
|
271
|
+
"category": "simple_javascript",
|
|
272
|
+
"total": 50,
|
|
273
|
+
"rawCorrect": 1,
|
|
274
|
+
"betterCorrect": 1,
|
|
275
|
+
"errors": 48,
|
|
276
|
+
"repaired": 0
|
|
277
|
+
},
|
|
278
|
+
{
|
|
279
|
+
"model": "qwen3.5:2b",
|
|
280
|
+
"category": "multiple",
|
|
281
|
+
"total": 200,
|
|
282
|
+
"rawCorrect": 46,
|
|
283
|
+
"betterCorrect": 48,
|
|
284
|
+
"errors": 151,
|
|
285
|
+
"repaired": 2
|
|
286
|
+
},
|
|
287
|
+
{
|
|
288
|
+
"model": "qwen3.5:2b",
|
|
289
|
+
"category": "parallel",
|
|
290
|
+
"total": 200,
|
|
291
|
+
"rawCorrect": 36,
|
|
292
|
+
"betterCorrect": 36,
|
|
293
|
+
"errors": 157,
|
|
294
|
+
"repaired": 1
|
|
295
|
+
},
|
|
296
|
+
{
|
|
297
|
+
"model": "qwen3.5:2b",
|
|
298
|
+
"category": "parallel_multiple",
|
|
299
|
+
"total": 200,
|
|
300
|
+
"rawCorrect": 35,
|
|
301
|
+
"betterCorrect": 35,
|
|
302
|
+
"errors": 159,
|
|
303
|
+
"repaired": 0
|
|
304
|
+
},
|
|
305
|
+
{
|
|
306
|
+
"model": "qwen3.5:2b",
|
|
307
|
+
"category": "irrelevance",
|
|
308
|
+
"total": 240,
|
|
309
|
+
"rawCorrect": 167,
|
|
310
|
+
"betterCorrect": 174,
|
|
311
|
+
"errors": 66,
|
|
312
|
+
"repaired": 7
|
|
313
|
+
},
|
|
314
|
+
{
|
|
315
|
+
"model": "qwen3.5:2b",
|
|
316
|
+
"category": "live_simple",
|
|
317
|
+
"total": 258,
|
|
318
|
+
"rawCorrect": 155,
|
|
319
|
+
"betterCorrect": 161,
|
|
320
|
+
"errors": 41,
|
|
321
|
+
"repaired": 8
|
|
322
|
+
},
|
|
323
|
+
{
|
|
324
|
+
"model": "qwen3.5:2b",
|
|
325
|
+
"category": "live_multiple",
|
|
326
|
+
"total": 1053,
|
|
327
|
+
"rawCorrect": 720,
|
|
328
|
+
"betterCorrect": 721,
|
|
329
|
+
"errors": 110,
|
|
330
|
+
"repaired": 10
|
|
331
|
+
},
|
|
332
|
+
{
|
|
333
|
+
"model": "qwen3.5:2b",
|
|
334
|
+
"category": "live_parallel",
|
|
335
|
+
"total": 16,
|
|
336
|
+
"rawCorrect": 10,
|
|
337
|
+
"betterCorrect": 10,
|
|
338
|
+
"errors": 2,
|
|
339
|
+
"repaired": 0
|
|
340
|
+
},
|
|
341
|
+
{
|
|
342
|
+
"model": "qwen3.5:2b",
|
|
343
|
+
"category": "live_parallel_multiple",
|
|
344
|
+
"total": 24,
|
|
345
|
+
"rawCorrect": 12,
|
|
346
|
+
"betterCorrect": 11,
|
|
347
|
+
"errors": 6,
|
|
348
|
+
"repaired": 0
|
|
349
|
+
},
|
|
350
|
+
{
|
|
351
|
+
"model": "qwen3.5:2b",
|
|
352
|
+
"category": "live_irrelevance",
|
|
353
|
+
"total": 884,
|
|
354
|
+
"rawCorrect": 640,
|
|
355
|
+
"betterCorrect": 646,
|
|
356
|
+
"errors": 231,
|
|
357
|
+
"repaired": 6
|
|
358
|
+
}
|
|
359
|
+
]
|
|
360
|
+
},
|
|
243
361
|
{
|
|
244
362
|
"model": "lfm2.5-thinking:latest",
|
|
245
363
|
"total": 3625,
|
package/docs/banner.svg
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<svg xmlns="http://www.w3.org/2000/svg" width="1400" height="520" viewBox="0 0 1400 520" role="img" aria-labelledby="title desc">
|
|
2
2
|
<title id="title">BetterCall banner</title>
|
|
3
|
-
<desc id="desc">BetterCall is a one-line wrapper with
|
|
3
|
+
<desc id="desc">BetterCall is a one-line wrapper with four full BFCL remote runs completed; the best run improves from 55.5 percent to 63.6 percent.</desc>
|
|
4
4
|
<defs>
|
|
5
5
|
<linearGradient id="background" x1="0" y1="0" x2="1" y2="1">
|
|
6
6
|
<stop offset="0" stop-color="#07111f"/>
|
|
@@ -24,6 +24,6 @@
|
|
|
24
24
|
<g transform="translate(112 112)">
|
|
25
25
|
<text x="0" y="80" font-family="Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="78" font-weight="800" fill="#f8fbff">BetterCall</text>
|
|
26
26
|
<rect x="4" y="116" width="430" height="8" rx="4" fill="url(#accent)"/>
|
|
27
|
-
<text x="0" y="186" font-family="Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="34" font-weight="670" fill="#dcecff">
|
|
27
|
+
<text x="0" y="186" font-family="Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="34" font-weight="670" fill="#dcecff">Four full BFCL remote runs. Best: 55.5% → 63.6%.</text>
|
|
28
28
|
</g>
|
|
29
29
|
</svg>
|