@botbotgo/better-call 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -24
- package/benchmarks/bfcl-real-remote-completed-summary.json +119 -1
- package/docs/banner.svg +2 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
# BetterCall
|
|
6
6
|
|
|
7
|
-
**One-line wrapper.
|
|
7
|
+
**One-line wrapper. Seven full BFCL remote runs completed. Best model: 73.4% raw → 83.8% with BetterCall.**
|
|
8
8
|
|
|
9
9
|
```ts
|
|
10
10
|
const tools = betterTools([searchTool, calculatorTool]);
|
|
@@ -121,33 +121,60 @@ Measured with real Ollama `/api/chat` calls over all supported BFCL v4 single-tu
|
|
|
121
121
|
|
|
122
122
|
Latest completed remote run artifact: `benchmarks/bfcl-real-remote-completed-summary.json`.
|
|
123
123
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
124
|
+
Accuracy of each model's raw output compared with the same output wrapped with BetterCall:
|
|
125
|
+
|
|
126
|
+
```text
|
|
127
|
+
granite4.1:3b
|
|
128
|
+
Raw 73.4% | #############################...........
|
|
129
|
+
BetterCall 83.8% | ##################################......
|
|
130
|
+
qwen2.5:7b-instruct
|
|
131
|
+
Raw 72.2% | #############################...........
|
|
132
|
+
BetterCall 78.2% | ###############################.........
|
|
133
|
+
qwen3:0.6b
|
|
134
|
+
Raw 55.5% | ######################..................
|
|
135
|
+
BetterCall 63.6% | #########################...............
|
|
136
|
+
qwen3.5:0.8b
|
|
137
|
+
Raw 54.6% | ######################..................
|
|
138
|
+
BetterCall 56.9% | #######################.................
|
|
139
|
+
qwen3.5:2b
|
|
140
|
+
Raw 53.9% | ######################..................
|
|
141
|
+
BetterCall 54.9% | ######################..................
|
|
142
|
+
lfm2.5-thinking:latest
|
|
143
|
+
Raw 50.8% | ####################....................
|
|
144
|
+
BetterCall 54.8% | ######################..................
|
|
145
|
+
gemma4:e2b
|
|
146
|
+
Raw 24.3% | ##########..............................
|
|
147
|
+
BetterCall 24.7% | ##########..............................
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
| Rank | Model | Completed cases | Raw model | BetterCall | Lift | Request errors |
|
|
151
|
+
| ---: | --- | ---: | ---: | ---: | ---: | ---: |
|
|
152
|
+
| 1 | `granite4.1:3b` | 3,625 | 73.4% | 83.8% | +10.4pp | 25 |
|
|
153
|
+
| 2 | `qwen2.5:7b-instruct` | 3,625 | 72.2% | 78.2% | +5.9pp | 80 |
|
|
154
|
+
| 3 | `qwen3:0.6b` | 3,625 | 55.5% | 63.6% | +8.2pp | 217 |
|
|
155
|
+
| 4 | `qwen3.5:0.8b` | 3,625 | 54.6% | 56.9% | +2.3pp | 901 |
|
|
156
|
+
| 5 | `qwen3.5:2b` | 3,625 | 53.9% | 54.9% | +1.0pp | 1,308 |
|
|
157
|
+
| 6 | `lfm2.5-thinking:latest` | 3,625 | 50.8% | 54.8% | +4.0pp | 1,142 |
|
|
158
|
+
| 7 | `gemma4:e2b` | 3,625 | 24.3% | 24.7% | +0.4pp | 2,641 |
|
|
132
159
|
|
|
133
|
-
Latest completed model category detail: `
|
|
160
|
+
Per-category detail for the most recently completed model, `qwen2.5:7b-instruct`:
|
|
134
161
|
|
|
135
162
|
| Category | Cases | Raw | BetterCall repair | Lift | Request errors |
|
|
136
163
|
| --- | ---: | ---: | ---: | ---: | ---: |
|
|
137
|
-
| `simple_python` | 400 |
|
|
138
|
-
| `simple_java` | 100 |
|
|
139
|
-
| `simple_javascript` | 50 |
|
|
140
|
-
| `multiple` | 200 |
|
|
141
|
-
| `parallel` | 200 |
|
|
142
|
-
| `parallel_multiple` | 200 |
|
|
143
|
-
| `irrelevance` | 240 |
|
|
144
|
-
| `live_simple` | 258 |
|
|
145
|
-
| `live_multiple` | 1,053 |
|
|
146
|
-
| `live_parallel` | 16 |
|
|
147
|
-
| `live_parallel_multiple` | 24 |
|
|
148
|
-
| `live_irrelevance` | 884 |
|
|
149
|
-
|
|
150
|
-
The strongest `
|
|
164
|
+
| `simple_python` | 400 | 91.3% | 93.0% | +1.8pp | 9 |
|
|
165
|
+
| `simple_java` | 100 | 55.0% | 59.0% | +4.0pp | 3 |
|
|
166
|
+
| `simple_javascript` | 50 | 60.0% | 64.0% | +4.0pp | 0 |
|
|
167
|
+
| `multiple` | 200 | 89.5% | 91.0% | +1.5pp | 3 |
|
|
168
|
+
| `parallel` | 200 | 81.0% | 84.5% | +3.5pp | 2 |
|
|
169
|
+
| `parallel_multiple` | 200 | 77.0% | 78.0% | +1.0pp | 6 |
|
|
170
|
+
| `irrelevance` | 240 | 64.6% | 89.6% | +25.0pp | 0 |
|
|
171
|
+
| `live_simple` | 258 | 69.0% | 72.1% | +3.1pp | 1 |
|
|
172
|
+
| `live_multiple` | 1,053 | 69.0% | 73.4% | +4.4pp | 18 |
|
|
173
|
+
| `live_parallel` | 16 | 25.0% | 37.5% | +12.5pp | 2 |
|
|
174
|
+
| `live_parallel_multiple` | 24 | 50.0% | 50.0% | +0.0pp | 2 |
|
|
175
|
+
| `live_irrelevance` | 884 | 67.6% | 75.9% | +8.3pp | 34 |
|
|
176
|
+
|
|
177
|
+
The strongest `qwen2.5:7b-instruct` category lift was `irrelevance`, improving from 64.6% to 89.6%. Among the `live_*` categories, the largest absolute gain was `live_irrelevance`, where 73 more cases were answered correctly (598 → 671 of 884).
|
|
151
178
|
|
|
152
179
|
Historical targeted wrapper benchmark:
|
|
153
180
|
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "BFCL v4 full remote Ollama runs",
|
|
3
3
|
"source": "Real model calls against all supported BFCL v4 single-turn tool-call categories completed in this repository",
|
|
4
4
|
"note": "Endpoint redacted. Scores count request errors/timeouts as incorrect. This is not an official BFCL leaderboard submission.",
|
|
5
|
-
"generatedAt": "2026-05-
|
|
5
|
+
"generatedAt": "2026-05-08T22:24:15.073Z",
|
|
6
6
|
"results": [
|
|
7
7
|
{
|
|
8
8
|
"model": "granite4.1:3b",
|
|
@@ -122,6 +122,124 @@
|
|
|
122
122
|
}
|
|
123
123
|
]
|
|
124
124
|
},
|
|
125
|
+
{
|
|
126
|
+
"model": "qwen2.5:7b-instruct",
|
|
127
|
+
"total": 3625,
|
|
128
|
+
"rawCorrect": 2619,
|
|
129
|
+
"betterCorrect": 2833,
|
|
130
|
+
"errors": 80,
|
|
131
|
+
"repaired": 279,
|
|
132
|
+
"categories": [
|
|
133
|
+
{
|
|
134
|
+
"model": "qwen2.5:7b-instruct",
|
|
135
|
+
"category": "simple_python",
|
|
136
|
+
"total": 400,
|
|
137
|
+
"rawCorrect": 365,
|
|
138
|
+
"betterCorrect": 372,
|
|
139
|
+
"errors": 9,
|
|
140
|
+
"repaired": 8
|
|
141
|
+
},
|
|
142
|
+
{
|
|
143
|
+
"model": "qwen2.5:7b-instruct",
|
|
144
|
+
"category": "simple_java",
|
|
145
|
+
"total": 100,
|
|
146
|
+
"rawCorrect": 55,
|
|
147
|
+
"betterCorrect": 59,
|
|
148
|
+
"errors": 3,
|
|
149
|
+
"repaired": 12
|
|
150
|
+
},
|
|
151
|
+
{
|
|
152
|
+
"model": "qwen2.5:7b-instruct",
|
|
153
|
+
"category": "simple_javascript",
|
|
154
|
+
"total": 50,
|
|
155
|
+
"rawCorrect": 30,
|
|
156
|
+
"betterCorrect": 32,
|
|
157
|
+
"errors": 0,
|
|
158
|
+
"repaired": 11
|
|
159
|
+
},
|
|
160
|
+
{
|
|
161
|
+
"model": "qwen2.5:7b-instruct",
|
|
162
|
+
"category": "multiple",
|
|
163
|
+
"total": 200,
|
|
164
|
+
"rawCorrect": 179,
|
|
165
|
+
"betterCorrect": 182,
|
|
166
|
+
"errors": 3,
|
|
167
|
+
"repaired": 4
|
|
168
|
+
},
|
|
169
|
+
{
|
|
170
|
+
"model": "qwen2.5:7b-instruct",
|
|
171
|
+
"category": "parallel",
|
|
172
|
+
"total": 200,
|
|
173
|
+
"rawCorrect": 162,
|
|
174
|
+
"betterCorrect": 169,
|
|
175
|
+
"errors": 2,
|
|
176
|
+
"repaired": 9
|
|
177
|
+
},
|
|
178
|
+
{
|
|
179
|
+
"model": "qwen2.5:7b-instruct",
|
|
180
|
+
"category": "parallel_multiple",
|
|
181
|
+
"total": 200,
|
|
182
|
+
"rawCorrect": 154,
|
|
183
|
+
"betterCorrect": 156,
|
|
184
|
+
"errors": 6,
|
|
185
|
+
"repaired": 7
|
|
186
|
+
},
|
|
187
|
+
{
|
|
188
|
+
"model": "qwen2.5:7b-instruct",
|
|
189
|
+
"category": "irrelevance",
|
|
190
|
+
"total": 240,
|
|
191
|
+
"rawCorrect": 155,
|
|
192
|
+
"betterCorrect": 215,
|
|
193
|
+
"errors": 0,
|
|
194
|
+
"repaired": 60
|
|
195
|
+
},
|
|
196
|
+
{
|
|
197
|
+
"model": "qwen2.5:7b-instruct",
|
|
198
|
+
"category": "live_simple",
|
|
199
|
+
"total": 258,
|
|
200
|
+
"rawCorrect": 178,
|
|
201
|
+
"betterCorrect": 186,
|
|
202
|
+
"errors": 1,
|
|
203
|
+
"repaired": 16
|
|
204
|
+
},
|
|
205
|
+
{
|
|
206
|
+
"model": "qwen2.5:7b-instruct",
|
|
207
|
+
"category": "live_multiple",
|
|
208
|
+
"total": 1053,
|
|
209
|
+
"rawCorrect": 727,
|
|
210
|
+
"betterCorrect": 773,
|
|
211
|
+
"errors": 18,
|
|
212
|
+
"repaired": 75
|
|
213
|
+
},
|
|
214
|
+
{
|
|
215
|
+
"model": "qwen2.5:7b-instruct",
|
|
216
|
+
"category": "live_parallel",
|
|
217
|
+
"total": 16,
|
|
218
|
+
"rawCorrect": 4,
|
|
219
|
+
"betterCorrect": 6,
|
|
220
|
+
"errors": 2,
|
|
221
|
+
"repaired": 4
|
|
222
|
+
},
|
|
223
|
+
{
|
|
224
|
+
"model": "qwen2.5:7b-instruct",
|
|
225
|
+
"category": "live_parallel_multiple",
|
|
226
|
+
"total": 24,
|
|
227
|
+
"rawCorrect": 12,
|
|
228
|
+
"betterCorrect": 12,
|
|
229
|
+
"errors": 2,
|
|
230
|
+
"repaired": 0
|
|
231
|
+
},
|
|
232
|
+
{
|
|
233
|
+
"model": "qwen2.5:7b-instruct",
|
|
234
|
+
"category": "live_irrelevance",
|
|
235
|
+
"total": 884,
|
|
236
|
+
"rawCorrect": 598,
|
|
237
|
+
"betterCorrect": 671,
|
|
238
|
+
"errors": 34,
|
|
239
|
+
"repaired": 73
|
|
240
|
+
}
|
|
241
|
+
]
|
|
242
|
+
},
|
|
125
243
|
{
|
|
126
244
|
"model": "qwen3:0.6b",
|
|
127
245
|
"total": 3625,
|
package/docs/banner.svg
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<svg xmlns="http://www.w3.org/2000/svg" width="1400" height="520" viewBox="0 0 1400 520" role="img" aria-labelledby="title desc">
|
|
2
2
|
<title id="title">BetterCall banner</title>
|
|
3
|
-
<desc id="desc">BetterCall is a one-line wrapper with
|
|
3
|
+
<desc id="desc">BetterCall is a one-line wrapper with seven full BFCL remote runs completed; the best run improves from 73.4 percent to 83.8 percent.</desc>
|
|
4
4
|
<defs>
|
|
5
5
|
<linearGradient id="background" x1="0" y1="0" x2="1" y2="1">
|
|
6
6
|
<stop offset="0" stop-color="#07111f"/>
|
|
@@ -24,6 +24,6 @@
|
|
|
24
24
|
<g transform="translate(112 112)">
|
|
25
25
|
<text x="0" y="80" font-family="Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="78" font-weight="800" fill="#f8fbff">BetterCall</text>
|
|
26
26
|
<rect x="4" y="116" width="430" height="8" rx="4" fill="url(#accent)"/>
|
|
27
|
-
<text x="0" y="186" font-family="Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="34" font-weight="670" fill="#dcecff">
|
|
27
|
+
<text x="0" y="186" font-family="Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="34" font-weight="670" fill="#dcecff">Seven full BFCL remote runs. Best: 73.4% → 83.8%.</text>
|
|
28
28
|
</g>
|
|
29
29
|
</svg>
|