@botbotgo/better-call 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -4,7 +4,7 @@
4
4
 
5
5
  # BetterCall
6
6
 
7
- **One-line wrapper. Five full BFCL remote runs completed. Best: 73.4% → 83.8%.**
7
+ **One-line wrapper. Seven full BFCL remote runs completed. Best: 73.4% → 83.8%.**
8
8
 
9
9
  ```ts
10
10
  const tools = betterTools([searchTool, calculatorTool]);
@@ -124,29 +124,31 @@ Latest completed remote run artifact: `benchmarks/bfcl-real-remote-completed-sum
124
124
  | Model | Completed cases | Raw | BetterCall repair | Accuracy lift | Request errors |
125
125
  | --- | ---: | ---: | ---: | ---: | ---: |
126
126
  | `granite4.1:3b` | 3,625 | 73.4% | 83.8% | +10.4pp | 25 |
127
+ | `qwen2.5:7b-instruct` | 3,625 | 72.2% | 78.2% | +5.9pp | 80 |
127
128
  | `qwen3:0.6b` | 3,625 | 55.5% | 63.6% | +8.2pp | 217 |
128
129
  | `qwen3.5:0.8b` | 3,625 | 54.6% | 56.9% | +2.3pp | 901 |
129
130
  | `qwen3.5:2b` | 3,625 | 53.9% | 54.9% | +1.0pp | 1,308 |
130
131
  | `lfm2.5-thinking:latest` | 3,625 | 50.8% | 54.8% | +4.0pp | 1,142 |
132
+ | `gemma4:e2b` | 3,625 | 24.3% | 24.7% | +0.4pp | 2,641 |
131
133
 
132
- Latest completed model category detail: `granite4.1:3b`.
134
+ Latest completed model category detail: `qwen2.5:7b-instruct`.
133
135
 
134
136
  | Category | Cases | Raw | BetterCall repair | Lift | Request errors |
135
137
  | --- | ---: | ---: | ---: | ---: | ---: |
136
- | `simple_python` | 400 | 86.5% | 86.8% | +0.3pp | 18 |
137
- | `simple_java` | 100 | 70.0% | 70.0% | +0.0pp | 0 |
138
- | `simple_javascript` | 50 | 72.0% | 72.0% | +0.0pp | 0 |
139
- | `multiple` | 200 | 90.5% | 90.5% | +0.0pp | 0 |
140
- | `parallel` | 200 | 83.0% | 83.0% | +0.0pp | 0 |
141
- | `parallel_multiple` | 200 | 78.5% | 78.5% | +0.0pp | 0 |
142
- | `irrelevance` | 240 | 67.1% | 100.0% | +32.9pp | 0 |
143
- | `live_simple` | 258 | 74.0% | 74.0% | +0.0pp | 0 |
144
- | `live_multiple` | 1,053 | 70.8% | 71.0% | +0.2pp | 0 |
145
- | `live_parallel` | 16 | 68.8% | 68.8% | +0.0pp | 0 |
146
- | `live_parallel_multiple` | 24 | 70.8% | 70.8% | +0.0pp | 0 |
147
- | `live_irrelevance` | 884 | 65.5% | 99.0% | +33.5pp | 7 |
148
-
149
- The strongest `granite4.1:3b` categories were no-tool-call tasks: `irrelevance` improved from 67.1% to 100.0%, and `live_irrelevance` improved from 65.5% to 99.0%.
138
+ | `simple_python` | 400 | 91.3% | 93.0% | +1.8pp | 9 |
139
+ | `simple_java` | 100 | 55.0% | 59.0% | +4.0pp | 3 |
140
+ | `simple_javascript` | 50 | 60.0% | 64.0% | +4.0pp | 0 |
141
+ | `multiple` | 200 | 89.5% | 91.0% | +1.5pp | 3 |
142
+ | `parallel` | 200 | 81.0% | 84.5% | +3.5pp | 2 |
143
+ | `parallel_multiple` | 200 | 77.0% | 78.0% | +1.0pp | 6 |
144
+ | `irrelevance` | 240 | 64.6% | 89.6% | +25.0pp | 0 |
145
+ | `live_simple` | 258 | 69.0% | 72.1% | +3.1pp | 1 |
146
+ | `live_multiple` | 1,053 | 69.0% | 73.4% | +4.4pp | 18 |
147
+ | `live_parallel` | 16 | 25.0% | 37.5% | +12.5pp | 2 |
148
+ | `live_parallel_multiple` | 24 | 50.0% | 50.0% | +0.0pp | 2 |
149
+ | `live_irrelevance` | 884 | 67.6% | 75.9% | +8.3pp | 34 |
150
+
151
+ The strongest `qwen2.5:7b-instruct` category lift was `irrelevance`, improving from 64.6% to 89.6%. Among the `live_*` categories, the largest gain in absolute case count was `live_irrelevance`, with 73 more correct cases (598 → 671 of 884).
150
152
 
151
153
  Historical targeted wrapper benchmark:
152
154
 
@@ -2,7 +2,7 @@
2
2
  "name": "BFCL v4 full remote Ollama runs",
3
3
  "source": "Real model calls against all supported BFCL v4 single-turn tool-call categories completed in this repository",
4
4
  "note": "Endpoint redacted. Scores count request errors/timeouts as incorrect. This is not an official BFCL leaderboard submission.",
5
- "generatedAt": "2026-05-08T16:02:50.925Z",
5
+ "generatedAt": "2026-05-08T22:24:15.073Z",
6
6
  "results": [
7
7
  {
8
8
  "model": "granite4.1:3b",
@@ -122,6 +122,124 @@
122
122
  }
123
123
  ]
124
124
  },
125
+ {
126
+ "model": "qwen2.5:7b-instruct",
127
+ "total": 3625,
128
+ "rawCorrect": 2619,
129
+ "betterCorrect": 2833,
130
+ "errors": 80,
131
+ "repaired": 279,
132
+ "categories": [
133
+ {
134
+ "model": "qwen2.5:7b-instruct",
135
+ "category": "simple_python",
136
+ "total": 400,
137
+ "rawCorrect": 365,
138
+ "betterCorrect": 372,
139
+ "errors": 9,
140
+ "repaired": 8
141
+ },
142
+ {
143
+ "model": "qwen2.5:7b-instruct",
144
+ "category": "simple_java",
145
+ "total": 100,
146
+ "rawCorrect": 55,
147
+ "betterCorrect": 59,
148
+ "errors": 3,
149
+ "repaired": 12
150
+ },
151
+ {
152
+ "model": "qwen2.5:7b-instruct",
153
+ "category": "simple_javascript",
154
+ "total": 50,
155
+ "rawCorrect": 30,
156
+ "betterCorrect": 32,
157
+ "errors": 0,
158
+ "repaired": 11
159
+ },
160
+ {
161
+ "model": "qwen2.5:7b-instruct",
162
+ "category": "multiple",
163
+ "total": 200,
164
+ "rawCorrect": 179,
165
+ "betterCorrect": 182,
166
+ "errors": 3,
167
+ "repaired": 4
168
+ },
169
+ {
170
+ "model": "qwen2.5:7b-instruct",
171
+ "category": "parallel",
172
+ "total": 200,
173
+ "rawCorrect": 162,
174
+ "betterCorrect": 169,
175
+ "errors": 2,
176
+ "repaired": 9
177
+ },
178
+ {
179
+ "model": "qwen2.5:7b-instruct",
180
+ "category": "parallel_multiple",
181
+ "total": 200,
182
+ "rawCorrect": 154,
183
+ "betterCorrect": 156,
184
+ "errors": 6,
185
+ "repaired": 7
186
+ },
187
+ {
188
+ "model": "qwen2.5:7b-instruct",
189
+ "category": "irrelevance",
190
+ "total": 240,
191
+ "rawCorrect": 155,
192
+ "betterCorrect": 215,
193
+ "errors": 0,
194
+ "repaired": 60
195
+ },
196
+ {
197
+ "model": "qwen2.5:7b-instruct",
198
+ "category": "live_simple",
199
+ "total": 258,
200
+ "rawCorrect": 178,
201
+ "betterCorrect": 186,
202
+ "errors": 1,
203
+ "repaired": 16
204
+ },
205
+ {
206
+ "model": "qwen2.5:7b-instruct",
207
+ "category": "live_multiple",
208
+ "total": 1053,
209
+ "rawCorrect": 727,
210
+ "betterCorrect": 773,
211
+ "errors": 18,
212
+ "repaired": 75
213
+ },
214
+ {
215
+ "model": "qwen2.5:7b-instruct",
216
+ "category": "live_parallel",
217
+ "total": 16,
218
+ "rawCorrect": 4,
219
+ "betterCorrect": 6,
220
+ "errors": 2,
221
+ "repaired": 4
222
+ },
223
+ {
224
+ "model": "qwen2.5:7b-instruct",
225
+ "category": "live_parallel_multiple",
226
+ "total": 24,
227
+ "rawCorrect": 12,
228
+ "betterCorrect": 12,
229
+ "errors": 2,
230
+ "repaired": 0
231
+ },
232
+ {
233
+ "model": "qwen2.5:7b-instruct",
234
+ "category": "live_irrelevance",
235
+ "total": 884,
236
+ "rawCorrect": 598,
237
+ "betterCorrect": 671,
238
+ "errors": 34,
239
+ "repaired": 73
240
+ }
241
+ ]
242
+ },
125
243
  {
126
244
  "model": "qwen3:0.6b",
127
245
  "total": 3625,
@@ -593,6 +711,124 @@
593
711
  "repaired": 131
594
712
  }
595
713
  ]
714
+ },
715
+ {
716
+ "model": "gemma4:e2b",
717
+ "total": 3625,
718
+ "rawCorrect": 880,
719
+ "betterCorrect": 895,
720
+ "errors": 2641,
721
+ "repaired": 22,
722
+ "categories": [
723
+ {
724
+ "model": "gemma4:e2b",
725
+ "category": "simple_python",
726
+ "total": 400,
727
+ "rawCorrect": 328,
728
+ "betterCorrect": 333,
729
+ "errors": 46,
730
+ "repaired": 5
731
+ },
732
+ {
733
+ "model": "gemma4:e2b",
734
+ "category": "simple_java",
735
+ "total": 100,
736
+ "rawCorrect": 50,
737
+ "betterCorrect": 50,
738
+ "errors": 29,
739
+ "repaired": 5
740
+ },
741
+ {
742
+ "model": "gemma4:e2b",
743
+ "category": "simple_javascript",
744
+ "total": 50,
745
+ "rawCorrect": 17,
746
+ "betterCorrect": 18,
747
+ "errors": 22,
748
+ "repaired": 1
749
+ },
750
+ {
751
+ "model": "gemma4:e2b",
752
+ "category": "multiple",
753
+ "total": 200,
754
+ "rawCorrect": 153,
755
+ "betterCorrect": 158,
756
+ "errors": 32,
757
+ "repaired": 6
758
+ },
759
+ {
760
+ "model": "gemma4:e2b",
761
+ "category": "parallel",
762
+ "total": 200,
763
+ "rawCorrect": 108,
764
+ "betterCorrect": 108,
765
+ "errors": 84,
766
+ "repaired": 0
767
+ },
768
+ {
769
+ "model": "gemma4:e2b",
770
+ "category": "parallel_multiple",
771
+ "total": 200,
772
+ "rawCorrect": 85,
773
+ "betterCorrect": 85,
774
+ "errors": 109,
775
+ "repaired": 1
776
+ },
777
+ {
778
+ "model": "gemma4:e2b",
779
+ "category": "irrelevance",
780
+ "total": 240,
781
+ "rawCorrect": 127,
782
+ "betterCorrect": 130,
783
+ "errors": 104,
784
+ "repaired": 3
785
+ },
786
+ {
787
+ "model": "gemma4:e2b",
788
+ "category": "live_simple",
789
+ "total": 258,
790
+ "rawCorrect": 12,
791
+ "betterCorrect": 13,
792
+ "errors": 238,
793
+ "repaired": 1
794
+ },
795
+ {
796
+ "model": "gemma4:e2b",
797
+ "category": "live_multiple",
798
+ "total": 1053,
799
+ "rawCorrect": 0,
800
+ "betterCorrect": 0,
801
+ "errors": 1053,
802
+ "repaired": 0
803
+ },
804
+ {
805
+ "model": "gemma4:e2b",
806
+ "category": "live_parallel",
807
+ "total": 16,
808
+ "rawCorrect": 0,
809
+ "betterCorrect": 0,
810
+ "errors": 16,
811
+ "repaired": 0
812
+ },
813
+ {
814
+ "model": "gemma4:e2b",
815
+ "category": "live_parallel_multiple",
816
+ "total": 24,
817
+ "rawCorrect": 0,
818
+ "betterCorrect": 0,
819
+ "errors": 24,
820
+ "repaired": 0
821
+ },
822
+ {
823
+ "model": "gemma4:e2b",
824
+ "category": "live_irrelevance",
825
+ "total": 884,
826
+ "rawCorrect": 0,
827
+ "betterCorrect": 0,
828
+ "errors": 884,
829
+ "repaired": 0
830
+ }
831
+ ]
596
832
  }
597
833
  ]
598
834
  }
package/docs/banner.svg CHANGED
@@ -1,6 +1,6 @@
1
1
  <svg xmlns="http://www.w3.org/2000/svg" width="1400" height="520" viewBox="0 0 1400 520" role="img" aria-labelledby="title desc">
2
2
  <title id="title">BetterCall banner</title>
3
- <desc id="desc">BetterCall is a one-line wrapper with five full BFCL remote runs completed; the best run improves from 73.4 percent to 83.8 percent.</desc>
3
+ <desc id="desc">BetterCall is a one-line wrapper with seven full BFCL remote runs completed; the best run improves from 73.4 percent to 83.8 percent.</desc>
4
4
  <defs>
5
5
  <linearGradient id="background" x1="0" y1="0" x2="1" y2="1">
6
6
  <stop offset="0" stop-color="#07111f"/>
@@ -24,6 +24,6 @@
24
24
  <g transform="translate(112 112)">
25
25
  <text x="0" y="80" font-family="Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="78" font-weight="800" fill="#f8fbff">BetterCall</text>
26
26
  <rect x="4" y="116" width="430" height="8" rx="4" fill="url(#accent)"/>
27
- <text x="0" y="186" font-family="Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="34" font-weight="670" fill="#dcecff">Five full BFCL remote runs. Best: 73.4% → 83.8%.</text>
27
+ <text x="0" y="186" font-family="Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="34" font-weight="670" fill="#dcecff">Seven full BFCL remote runs. Best: 73.4% → 83.8%.</text>
28
28
  </g>
29
29
  </svg>
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@botbotgo/better-call",
3
- "version": "0.1.4",
3
+ "version": "0.1.6",
4
4
  "description": "LLM tool-call reliability layer.",
5
5
  "type": "module",
6
6
  "license": "Apache-2.0",