@botbotgo/better-call 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -16
- package/benchmarks/bfcl-real-remote-completed-summary.json +237 -1
- package/docs/banner.svg +2 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
# BetterCall
|
|
6
6
|
|
|
7
|
-
**One-line wrapper.
|
|
7
|
+
**One-line wrapper. Six full BFCL remote runs completed. Best: 73.4% → 83.8%.**
|
|
8
8
|
|
|
9
9
|
```ts
|
|
10
10
|
const tools = betterTools([searchTool, calculatorTool]);
|
|
@@ -123,29 +123,31 @@ Latest completed remote run artifact: `benchmarks/bfcl-real-remote-completed-sum
|
|
|
123
123
|
|
|
124
124
|
| Model | Completed cases | Raw | BetterCall repair | Accuracy lift | Request errors |
|
|
125
125
|
| --- | ---: | ---: | ---: | ---: | ---: |
|
|
126
|
+
| `granite4.1:3b` | 3,625 | 73.4% | 83.8% | +10.4pp | 25 |
|
|
126
127
|
| `qwen3:0.6b` | 3,625 | 55.5% | 63.6% | +8.2pp | 217 |
|
|
127
128
|
| `qwen3.5:0.8b` | 3,625 | 54.6% | 56.9% | +2.3pp | 901 |
|
|
128
129
|
| `qwen3.5:2b` | 3,625 | 53.9% | 54.9% | +1.0pp | 1,308 |
|
|
129
130
|
| `lfm2.5-thinking:latest` | 3,625 | 50.8% | 54.8% | +4.0pp | 1,142 |
|
|
131
|
+
| `gemma4:e2b` | 3,625 | 24.3% | 24.7% | +0.4pp | 2,641 |
|
|
130
132
|
|
|
131
|
-
Latest completed model category detail: `
|
|
133
|
+
Latest completed model category detail: `gemma4:e2b`.
|
|
132
134
|
|
|
133
135
|
| Category | Cases | Raw | BetterCall repair | Lift | Request errors |
|
|
134
136
|
| --- | ---: | ---: | ---: | ---: | ---: |
|
|
135
|
-
| `simple_python` | 400 |
|
|
136
|
-
| `simple_java` | 100 |
|
|
137
|
-
| `simple_javascript` | 50 |
|
|
138
|
-
| `multiple` | 200 |
|
|
139
|
-
| `parallel` | 200 |
|
|
140
|
-
| `parallel_multiple` | 200 |
|
|
141
|
-
| `irrelevance` | 240 |
|
|
142
|
-
| `live_simple` | 258 |
|
|
143
|
-
| `live_multiple` | 1,053 |
|
|
144
|
-
| `live_parallel` | 16 |
|
|
145
|
-
| `live_parallel_multiple` | 24 |
|
|
146
|
-
| `live_irrelevance` | 884 |
|
|
147
|
-
|
|
148
|
-
The strongest `
|
|
137
|
+
| `simple_python` | 400 | 82.0% | 83.3% | +1.3pp | 46 |
|
|
138
|
+
| `simple_java` | 100 | 50.0% | 50.0% | +0.0pp | 29 |
|
|
139
|
+
| `simple_javascript` | 50 | 34.0% | 36.0% | +2.0pp | 22 |
|
|
140
|
+
| `multiple` | 200 | 76.5% | 79.0% | +2.5pp | 32 |
|
|
141
|
+
| `parallel` | 200 | 54.0% | 54.0% | +0.0pp | 84 |
|
|
142
|
+
| `parallel_multiple` | 200 | 42.5% | 42.5% | +0.0pp | 109 |
|
|
143
|
+
| `irrelevance` | 240 | 52.9% | 54.2% | +1.3pp | 104 |
|
|
144
|
+
| `live_simple` | 258 | 4.7% | 5.0% | +0.4pp | 238 |
|
|
145
|
+
| `live_multiple` | 1,053 | 0.0% | 0.0% | +0.0pp | 1,053 |
|
|
146
|
+
| `live_parallel` | 16 | 0.0% | 0.0% | +0.0pp | 16 |
|
|
147
|
+
| `live_parallel_multiple` | 24 | 0.0% | 0.0% | +0.0pp | 24 |
|
|
148
|
+
| `live_irrelevance` | 884 | 0.0% | 0.0% | +0.0pp | 884 |
|
|
149
|
+
|
|
150
|
+
The strongest `gemma4:e2b` category was `multiple`, improving from 76.5% to 79.0%. The live categories were dominated by request errors under the 5s timeout and are preserved as measured in the artifact.
|
|
149
151
|
|
|
150
152
|
Historical targeted wrapper benchmark:
|
|
151
153
|
|
|
package/benchmarks/bfcl-real-remote-completed-summary.json
CHANGED
|
@@ -2,8 +2,126 @@
|
|
|
2
2
|
"name": "BFCL v4 full remote Ollama runs",
|
|
3
3
|
"source": "Real model calls against all supported BFCL v4 single-turn tool-call categories completed in this repository",
|
|
4
4
|
"note": "Endpoint redacted. Scores count request errors/timeouts as incorrect. This is not an official BFCL leaderboard submission.",
|
|
5
|
-
"generatedAt": "2026-05-
|
|
5
|
+
"generatedAt": "2026-05-08T21:10:42.234Z",
|
|
6
6
|
"results": [
|
|
7
|
+
{
|
|
8
|
+
"model": "granite4.1:3b",
|
|
9
|
+
"total": 3625,
|
|
10
|
+
"rawCorrect": 2661,
|
|
11
|
+
"betterCorrect": 3039,
|
|
12
|
+
"errors": 25,
|
|
13
|
+
"repaired": 380,
|
|
14
|
+
"categories": [
|
|
15
|
+
{
|
|
16
|
+
"model": "granite4.1:3b",
|
|
17
|
+
"category": "simple_python",
|
|
18
|
+
"total": 400,
|
|
19
|
+
"rawCorrect": 346,
|
|
20
|
+
"betterCorrect": 347,
|
|
21
|
+
"errors": 18,
|
|
22
|
+
"repaired": 1
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"model": "granite4.1:3b",
|
|
26
|
+
"category": "simple_java",
|
|
27
|
+
"total": 100,
|
|
28
|
+
"rawCorrect": 70,
|
|
29
|
+
"betterCorrect": 70,
|
|
30
|
+
"errors": 0,
|
|
31
|
+
"repaired": 1
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
"model": "granite4.1:3b",
|
|
35
|
+
"category": "simple_javascript",
|
|
36
|
+
"total": 50,
|
|
37
|
+
"rawCorrect": 36,
|
|
38
|
+
"betterCorrect": 36,
|
|
39
|
+
"errors": 0,
|
|
40
|
+
"repaired": 1
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
"model": "granite4.1:3b",
|
|
44
|
+
"category": "multiple",
|
|
45
|
+
"total": 200,
|
|
46
|
+
"rawCorrect": 181,
|
|
47
|
+
"betterCorrect": 181,
|
|
48
|
+
"errors": 0,
|
|
49
|
+
"repaired": 0
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
"model": "granite4.1:3b",
|
|
53
|
+
"category": "parallel",
|
|
54
|
+
"total": 200,
|
|
55
|
+
"rawCorrect": 166,
|
|
56
|
+
"betterCorrect": 166,
|
|
57
|
+
"errors": 0,
|
|
58
|
+
"repaired": 0
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
"model": "granite4.1:3b",
|
|
62
|
+
"category": "parallel_multiple",
|
|
63
|
+
"total": 200,
|
|
64
|
+
"rawCorrect": 157,
|
|
65
|
+
"betterCorrect": 157,
|
|
66
|
+
"errors": 0,
|
|
67
|
+
"repaired": 0
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
"model": "granite4.1:3b",
|
|
71
|
+
"category": "irrelevance",
|
|
72
|
+
"total": 240,
|
|
73
|
+
"rawCorrect": 161,
|
|
74
|
+
"betterCorrect": 240,
|
|
75
|
+
"errors": 0,
|
|
76
|
+
"repaired": 79
|
|
77
|
+
},
|
|
78
|
+
{
|
|
79
|
+
"model": "granite4.1:3b",
|
|
80
|
+
"category": "live_simple",
|
|
81
|
+
"total": 258,
|
|
82
|
+
"rawCorrect": 191,
|
|
83
|
+
"betterCorrect": 191,
|
|
84
|
+
"errors": 0,
|
|
85
|
+
"repaired": 0
|
|
86
|
+
},
|
|
87
|
+
{
|
|
88
|
+
"model": "granite4.1:3b",
|
|
89
|
+
"category": "live_multiple",
|
|
90
|
+
"total": 1053,
|
|
91
|
+
"rawCorrect": 746,
|
|
92
|
+
"betterCorrect": 748,
|
|
93
|
+
"errors": 0,
|
|
94
|
+
"repaired": 2
|
|
95
|
+
},
|
|
96
|
+
{
|
|
97
|
+
"model": "granite4.1:3b",
|
|
98
|
+
"category": "live_parallel",
|
|
99
|
+
"total": 16,
|
|
100
|
+
"rawCorrect": 11,
|
|
101
|
+
"betterCorrect": 11,
|
|
102
|
+
"errors": 0,
|
|
103
|
+
"repaired": 0
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"model": "granite4.1:3b",
|
|
107
|
+
"category": "live_parallel_multiple",
|
|
108
|
+
"total": 24,
|
|
109
|
+
"rawCorrect": 17,
|
|
110
|
+
"betterCorrect": 17,
|
|
111
|
+
"errors": 0,
|
|
112
|
+
"repaired": 0
|
|
113
|
+
},
|
|
114
|
+
{
|
|
115
|
+
"model": "granite4.1:3b",
|
|
116
|
+
"category": "live_irrelevance",
|
|
117
|
+
"total": 884,
|
|
118
|
+
"rawCorrect": 579,
|
|
119
|
+
"betterCorrect": 875,
|
|
120
|
+
"errors": 7,
|
|
121
|
+
"repaired": 296
|
|
122
|
+
}
|
|
123
|
+
]
|
|
124
|
+
},
|
|
7
125
|
{
|
|
8
126
|
"model": "qwen3:0.6b",
|
|
9
127
|
"total": 3625,
|
|
@@ -475,6 +593,124 @@
|
|
|
475
593
|
"repaired": 131
|
|
476
594
|
}
|
|
477
595
|
]
|
|
596
|
+
},
|
|
597
|
+
{
|
|
598
|
+
"model": "gemma4:e2b",
|
|
599
|
+
"total": 3625,
|
|
600
|
+
"rawCorrect": 880,
|
|
601
|
+
"betterCorrect": 895,
|
|
602
|
+
"errors": 2641,
|
|
603
|
+
"repaired": 22,
|
|
604
|
+
"categories": [
|
|
605
|
+
{
|
|
606
|
+
"model": "gemma4:e2b",
|
|
607
|
+
"category": "simple_python",
|
|
608
|
+
"total": 400,
|
|
609
|
+
"rawCorrect": 328,
|
|
610
|
+
"betterCorrect": 333,
|
|
611
|
+
"errors": 46,
|
|
612
|
+
"repaired": 5
|
|
613
|
+
},
|
|
614
|
+
{
|
|
615
|
+
"model": "gemma4:e2b",
|
|
616
|
+
"category": "simple_java",
|
|
617
|
+
"total": 100,
|
|
618
|
+
"rawCorrect": 50,
|
|
619
|
+
"betterCorrect": 50,
|
|
620
|
+
"errors": 29,
|
|
621
|
+
"repaired": 5
|
|
622
|
+
},
|
|
623
|
+
{
|
|
624
|
+
"model": "gemma4:e2b",
|
|
625
|
+
"category": "simple_javascript",
|
|
626
|
+
"total": 50,
|
|
627
|
+
"rawCorrect": 17,
|
|
628
|
+
"betterCorrect": 18,
|
|
629
|
+
"errors": 22,
|
|
630
|
+
"repaired": 1
|
|
631
|
+
},
|
|
632
|
+
{
|
|
633
|
+
"model": "gemma4:e2b",
|
|
634
|
+
"category": "multiple",
|
|
635
|
+
"total": 200,
|
|
636
|
+
"rawCorrect": 153,
|
|
637
|
+
"betterCorrect": 158,
|
|
638
|
+
"errors": 32,
|
|
639
|
+
"repaired": 6
|
|
640
|
+
},
|
|
641
|
+
{
|
|
642
|
+
"model": "gemma4:e2b",
|
|
643
|
+
"category": "parallel",
|
|
644
|
+
"total": 200,
|
|
645
|
+
"rawCorrect": 108,
|
|
646
|
+
"betterCorrect": 108,
|
|
647
|
+
"errors": 84,
|
|
648
|
+
"repaired": 0
|
|
649
|
+
},
|
|
650
|
+
{
|
|
651
|
+
"model": "gemma4:e2b",
|
|
652
|
+
"category": "parallel_multiple",
|
|
653
|
+
"total": 200,
|
|
654
|
+
"rawCorrect": 85,
|
|
655
|
+
"betterCorrect": 85,
|
|
656
|
+
"errors": 109,
|
|
657
|
+
"repaired": 1
|
|
658
|
+
},
|
|
659
|
+
{
|
|
660
|
+
"model": "gemma4:e2b",
|
|
661
|
+
"category": "irrelevance",
|
|
662
|
+
"total": 240,
|
|
663
|
+
"rawCorrect": 127,
|
|
664
|
+
"betterCorrect": 130,
|
|
665
|
+
"errors": 104,
|
|
666
|
+
"repaired": 3
|
|
667
|
+
},
|
|
668
|
+
{
|
|
669
|
+
"model": "gemma4:e2b",
|
|
670
|
+
"category": "live_simple",
|
|
671
|
+
"total": 258,
|
|
672
|
+
"rawCorrect": 12,
|
|
673
|
+
"betterCorrect": 13,
|
|
674
|
+
"errors": 238,
|
|
675
|
+
"repaired": 1
|
|
676
|
+
},
|
|
677
|
+
{
|
|
678
|
+
"model": "gemma4:e2b",
|
|
679
|
+
"category": "live_multiple",
|
|
680
|
+
"total": 1053,
|
|
681
|
+
"rawCorrect": 0,
|
|
682
|
+
"betterCorrect": 0,
|
|
683
|
+
"errors": 1053,
|
|
684
|
+
"repaired": 0
|
|
685
|
+
},
|
|
686
|
+
{
|
|
687
|
+
"model": "gemma4:e2b",
|
|
688
|
+
"category": "live_parallel",
|
|
689
|
+
"total": 16,
|
|
690
|
+
"rawCorrect": 0,
|
|
691
|
+
"betterCorrect": 0,
|
|
692
|
+
"errors": 16,
|
|
693
|
+
"repaired": 0
|
|
694
|
+
},
|
|
695
|
+
{
|
|
696
|
+
"model": "gemma4:e2b",
|
|
697
|
+
"category": "live_parallel_multiple",
|
|
698
|
+
"total": 24,
|
|
699
|
+
"rawCorrect": 0,
|
|
700
|
+
"betterCorrect": 0,
|
|
701
|
+
"errors": 24,
|
|
702
|
+
"repaired": 0
|
|
703
|
+
},
|
|
704
|
+
{
|
|
705
|
+
"model": "gemma4:e2b",
|
|
706
|
+
"category": "live_irrelevance",
|
|
707
|
+
"total": 884,
|
|
708
|
+
"rawCorrect": 0,
|
|
709
|
+
"betterCorrect": 0,
|
|
710
|
+
"errors": 884,
|
|
711
|
+
"repaired": 0
|
|
712
|
+
}
|
|
713
|
+
]
|
|
478
714
|
}
|
|
479
715
|
]
|
|
480
716
|
}
|
package/docs/banner.svg
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<svg xmlns="http://www.w3.org/2000/svg" width="1400" height="520" viewBox="0 0 1400 520" role="img" aria-labelledby="title desc">
|
|
2
2
|
<title id="title">BetterCall banner</title>
|
|
3
|
-
<desc id="desc">BetterCall is a one-line wrapper with
|
|
3
|
+
<desc id="desc">BetterCall is a one-line wrapper with six full BFCL remote runs completed; the best run improves from 73.4 percent to 83.8 percent.</desc>
|
|
4
4
|
<defs>
|
|
5
5
|
<linearGradient id="background" x1="0" y1="0" x2="1" y2="1">
|
|
6
6
|
<stop offset="0" stop-color="#07111f"/>
|
|
@@ -24,6 +24,6 @@
|
|
|
24
24
|
<g transform="translate(112 112)">
|
|
25
25
|
<text x="0" y="80" font-family="Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="78" font-weight="800" fill="#f8fbff">BetterCall</text>
|
|
26
26
|
<rect x="4" y="116" width="430" height="8" rx="4" fill="url(#accent)"/>
|
|
27
|
-
<text x="0" y="186" font-family="Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="34" font-weight="670" fill="#dcecff">
|
|
27
|
+
<text x="0" y="186" font-family="Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="34" font-weight="670" fill="#dcecff">Six full BFCL remote runs. Best: 73.4% → 83.8%.</text>
|
|
28
28
|
</g>
|
|
29
29
|
</svg>
|