toolrails 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ .venv/
4
+ dist/
5
+ build/
6
+ *.egg-info/
7
+ .pytest_cache/
8
+ .ruff_cache/
9
+ .DS_Store
@@ -0,0 +1,34 @@
1
+ # Changelog
2
+
3
+ All notable changes to toolrails are recorded here. The format follows
4
+ [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project aims to
5
+ follow [semantic versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [0.1.0] — unreleased
8
+
9
+ First release.
10
+
11
+ ### Added
12
+
13
+ - An OpenAI-compatible proxy over Ollama that guarantees well-formed tool calls:
14
+ a real tool name and arguments that match the tool's JSON schema.
15
+ - A repair ladder that never constrains the model's decision to call a tool
16
+ (which suppresses tool calls — the "constraint tax"): a valid call passes
17
+ through untouched; type errors (a stringified array, a quoted integer) are
18
+ fixed by coercing the model's own values with no second model call; and only
19
+ if coercion can't satisfy the schema are the arguments regenerated under a
20
+ grammar built from it (Ollama's `format`).
21
+ - `tool_choice` support, which Ollama's OpenAI endpoint ignores — `none` strips
22
+ the tools, `required` and a named function force a call.
23
+ - Hallucinated tool names snapped to the nearest offered tool; unknown names left
24
+ untouched.
25
+ - Fail-open behaviour: any error, or an upstream rejection, is passed through
26
+ unchanged rather than turned into a proxy error.
27
+ - Streaming support for tool-calling requests: the repaired response is
28
+ re-emitted as standard incremental deltas (each tool call carrying its
29
+ `index`), verified against the OpenAI SDK's streaming client.
30
+ - `demo/reliability.py`, a benchmark that measures valid-tool-call rate raw
31
+ versus through toolrails.
32
+ - Per-call logging, silenced with `--quiet`.
33
+
34
+ [0.1.0]: https://github.com/theadamdanielsson/toolrails/releases/tag/v0.1.0
@@ -0,0 +1,56 @@
1
+ # Contributing to toolrails
2
+
3
+ ## The most useful thing you can send
4
+
5
+ A tool call that came out wrong. toolrails lives or dies on the range of broken
6
+ output it can recognise and fix, and the only way to grow that is real examples.
7
+ Open an issue with three things:
8
+
9
+ - the model (e.g. `llama3.2:3b`),
10
+ - the tool schema you passed, and
11
+ - what the model produced — the raw `arguments` string, however mangled.
12
+
13
+ That is a test case. If it's a shape toolrails should have caught and didn't,
14
+ it's a bug; if it's one it already fixes, it becomes a regression test so it
15
+ stays fixed.
16
+
17
+ ## Running it locally
18
+
19
+ ```bash
20
+ git clone https://github.com/theadamdanielsson/toolrails
21
+ cd toolrails
22
+ uv venv && uv pip install -e ".[dev]"
23
+ uv run pytest
24
+ ```
25
+
26
+ The tests split in two. `tests/test_schemas.py` and `tests/test_pipeline.py` are
27
+ pure and deterministic — they mock the model, so they run in a fraction of a
28
+ second and need no Ollama. That's where a new broken-output case should land.
29
+
30
+ ## Measuring against your own models
31
+
32
+ `demo/reliability.py` runs the same tool-calling request many times against raw
33
+ Ollama and through toolrails, and reports how many calls came back valid. Start
34
+ the proxy, then point the benchmark at any tool-capable model you have:
35
+
36
+ ```bash
37
+ uvx toolrails --port 11500 &
38
+ python demo/reliability.py --model llama3.2:3b --trials 20
39
+ ```
40
+
41
+ If a model does better or worse than you expect, that number is worth an issue.
42
+
43
+ ## What fits
44
+
45
+ toolrails fixes the *shape* of tool calls — valid name, valid arguments, working
46
+ `tool_choice` — and stays a thin proxy over Ollama. Changes that keep it small
47
+ and make it catch more real breakage are welcome. Things that would turn it into
48
+ a router, a semantic cache, or a second model judging the first are out of scope
49
+ on purpose; that boundary is what keeps it something you can read before you
50
+ trust it in front of your agent.
51
+
52
+ ## Fixes should stay fail-open
53
+
54
+ The one rule that isn't negotiable: toolrails must never turn a working call into
55
+ an error. If a change can't fall back to passing the model's original answer
56
+ through when something goes wrong, it doesn't go in.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Adam Danielsson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,208 @@
1
+ Metadata-Version: 2.4
2
+ Name: toolrails
3
+ Version: 0.1.0
4
+ Summary: Valid tool calls from any local model. A drop-in OpenAI-compatible proxy for Ollama that guarantees well-formed tool calls and restores tool_choice.
5
+ Project-URL: Homepage, https://github.com/theadamdanielsson/toolrails
6
+ Project-URL: Issues, https://github.com/theadamdanielsson/toolrails/issues
7
+ Author-email: Adam Danielsson <the.adam.danielsson@gmail.com>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: agent,function-calling,llm,local-llm,ollama,openai,proxy,structured-output,tool-calling
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Topic :: Software Development :: Libraries
16
+ Requires-Python: >=3.10
17
+ Requires-Dist: httpx>=0.27
18
+ Requires-Dist: jsonschema>=4.0
19
+ Requires-Dist: starlette>=0.37
20
+ Requires-Dist: uvicorn>=0.30
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
23
+ Requires-Dist: pytest>=7; extra == 'dev'
24
+ Description-Content-Type: text/markdown
25
+
26
+ # toolrails
27
+
28
+ **Valid tool calls from any local model.**
29
+
30
+ Local models are good enough to code with now — until they try to call a tool.
31
+ A small model on Ollama will decide to call `read_file` and then hand your agent
32
+ the arguments as a *string* instead of an object, or an array field serialized
33
+ as `"[...]"`, or an integer wrapped in quotes, or invent a tool named
34
+ `readFile`. The agent can't use it, retries, gets the same broken call, and
35
+ burns your evening in a loop. (See
36
+ [ollama/ollama#15390](https://github.com/ollama/ollama/issues/15390): Claude Code
37
+ with a local model, stuck on *Invalid tool parameters*, unresolved.)
38
+
39
+ toolrails is a small proxy that sits between your agent and Ollama and makes that
40
+ stop. Your agent speaks the ordinary OpenAI API to it; toolrails guarantees the
41
+ tool calls that come back are well-formed — a real tool name, and arguments that
42
+ match the tool's JSON schema.
43
+
44
+ ```bash
45
+ # start it (nothing to install with uv)
46
+ uvx toolrails --ollama http://localhost:11434
47
+
48
+ # then point your agent's base URL at toolrails instead of Ollama:
49
+ # http://localhost:11500/v1
50
+ ```
51
+
52
+ That's the whole change. One base URL.
53
+
54
+ ## Point your agent at it
55
+
56
+ toolrails speaks the OpenAI API, so anything that lets you set a base URL works —
57
+ Cline, opencode, the OpenAI SDKs, your own scripts. Point the base URL at
58
+ `http://localhost:11500/v1` and keep using your Ollama model name. The API key is
59
+ ignored, so pass any placeholder.
60
+
61
+ ```python
62
+ from openai import OpenAI
63
+
64
+ client = OpenAI(base_url="http://localhost:11500/v1", api_key="ollama")
65
+ resp = client.chat.completions.create(
66
+ model="llama3.2:3b",
67
+ messages=[{"role": "user", "content": "weather in Oslo?"}],
68
+ tools=[...],
69
+ )
70
+ ```
71
+
72
+ ## The difference, measured
73
+
74
+ A benchmark ships in the repo (`demo/reliability.py`): the same tool-calling
75
+ request, twelve times, against raw Ollama and through toolrails, using a
76
+ realistically complex tool — typed fields and a nested array of objects, the way
77
+ a real coding agent's tools actually look.
78
+
79
+ | endpoint | model | valid tool calls |
80
+ | --- | --- | --- |
81
+ | raw Ollama | llama3.2:3b | **0 / 12** |
82
+ | via toolrails | llama3.2:3b | **12 / 12** |
83
+
84
+ The model isn't stupid — it gets the *values* right and the *types* wrong. Raw,
85
+ it hands your agent this (note the integer-as-string and the two stringified
86
+ arrays):
87
+
88
+ ```json
89
+ {"duration_minutes": "30",
90
+ "attendees": "[\"alice@example.com\", \"bob@example.com\"]",
91
+ "reminders": "[{\"method\": \"email\", \"minutes_before\": 10}]"}
92
+ ```
93
+
94
+ `attendees` is a string, not a list — your agent can't iterate it, so the call
95
+ fails and the retry loop begins. Through toolrails, the same request and the same
96
+ model:
97
+
98
+ ```json
99
+ {"duration_minutes": 30,
100
+ "attendees": ["alice@example.com", "bob@example.com"],
101
+ "reminders": [{"method": "email", "minutes_before": 10}]}
102
+ ```
103
+
104
+ Correct types, real nested arrays, every time. Simpler flat tools fail far less
105
+ often raw — the gap is widest exactly where real agent tools live: structured,
106
+ typed, nested.
107
+
108
+ ## What it guarantees
109
+
110
+ - **The tool name is real.** A hallucinated `getWeather` is snapped to the
111
+ `get_weather` you actually offered; a name that matches nothing is left alone
112
+ rather than guessed at.
113
+ - **The arguments parse and fit the schema.** When the model's arguments don't
114
+ validate, toolrails first fixes the *types* of its own values — the array it
115
+ sent as a string, the integer it quoted — and only if that still can't satisfy
116
+ the schema does it regenerate them under a grammar built from the tool's
117
+ schema. Either way, the call you receive validates.
118
+ - **`tool_choice` works again.** Ollama's OpenAI-compatible endpoint silently
119
+ ignores `tool_choice`. toolrails restores it: `"none"` strips the tools,
120
+ `"required"` (or a named function) forces a call even when the model tried to
121
+ answer in prose.
122
+
123
+ ## It never breaks your agent
124
+
125
+ toolrails fails open. If it can't reach Ollama's constrained endpoint, hits a
126
+ tool schema it can't make sense of, or throws anywhere in the repair path, it
127
+ forwards the model's original answer unchanged. The worst it can ever do is
128
+ nothing — it will not turn a working call into an error. And on the common case,
129
+ where the model already produced a valid call, it adds **zero** extra model
130
+ calls: the fast path recognises a good call and passes it straight through.
131
+
132
+ ## How it works
133
+
134
+ The naive fix — force every response through the tool's grammar — backfires.
135
+ Constraining the *decision* to call a tool is what makes models stop calling
136
+ tools at all; there's a measured "constraint tax" for exactly this
137
+ ([arXiv:2606.25605](https://arxiv.org/abs/2606.25605)). So toolrails never
138
+ touches the decision. It asks Ollama normally, lets the model choose whether and
139
+ which tool to call, and then repairs the result in the cheapest way that works:
140
+
141
+ 1. **If the call already validates, it passes straight through** — no extra work.
142
+ 2. **If only the types are wrong** — the array the model sent as a string, the
143
+ integer it quoted — toolrails coerces the model's *own* values to the schema.
144
+ This is the common case; it costs no second model call and never changes what
145
+ the model meant.
146
+ 3. **If coercion still can't satisfy the schema**, toolrails regenerates the
147
+ arguments with the tool's JSON schema in Ollama's `format` parameter. Ollama
148
+ compiles that schema to a grammar (XGrammar) and constrains decoding token by
149
+ token, so the arguments come back well-formed by construction.
150
+
151
+ Names are repaired by deterministic string matching, arguments checked with
152
+ `jsonschema`. There is no second model judging the first — just coercion, a
153
+ grammar, and a validator. And if every step somehow fails, the model's original
154
+ answer passes through untouched.
155
+
156
+ ## Install
157
+
158
+ You need [Ollama](https://ollama.com) running and Python 3.10 or newer.
159
+
160
+ ```bash
161
+ uvx toolrails # run without installing
162
+ pip install toolrails # or install the CLI
163
+ toolrails --ollama http://localhost:11434 --port 11500
164
+ ```
165
+
166
+ Options: `--ollama` (Ollama base URL, or `$OLLAMA_HOST`), `--host`, `--port`,
167
+ `--quiet` (stop logging a line per repaired call). It prints one line whenever it
168
+ steps in, so you can see it working:
169
+
170
+ ```
171
+ toolrails: call create_event repaired (arguments did not match schema)
172
+ toolrails: forced call get_weather (tool_choice names it)
173
+ ```
174
+
175
+ ## Scope
176
+
177
+ toolrails fixes the *shape* of tool calls: valid name, valid arguments, working
178
+ `tool_choice`. It does not make a weak model *choose* the right tool, invent a
179
+ call the model didn't attempt, or route between models. If the model decides not
180
+ to call a tool, that decision stands (unless you set `tool_choice: required`).
181
+ It is a proxy over Ollama specifically, because the leverage is Ollama's
182
+ grammar-constrained `format` — the same primitive the guarantee is built on. It
183
+ repairs models that *attempt* tool calls; a model Ollama rejects outright with
184
+ *"does not support tools"* (some chat templates have none) is out of scope for
185
+ v1 — forcing tool calls onto those is a bigger, separate job.
186
+
187
+ Streaming requests are supported: with tools, the response is repaired and then
188
+ re-emitted as standard incremental deltas (verified against the OpenAI SDK's
189
+ streaming client). The repair still buffers internally rather than streaming the
190
+ model token by token — that's a later refinement; v1 gets the call right first.
191
+
192
+ ## Contributing
193
+
194
+ The most useful thing you can send is a tool call that came out wrong: the model,
195
+ the tool schema you gave it, and what it produced. That is the test set. See
196
+ [CONTRIBUTING.md](CONTRIBUTING.md) for how to run the tests and the reliability
197
+ benchmark against your own models.
198
+
199
+ ## From the same author
200
+
201
+ toolrails is by the author of [overloop](https://github.com/theadamdanielsson/overloop)
202
+ (*stop your agent looping*) and [overllm](https://github.com/theadamdanielsson/overllm)
203
+ (*catch the LLM calls you didn't need*). Same theme, one layer down: those stop
204
+ wasted agent work; this stops the wasted work of a tool call that never parses.
205
+
206
+ ## License
207
+
208
+ MIT © Adam Danielsson
@@ -0,0 +1,183 @@
1
+ # toolrails
2
+
3
+ **Valid tool calls from any local model.**
4
+
5
+ Local models are good enough to code with now — until they try to call a tool.
6
+ A small model on Ollama will decide to call `read_file` and then hand your agent
7
+ the arguments as a *string* instead of an object, or an array field serialized
8
+ as `"[...]"`, or an integer wrapped in quotes, or invent a tool named
9
+ `readFile`. The agent can't use it, retries, gets the same broken call, and
10
+ burns your evening in a loop. (See
11
+ [ollama/ollama#15390](https://github.com/ollama/ollama/issues/15390): Claude Code
12
+ with a local model, stuck on *Invalid tool parameters*, unresolved.)
13
+
14
+ toolrails is a small proxy that sits between your agent and Ollama and makes that
15
+ stop. Your agent speaks the ordinary OpenAI API to it; toolrails guarantees the
16
+ tool calls that come back are well-formed — a real tool name, and arguments that
17
+ match the tool's JSON schema.
18
+
19
+ ```bash
20
+ # start it (nothing to install with uv)
21
+ uvx toolrails --ollama http://localhost:11434
22
+
23
+ # then point your agent's base URL at toolrails instead of Ollama:
24
+ # http://localhost:11500/v1
25
+ ```
26
+
27
+ That's the whole change. One base URL.
28
+
29
+ ## Point your agent at it
30
+
31
+ toolrails speaks the OpenAI API, so anything that lets you set a base URL works —
32
+ Cline, opencode, the OpenAI SDKs, your own scripts. Point the base URL at
33
+ `http://localhost:11500/v1` and keep using your Ollama model name. The API key is
34
+ ignored, so pass any placeholder.
35
+
36
+ ```python
37
+ from openai import OpenAI
38
+
39
+ client = OpenAI(base_url="http://localhost:11500/v1", api_key="ollama")
40
+ resp = client.chat.completions.create(
41
+ model="llama3.2:3b",
42
+ messages=[{"role": "user", "content": "weather in Oslo?"}],
43
+ tools=[...],
44
+ )
45
+ ```
46
+
47
+ ## The difference, measured
48
+
49
+ A benchmark ships in the repo (`demo/reliability.py`): the same tool-calling
50
+ request, twelve times, against raw Ollama and through toolrails, using a
51
+ realistically complex tool — typed fields and a nested array of objects, the way
52
+ a real coding agent's tools actually look.
53
+
54
+ | endpoint | model | valid tool calls |
55
+ | --- | --- | --- |
56
+ | raw Ollama | llama3.2:3b | **0 / 12** |
57
+ | via toolrails | llama3.2:3b | **12 / 12** |
58
+
59
+ The model isn't stupid — it gets the *values* right and the *types* wrong. Raw,
60
+ it hands your agent this (note the integer-as-string and the two stringified
61
+ arrays):
62
+
63
+ ```json
64
+ {"duration_minutes": "30",
65
+ "attendees": "[\"alice@example.com\", \"bob@example.com\"]",
66
+ "reminders": "[{\"method\": \"email\", \"minutes_before\": 10}]"}
67
+ ```
68
+
69
+ `attendees` is a string, not a list — your agent can't iterate it, so the call
70
+ fails and the retry loop begins. Through toolrails, the same request and the same
71
+ model:
72
+
73
+ ```json
74
+ {"duration_minutes": 30,
75
+ "attendees": ["alice@example.com", "bob@example.com"],
76
+ "reminders": [{"method": "email", "minutes_before": 10}]}
77
+ ```
78
+
79
+ Correct types, real nested arrays, every time. Simpler flat tools fail far less
80
+ often raw — the gap is widest exactly where real agent tools live: structured,
81
+ typed, nested.
82
+
83
+ ## What it guarantees
84
+
85
+ - **The tool name is real.** A hallucinated `getWeather` is snapped to the
86
+ `get_weather` you actually offered; a name that matches nothing is left alone
87
+ rather than guessed at.
88
+ - **The arguments parse and fit the schema.** When the model's arguments don't
89
+ validate, toolrails first fixes the *types* of its own values — the array it
90
+ sent as a string, the integer it quoted — and only if that still can't satisfy
91
+ the schema does it regenerate them under a grammar built from the tool's
92
+ schema. Either way, the call you receive validates.
93
+ - **`tool_choice` works again.** Ollama's OpenAI-compatible endpoint silently
94
+ ignores `tool_choice`. toolrails restores it: `"none"` strips the tools,
95
+ `"required"` (or a named function) forces a call even when the model tried to
96
+ answer in prose.
97
+
98
+ ## It never breaks your agent
99
+
100
+ toolrails fails open. If it can't reach Ollama's constrained endpoint, hits a
101
+ tool schema it can't make sense of, or throws anywhere in the repair path, it
102
+ forwards the model's original answer unchanged. The worst it can ever do is
103
+ nothing — it will not turn a working call into an error. And on the common case,
104
+ where the model already produced a valid call, it adds **zero** extra model
105
+ calls: the fast path recognises a good call and passes it straight through.
106
+
107
+ ## How it works
108
+
109
+ The naive fix — force every response through the tool's grammar — backfires.
110
+ Constraining the *decision* to call a tool is what makes models stop calling
111
+ tools at all; there's a measured "constraint tax" for exactly this
112
+ ([arXiv:2606.25605](https://arxiv.org/abs/2606.25605)). So toolrails never
113
+ touches the decision. It asks Ollama normally, lets the model choose whether and
114
+ which tool to call, and then repairs the result in the cheapest way that works:
115
+
116
+ 1. **If the call already validates, it passes straight through** — no extra work.
117
+ 2. **If only the types are wrong** — the array the model sent as a string, the
118
+ integer it quoted — toolrails coerces the model's *own* values to the schema.
119
+ This is the common case; it costs no second model call and never changes what
120
+ the model meant.
121
+ 3. **If coercion still can't satisfy the schema**, toolrails regenerates the
122
+ arguments with the tool's JSON schema in Ollama's `format` parameter. Ollama
123
+ compiles that schema to a grammar (XGrammar) and constrains decoding token by
124
+ token, so the arguments come back well-formed by construction.
125
+
126
+ Names are repaired by deterministic string matching, arguments checked with
127
+ `jsonschema`. There is no second model judging the first — just coercion, a
128
+ grammar, and a validator. And if every step somehow fails, the model's original
129
+ answer passes through untouched.
130
+
131
+ ## Install
132
+
133
+ You need [Ollama](https://ollama.com) running and Python 3.10 or newer.
134
+
135
+ ```bash
136
+ uvx toolrails # run without installing
137
+ pip install toolrails # or install the CLI
138
+ toolrails --ollama http://localhost:11434 --port 11500
139
+ ```
140
+
141
+ Options: `--ollama` (Ollama base URL, or `$OLLAMA_HOST`), `--host`, `--port`,
142
+ `--quiet` (stop logging a line per repaired call). It prints one line whenever it
143
+ steps in, so you can see it working:
144
+
145
+ ```
146
+ toolrails: call create_event repaired (arguments did not match schema)
147
+ toolrails: forced call get_weather (tool_choice names it)
148
+ ```
149
+
150
+ ## Scope
151
+
152
+ toolrails fixes the *shape* of tool calls: valid name, valid arguments, working
153
+ `tool_choice`. It does not make a weak model *choose* the right tool, invent a
154
+ call the model didn't attempt, or route between models. If the model decides not
155
+ to call a tool, that decision stands (unless you set `tool_choice: required`).
156
+ It is a proxy over Ollama specifically, because the leverage is Ollama's
157
+ grammar-constrained `format` — the same primitive the guarantee is built on. It
158
+ repairs models that *attempt* tool calls; a model Ollama rejects outright with
159
+ *"does not support tools"* (some chat templates have none) is out of scope for
160
+ v1 — forcing tool calls onto those is a bigger, separate job.
161
+
162
+ Streaming requests are supported: with tools, the response is repaired and then
163
+ re-emitted as standard incremental deltas (verified against the OpenAI SDK's
164
+ streaming client). The repair still buffers internally rather than streaming the
165
+ model token by token — that's a later refinement; v1 gets the call right first.
166
+
167
+ ## Contributing
168
+
169
+ The most useful thing you can send is a tool call that came out wrong: the model,
170
+ the tool schema you gave it, and what it produced. That is the test set. See
171
+ [CONTRIBUTING.md](CONTRIBUTING.md) for how to run the tests and the reliability
172
+ benchmark against your own models.
173
+
174
+ ## From the same author
175
+
176
+ toolrails is by the author of [overloop](https://github.com/theadamdanielsson/overloop)
177
+ (*stop your agent looping*) and [overllm](https://github.com/theadamdanielsson/overllm)
178
+ (*catch the LLM calls you didn't need*). Same theme, one layer down: those stop
179
+ wasted agent work; this stops the wasted work of a tool call that never parses.
180
+
181
+ ## License
182
+
183
+ MIT © Adam Danielsson
@@ -0,0 +1,31 @@
1
+ # The demo GIF is the pitch
2
+
3
+ toolrails lives or dies on one 10–15s before/after GIF at the top of the README:
4
+ a local model failing to call a tool, then the same model through toolrails
5
+ getting it right on the first try. Capture it two ways, best first.
6
+
7
+ ## 1. Real in-session (best, most credible)
8
+
9
+ 1. Pull a small, tool-flaky model: `ollama pull llama3.2:3b` (a good demo model —
10
+ it supports tools but mangles types on nested schemas). Note gemma3 won't
11
+ work here: Ollama reports it *does not support tools* at all, which is a
12
+ different problem toolrails v1 doesn't address.
13
+ 2. Point a coding agent (Cline, opencode, or Claude Code with a local base URL)
14
+ straight at Ollama. Drive it until it hits the *Invalid tool parameters*
15
+ loop — a task that needs a tool call usually does it within a turn or two.
16
+ Screen-record the loop.
17
+ 3. Start toolrails (`uvx toolrails`), point the same agent at
18
+ `http://localhost:11500/v1`, repeat the same task. Record the clean call.
19
+ 4. Cut the two side by side. That split-screen is the whole launch.
20
+
21
+ ## 2. Scripted terminal cast (fallback, reproducible)
22
+
23
+ `demo.tape` renders a scripted before/after to a GIF with
24
+ [vhs](https://github.com/charmbracelet/vhs), so it ships with zero live session:
25
+
26
+ ```bash
27
+ vhs demo/demo.tape # writes demo/toolrails.gif
28
+ ```
29
+
30
+ It sends one hand-built broken tool call and one repaired one against a running
31
+ Ollama, so the fix is visible without needing to reproduce the loop live.
@@ -0,0 +1,126 @@
1
+ """Measure tool-call reliability: raw Ollama vs. through toolrails.
2
+
3
+ Runs the same tool-requiring prompt N times against each endpoint and classifies
4
+ every response — did the model produce a call with a real name and arguments that
5
+ match the tool's JSON schema? Prints a rate table and sample failures.
6
+
7
+ python demo/reliability.py # defaults: gemma3:4b, 12 trials
8
+ python demo/reliability.py --model llama3.2:3b --trials 20
9
+
10
+ Requires Ollama on :11434 and toolrails on :11500 (uvx toolrails).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import json
17
+
18
+ import httpx
19
+ import jsonschema
20
+
21
+ # A deliberately stressful schema: required integers, enums, and a nested array
22
+ # of objects — the shape small models most often mangle.
23
+ TOOL = {
24
+ "type": "function",
25
+ "function": {
26
+ "name": "create_event",
27
+ "description": "Create a calendar event.",
28
+ "parameters": {
29
+ "type": "object",
30
+ "properties": {
31
+ "title": {"type": "string"},
32
+ "date": {"type": "string", "description": "ISO date, e.g. 2026-07-14"},
33
+ "duration_minutes": {"type": "integer"},
34
+ "priority": {"type": "string", "enum": ["low", "medium", "high"]},
35
+ "attendees": {"type": "array", "items": {"type": "string"}},
36
+ "reminders": {
37
+ "type": "array",
38
+ "items": {
39
+ "type": "object",
40
+ "properties": {
41
+ "method": {"type": "string", "enum": ["email", "popup"]},
42
+ "minutes_before": {"type": "integer"},
43
+ },
44
+ "required": ["method", "minutes_before"],
45
+ },
46
+ },
47
+ },
48
+ "required": ["title", "date", "duration_minutes", "priority"],
49
+ },
50
+ },
51
+ }
52
+
53
+ PROMPT = (
54
+ "Schedule a 30 minute high-priority meeting titled 'Q3 planning' on "
55
+ "2026-07-14 with alice@example.com and bob@example.com, and remind me by "
56
+ "email 10 minutes before. Use the create_event tool."
57
+ )
58
+
59
+ SCHEMA = TOOL["function"]["parameters"]
60
+
61
+
62
+ def classify(msg: dict) -> tuple[str, str]:
63
+ """Return (outcome, detail) for one response message."""
64
+ calls = msg.get("tool_calls")
65
+ if not calls:
66
+ return "no_tool_call", (msg.get("content") or "")[:70]
67
+ fn = calls[0].get("function", {})
68
+ if fn.get("name") != "create_event":
69
+ return "wrong_name", str(fn.get("name"))
70
+ raw = fn.get("arguments")
71
+ try:
72
+ args = raw if isinstance(raw, dict) else json.loads(raw)
73
+ except (json.JSONDecodeError, TypeError):
74
+ return "unparseable_args", str(raw)[:70]
75
+ try:
76
+ jsonschema.validate(args, SCHEMA)
77
+ except jsonschema.ValidationError as e:
78
+ return "schema_invalid", e.message[:70]
79
+ return "valid", ""
80
+
81
+
82
+ def run(url: str, model: str, trials: int) -> list[tuple[str, str]]:
83
+ out = []
84
+ for _ in range(trials):
85
+ body = {"model": model, "messages": [{"role": "user", "content": PROMPT}],
86
+ "tools": [TOOL], "stream": False}
87
+ try:
88
+ r = httpx.post(url, json=body, timeout=300)
89
+ r.raise_for_status()
90
+ out.append(classify(r.json()["choices"][0]["message"]))
91
+ except Exception as e: # noqa: BLE001 - a hard failure is still a failure
92
+ out.append(("request_error", str(e)[:70]))
93
+ return out
94
+
95
+
96
+ def report(label: str, results: list[tuple[str, str]]) -> None:
97
+ n = len(results)
98
+ valid = sum(1 for o, _ in results if o == "valid")
99
+ print(f"\n{label}: {valid}/{n} valid tool calls ({100*valid//n}%)")
100
+ buckets: dict[str, int] = {}
101
+ for o, _ in results:
102
+ buckets[o] = buckets.get(o, 0) + 1
103
+ for o, c in sorted(buckets.items(), key=lambda x: -x[1]):
104
+ if o != "valid":
105
+ print(f" {c:>2} × {o}")
106
+ for o, d in results:
107
+ if o != "valid":
108
+ print(f" e.g. {o}: {d}")
109
+ break
110
+
111
+
112
+ def main() -> None:
113
+ ap = argparse.ArgumentParser()
114
+ ap.add_argument("--model", default="gemma3:4b")
115
+ ap.add_argument("--trials", type=int, default=12)
116
+ ap.add_argument("--ollama", default="http://localhost:11434/v1/chat/completions")
117
+ ap.add_argument("--toolrails", default="http://localhost:11500/v1/chat/completions")
118
+ args = ap.parse_args()
119
+
120
+ print(f"model={args.model} trials={args.trials}")
121
+ report(f"raw Ollama ({args.model})", run(args.ollama, args.model, args.trials))
122
+ report(f"via toolrails ({args.model})", run(args.toolrails, args.model, args.trials))
123
+
124
+
125
+ if __name__ == "__main__":
126
+ main()