PyPI - toolrails - Versions diffs - 0.1.0__tar.gz - Mend

toolrails 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

toolrails-0.1.0/.gitignore +9 -0
toolrails-0.1.0/CHANGELOG.md +34 -0
toolrails-0.1.0/CONTRIBUTING.md +56 -0
toolrails-0.1.0/LICENSE +21 -0
toolrails-0.1.0/PKG-INFO +208 -0
toolrails-0.1.0/README.md +183 -0
toolrails-0.1.0/demo/README.md +31 -0
toolrails-0.1.0/demo/reliability.py +126 -0
toolrails-0.1.0/examples/agent_session.py +125 -0
toolrails-0.1.0/examples/quickstart.py +59 -0
toolrails-0.1.0/pyproject.toml +43 -0
toolrails-0.1.0/src/toolrails/__init__.py +9 -0
toolrails-0.1.0/src/toolrails/__main__.py +4 -0
toolrails-0.1.0/src/toolrails/app.py +155 -0
toolrails-0.1.0/src/toolrails/cli.py +56 -0
toolrails-0.1.0/src/toolrails/pipeline.py +206 -0
toolrails-0.1.0/src/toolrails/schemas.py +218 -0
toolrails-0.1.0/src/toolrails/upstream.py +91 -0
toolrails-0.1.0/tests/test_pipeline.py +173 -0
toolrails-0.1.0/tests/test_schemas.py +128 -0

toolrails-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,9 @@
+__pycache__/
+*.py[cod]
+.venv/
+dist/
+build/
+*.egg-info/
+.pytest_cache/
+.ruff_cache/
+.DS_Store

toolrails-0.1.0/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,34 @@
+# Changelog
+All notable changes to toolrails are recorded here. The format follows
+[Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project aims to
+follow [semantic versioning](https://semver.org/spec/v2.0.0.html).
+## [0.1.0] — unreleased
+First release.
+### Added
+- An OpenAI-compatible proxy over Ollama that guarantees well-formed tool calls:
+  a real tool name and arguments that match the tool's JSON schema.
+- A repair ladder that never constrains the model's decision to call a tool
+  (which suppresses tool calls — the "constraint tax"): a valid call passes
+  through untouched; type errors (a stringified array, a quoted integer) are
+  fixed by coercing the model's own values with no second model call; and only
+  if coercion can't satisfy the schema are the arguments regenerated under a
+  grammar built from it (Ollama's `format`).
+- `tool_choice` support, which Ollama's OpenAI endpoint ignores — `none` strips
+  the tools, `required` and a named function force a call.
+- Hallucinated tool names snapped to the nearest offered tool; unknown names left
+  untouched.
+- Fail-open behaviour: any error, or an upstream rejection, is passed through
+  unchanged rather than turned into a proxy error.
+- Streaming support for tool-calling requests: the repaired response is
+  re-emitted as standard incremental deltas (each tool call carrying its
+  `index`), verified against the OpenAI SDK's streaming client.
+- `demo/reliability.py`, a benchmark that measures valid-tool-call rate raw
+  versus through toolrails.
+- Per-call logging, silenced with `--quiet`.
+[0.1.0]: https://github.com/theadamdanielsson/toolrails/releases/tag/v0.1.0

toolrails-0.1.0/CONTRIBUTING.md ADDED Viewed

@@ -0,0 +1,56 @@
+# Contributing to toolrails
+## The most useful thing you can send
+A tool call that came out wrong. toolrails lives or dies on the range of broken
+output it can recognise and fix, and the only way to grow that is real examples.
+Open an issue with three things:
+- the model (e.g. `llama3.2:3b`),
+- the tool schema you passed, and
+- what the model produced — the raw `arguments` string, however mangled.
+That is a test case. If it's a shape toolrails should have caught and didn't,
+it's a bug; if it's one it already fixes, it becomes a regression test so it
+stays fixed.
+## Running it locally
+```bash
+git clone https://github.com/theadamdanielsson/toolrails
+cd toolrails
+uv venv && uv pip install -e ".[dev]"
+uv run pytest
+```
+The tests are split across two files, `tests/test_schemas.py` and
+`tests/test_pipeline.py`. Both are pure and deterministic — they mock the model,
+so they run in a fraction of a second and need no Ollama. That's where a new
+broken-output case should land.
+## Measuring against your own models
+`demo/reliability.py` runs the same tool-calling request many times against raw
+Ollama and through toolrails, and reports how many calls came back valid. Start
+the proxy, then point the benchmark at any tool-capable model you have:
+```bash
+uvx toolrails --port 11500 &
+python demo/reliability.py --model llama3.2:3b --trials 20
+```
+If a model does better or worse than you expect, that number is worth an issue.
+## What fits
+toolrails fixes the *shape* of tool calls — valid name, valid arguments, working
+`tool_choice` — and stays a thin proxy over Ollama. Changes that keep it small
+and make it catch more real breakage are welcome. Things that would turn it into
+a router, a semantic cache, or a second model judging the first are out of scope
+on purpose; that boundary is what keeps it something you can read before you
+trust it in front of your agent.
+## Fixes should stay fail-open
+The one rule that isn't negotiable: toolrails must never turn a working call into
+an error. If a change can't fall back to passing the model's original answer
+through when something goes wrong, it doesn't go in.

toolrails-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Adam Danielsson
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

toolrails-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,208 @@
+Metadata-Version: 2.4
+Name: toolrails
+Version: 0.1.0
+Summary: Valid tool calls from any local model. A drop-in OpenAI-compatible proxy for Ollama that guarantees well-formed tool calls and restores tool_choice.
+Project-URL: Homepage, https://github.com/theadamdanielsson/toolrails
+Project-URL: Issues, https://github.com/theadamdanielsson/toolrails/issues
+Author-email: Adam Danielsson <the.adam.danielsson@gmail.com>
+License-Expression: MIT
+License-File: LICENSE
+Keywords: agent,function-calling,llm,local-llm,ollama,openai,proxy,structured-output,tool-calling
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Topic :: Software Development :: Libraries
+Requires-Python: >=3.10
+Requires-Dist: httpx>=0.27
+Requires-Dist: jsonschema>=4.0
+Requires-Dist: starlette>=0.37
+Requires-Dist: uvicorn>=0.30
+Provides-Extra: dev
+Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
+Requires-Dist: pytest>=7; extra == 'dev'
+Description-Content-Type: text/markdown
+# toolrails
+**Valid tool calls from any local model.**
+Local models are good enough to code with now — until they try to call a tool.
+A small model on Ollama will decide to call `read_file` and then hand your agent
+the arguments as a *string* instead of an object, or an array field serialized
+as `"[...]"`, or an integer wrapped in quotes, or invent a tool named
+`readFile`. The agent can't use it, retries, gets the same broken call, and
+burns your evening in a loop. (See
+[ollama/ollama#15390](https://github.com/ollama/ollama/issues/15390): Claude Code
++ a local model, stuck on *Invalid tool parameters*, unresolved.)
+toolrails is a small proxy that sits between your agent and Ollama and makes that
+stop. Your agent speaks the ordinary OpenAI API to it; toolrails guarantees the
+tool calls that come back are well-formed — a real tool name, and arguments that
+match the tool's JSON schema.
+```bash
+# start it (nothing to install with uv)
+uvx toolrails --ollama http://localhost:11434
+# then point your agent's base URL at toolrails (it listens on port 11500
+# by default) instead of Ollama:
+#   http://localhost:11500/v1
+```
+That's the whole change. One base URL.
+## Point your agent at it
+toolrails speaks the OpenAI API, so anything that lets you set a base URL works —
+Cline, opencode, the OpenAI SDKs, your own scripts. Point the base URL at
+`http://localhost:11500/v1` and keep using your Ollama model name. The API key is
+ignored, so pass any placeholder.
+```python
+from openai import OpenAI
+client = OpenAI(base_url="http://localhost:11500/v1", api_key="ollama")
+resp = client.chat.completions.create(
+    model="llama3.2:3b",
+    messages=[{"role": "user", "content": "weather in Oslo?"}],
+    tools=[...],
+)
+```
+## The difference, measured
+A benchmark ships in the repo (`demo/reliability.py`): the same tool-calling
+request, twelve times, against raw Ollama and through toolrails, using a
+realistically complex tool — typed fields and a nested array of objects, the way
+a real coding agent's tools actually look.
+| endpoint | model | valid tool calls |
+| --- | --- | --- |
+| raw Ollama | llama3.2:3b | **0 / 12** |
+| via toolrails | llama3.2:3b | **12 / 12** |
+The model isn't stupid — it gets the *values* right and the *types* wrong. Raw,
+it hands your agent this (note the integer-as-string and the two stringified
+arrays):
+```json
+{"duration_minutes": "30",
+ "attendees": "[\"alice@example.com\", \"bob@example.com\"]",
+ "reminders": "[{\"method\": \"email\", \"minutes_before\": 10}]"}
+```
+`attendees` is a string, not a list — your agent can't iterate it, so the call
+fails and the retry loop begins. Through toolrails, the same request and the same
+model:
+```json
+{"duration_minutes": 30,
+ "attendees": ["alice@example.com", "bob@example.com"],
+ "reminders": [{"method": "email", "minutes_before": 10}]}
+```
+Correct types, real nested arrays, every time. Simpler flat tools fail far less
+often raw — the gap is widest exactly where real agent tools live: structured,
+typed, nested.
+## What it guarantees
+- **The tool name is real.** A hallucinated `getWeather` is snapped to the
+  `get_weather` you actually offered; a name that matches nothing is left alone
+  rather than guessed at.
+- **The arguments parse and fit the schema.** When the model's arguments don't
+  validate, toolrails first fixes the *types* of its own values — the array it
+  sent as a string, the integer it quoted — and only if that still can't satisfy
+  the schema does it regenerate them under a grammar built from the tool's
+  schema. Either way, the call you receive validates.
+- **`tool_choice` works again.** Ollama's OpenAI-compatible endpoint silently
+  ignores `tool_choice`. toolrails restores it: `"none"` strips the tools,
+  `"required"` (or a named function) forces a call even when the model tried to
+  answer in prose.
+## It never breaks your agent
+toolrails fails open. If it can't reach Ollama's constrained endpoint, hits a
+tool schema it can't make sense of, or throws anywhere in the repair path, it
+forwards the model's original answer unchanged. The worst it can ever do is
+nothing — it will not turn a working call into an error. And on the common case,
+where the model already produced a valid call, it adds **zero** extra model
+calls: the fast path recognises a good call and passes it straight through.
+## How it works
+The naive fix — force every response through the tool's grammar — backfires.
+Constraining the *decision* to call a tool is what makes models stop calling
+tools at all; there's a measured "constraint tax" for exactly this
+([arXiv:2606.25605](https://arxiv.org/abs/2606.25605)). So toolrails never
+touches the decision. It asks Ollama normally, lets the model choose whether and
+which tool to call, and then repairs the result in the cheapest way that works:
+1. **If the call already validates, it passes straight through** — no extra work.
+2. **If only the types are wrong** — the array the model sent as a string, the
+   integer it quoted — toolrails coerces the model's *own* values to the schema.
+   This is the common case; it costs no second model call and never changes what
+   the model meant.
+3. **If coercion still can't satisfy the schema**, toolrails regenerates the
+   arguments with the tool's JSON schema in Ollama's `format` parameter. Ollama
+   compiles that schema to a grammar (XGrammar) and constrains decoding token by
+   token, so the arguments come back well-formed by construction.
+Names are repaired by deterministic string matching, arguments checked with
+`jsonschema`. There is no second model judging the first — just coercion, a
+grammar, and a validator. And if every step somehow fails, the model's original
+answer passes through untouched.
+## Install
+You need [Ollama](https://ollama.com) running and Python 3.10 or newer.
+```bash
+uvx toolrails                 # run without installing
+pip install toolrails         # or install the CLI
+toolrails --ollama http://localhost:11434 --port 11500
+```
+Options: `--ollama` (Ollama base URL, or `$OLLAMA_HOST`), `--host`, `--port`,
+`--quiet` (stop logging a line per repaired call). It prints one line whenever it
+steps in, so you can see it working:
+```
+toolrails: call create_event repaired (arguments did not match schema)
+toolrails: forced call get_weather (tool_choice names it)
+```
+## Scope
+toolrails fixes the *shape* of tool calls: valid name, valid arguments, working
+`tool_choice`. It does not make a weak model *choose* the right tool, invent a
+call the model didn't attempt, or route between models. If the model decides not
+to call a tool, that decision stands (unless you set `tool_choice: required`).
+It is a proxy over Ollama specifically, because the leverage is Ollama's
+grammar-constrained `format` — the same primitive the guarantee is built on. It
+repairs models that *attempt* tool calls; a model Ollama rejects outright with
+*"does not support tools"* (some chat templates have none) is out of scope for
+v1 — forcing tool calls onto those is a bigger, separate job.
+Streaming requests are supported: with tools, the response is repaired and then
+re-emitted as standard incremental deltas (verified against the OpenAI SDK's
+streaming client). The repair still buffers internally rather than streaming the
+model token by token — that's a later refinement; v1 gets the call right first.
+## Contributing
+The most useful thing you can send is a tool call that came out wrong: the model,
+the tool schema you gave it, and what it produced. That is the test set. See
+[CONTRIBUTING.md](CONTRIBUTING.md) for how to run the tests and the reliability
+benchmark against your own models.
+## From the same author
+toolrails is by the author of [overloop](https://github.com/theadamdanielsson/overloop)
+(*stop your agent looping*) and [overllm](https://github.com/theadamdanielsson/overllm)
+(*catch the LLM calls you didn't need*). Same theme, one layer down: those stop
+wasted agent work; this stops the wasted work of a tool call that never parses.
+## License
+MIT © Adam Danielsson

toolrails-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,183 @@
+# toolrails
+**Valid tool calls from any local model.**
+Local models are good enough to code with now — until they try to call a tool.
+A small model on Ollama will decide to call `read_file` and then hand your agent
+the arguments as a *string* instead of an object, or an array field serialized
+as `"[...]"`, or an integer wrapped in quotes, or invent a tool named
+`readFile`. The agent can't use it, retries, gets the same broken call, and
+burns your evening in a loop. (See
+[ollama/ollama#15390](https://github.com/ollama/ollama/issues/15390): Claude Code
++ a local model, stuck on *Invalid tool parameters*, unresolved.)
+toolrails is a small proxy that sits between your agent and Ollama and makes that
+stop. Your agent speaks the ordinary OpenAI API to it; toolrails guarantees the
+tool calls that come back are well-formed — a real tool name, and arguments that
+match the tool's JSON schema.
+```bash
+# start it (nothing to install with uv)
+uvx toolrails --ollama http://localhost:11434
+# then point your agent's base URL at toolrails (it listens on port 11500
+# by default) instead of Ollama:
+#   http://localhost:11500/v1
+```
+That's the whole change. One base URL.
+## Point your agent at it
+toolrails speaks the OpenAI API, so anything that lets you set a base URL works —
+Cline, opencode, the OpenAI SDKs, your own scripts. Point the base URL at
+`http://localhost:11500/v1` and keep using your Ollama model name. The API key is
+ignored, so pass any placeholder.
+```python
+from openai import OpenAI
+client = OpenAI(base_url="http://localhost:11500/v1", api_key="ollama")
+resp = client.chat.completions.create(
+    model="llama3.2:3b",
+    messages=[{"role": "user", "content": "weather in Oslo?"}],
+    tools=[...],
+)
+```
+## The difference, measured
+A benchmark ships in the repo (`demo/reliability.py`): the same tool-calling
+request, twelve times, against raw Ollama and through toolrails, using a
+realistically complex tool — typed fields and a nested array of objects, the way
+a real coding agent's tools actually look.
+| endpoint | model | valid tool calls |
+| --- | --- | --- |
+| raw Ollama | llama3.2:3b | **0 / 12** |
+| via toolrails | llama3.2:3b | **12 / 12** |
+The model isn't stupid — it gets the *values* right and the *types* wrong. Raw,
+it hands your agent this (note the integer-as-string and the two stringified
+arrays):
+```json
+{"duration_minutes": "30",
+ "attendees": "[\"alice@example.com\", \"bob@example.com\"]",
+ "reminders": "[{\"method\": \"email\", \"minutes_before\": 10}]"}
+```
+`attendees` is a string, not a list — your agent can't iterate it, so the call
+fails and the retry loop begins. Through toolrails, the same request and the same
+model:
+```json
+{"duration_minutes": 30,
+ "attendees": ["alice@example.com", "bob@example.com"],
+ "reminders": [{"method": "email", "minutes_before": 10}]}
+```
+Correct types, real nested arrays, every time. Simpler flat tools fail far less
+often raw — the gap is widest exactly where real agent tools live: structured,
+typed, nested.
+## What it guarantees
+- **The tool name is real.** A hallucinated `getWeather` is snapped to the
+  `get_weather` you actually offered; a name that matches nothing is left alone
+  rather than guessed at.
+- **The arguments parse and fit the schema.** When the model's arguments don't
+  validate, toolrails first fixes the *types* of its own values — the array it
+  sent as a string, the integer it quoted — and only if that still can't satisfy
+  the schema does it regenerate them under a grammar built from the tool's
+  schema. Either way, the call you receive validates.
+- **`tool_choice` works again.** Ollama's OpenAI-compatible endpoint silently
+  ignores `tool_choice`. toolrails restores it: `"none"` strips the tools,
+  `"required"` (or a named function) forces a call even when the model tried to
+  answer in prose.
+## It never breaks your agent
+toolrails fails open. If it can't reach Ollama's constrained endpoint, hits a
+tool schema it can't make sense of, or throws anywhere in the repair path, it
+forwards the model's original answer unchanged. The worst it can ever do is
+nothing — it will not turn a working call into an error. And on the common case,
+where the model already produced a valid call, it adds **zero** extra model
+calls: the fast path recognises a good call and passes it straight through.
+## How it works
+The naive fix — force every response through the tool's grammar — backfires.
+Constraining the *decision* to call a tool is what makes models stop calling
+tools at all; there's a measured "constraint tax" for exactly this
+([arXiv:2606.25605](https://arxiv.org/abs/2606.25605)). So toolrails never
+touches the decision. It asks Ollama normally, lets the model choose whether and
+which tool to call, and then repairs the result in the cheapest way that works:
+1. **If the call already validates, it passes straight through** — no extra work.
+2. **If only the types are wrong** — the array the model sent as a string, the
+   integer it quoted — toolrails coerces the model's *own* values to the schema.
+   This is the common case; it costs no second model call and never changes what
+   the model meant.
+3. **If coercion still can't satisfy the schema**, toolrails regenerates the
+   arguments with the tool's JSON schema in Ollama's `format` parameter. Ollama
+   compiles that schema to a grammar (XGrammar) and constrains decoding token by
+   token, so the arguments come back well-formed by construction.
+Names are repaired by deterministic string matching, arguments checked with
+`jsonschema`. There is no second model judging the first — just coercion, a
+grammar, and a validator. And if every step somehow fails, the model's original
+answer passes through untouched.
+## Install
+You need [Ollama](https://ollama.com) running and Python 3.10 or newer.
+```bash
+uvx toolrails                 # run without installing
+pip install toolrails         # or install the CLI
+toolrails --ollama http://localhost:11434 --port 11500
+```
+Options: `--ollama` (Ollama base URL, or `$OLLAMA_HOST`), `--host`, `--port`,
+`--quiet` (stop logging a line per repaired call). It prints one line whenever it
+steps in, so you can see it working:
+```
+toolrails: call create_event repaired (arguments did not match schema)
+toolrails: forced call get_weather (tool_choice names it)
+```
+## Scope
+toolrails fixes the *shape* of tool calls: valid name, valid arguments, working
+`tool_choice`. It does not make a weak model *choose* the right tool, invent a
+call the model didn't attempt, or route between models. If the model decides not
+to call a tool, that decision stands (unless you set `tool_choice: required`).
+It is a proxy over Ollama specifically, because the leverage is Ollama's
+grammar-constrained `format` — the same primitive the guarantee is built on. It
+repairs models that *attempt* tool calls; a model Ollama rejects outright with
+*"does not support tools"* (some chat templates have none) is out of scope for
+v1 — forcing tool calls onto those is a bigger, separate job.
+Streaming requests are supported: with tools, the response is repaired and then
+re-emitted as standard incremental deltas (verified against the OpenAI SDK's
+streaming client). The repair still buffers internally rather than streaming the
+model token by token — that's a later refinement; v1 gets the call right first.
+## Contributing
+The most useful thing you can send is a tool call that came out wrong: the model,
+the tool schema you gave it, and what it produced. That is the test set. See
+[CONTRIBUTING.md](CONTRIBUTING.md) for how to run the tests and the reliability
+benchmark against your own models.
+## From the same author
+toolrails is by the author of [overloop](https://github.com/theadamdanielsson/overloop)
+(*stop your agent looping*) and [overllm](https://github.com/theadamdanielsson/overllm)
+(*catch the LLM calls you didn't need*). Same theme, one layer down: those stop
+wasted agent work; this stops the wasted work of a tool call that never parses.
+## License
+MIT © Adam Danielsson

toolrails-0.1.0/demo/README.md ADDED Viewed

@@ -0,0 +1,31 @@
+# The demo GIF is the pitch
+toolrails lives or dies on one 10–15s before/after GIF at the top of the README:
+a local model failing to call a tool, then the same model through toolrails
+getting it right on the first try. Capture it two ways, best first.
+## 1. Real in-session (best, most credible)
+1. Pull a small, tool-flaky model: `ollama pull llama3.2:3b` (a good demo model —
+   it supports tools but mangles types on nested schemas). Note gemma3 won't
+   work here: Ollama reports it *does not support tools* at all, which is a
+   different problem toolrails v1 doesn't address.
+2. Point a coding agent (Cline, opencode, or Claude Code with a local base URL)
+   straight at Ollama. Drive it until it hits the *Invalid tool parameters*
+   loop — a task that needs a tool call usually does it within a turn or two.
+   Screen-record the loop.
+3. Start toolrails (`uvx toolrails`), point the same agent at
+   `http://localhost:11500/v1`, repeat the same task. Record the clean call.
+4. Cut the two side by side. That split-screen is the whole launch.
+## 2. Scripted terminal cast (fallback, reproducible)
+`demo.tape` renders a scripted before/after to a GIF with
+[vhs](https://github.com/charmbracelet/vhs), so it ships with zero live session:
+```bash
+vhs demo/demo.tape        # writes demo/toolrails.gif
+```
+It sends one hand-built broken tool call and one repaired one against a running
+Ollama, so the fix is visible without needing to reproduce the loop live.

toolrails-0.1.0/demo/reliability.py ADDED Viewed

@@ -0,0 +1,126 @@
+"""Measure tool-call reliability: raw Ollama vs. through toolrails.
+Runs the same tool-requiring prompt N times against each endpoint and classifies
+every response — did the model produce a call with a real name and arguments that
+match the tool's JSON schema? Prints a rate table and sample failures.
+    python demo/reliability.py                 # defaults: gemma3:4b, 12 trials
+    python demo/reliability.py --model llama3.2:3b --trials 20
+Requires Ollama on :11434 and toolrails on :11500 (uvx toolrails).
+"""
+from __future__ import annotations
+import argparse
+import json
+import httpx
+import jsonschema
+# A deliberately stressful schema: required integers, enums, and a nested array
+# of objects — the shape small models most often mangle.
+# TOOL is the single function-tool definition sent in the `tools` list of every
+# benchmark request (see run()).
+TOOL = {
+    "type": "function",
+    "function": {
+        "name": "create_event",
+        "description": "Create a calendar event.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "title": {"type": "string"},
+                "date": {"type": "string", "description": "ISO date, e.g. 2026-07-14"},
+                "duration_minutes": {"type": "integer"},
+                "priority": {"type": "string", "enum": ["low", "medium", "high"]},
+                "attendees": {"type": "array", "items": {"type": "string"}},
+                "reminders": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "method": {"type": "string", "enum": ["email", "popup"]},
+                            "minutes_before": {"type": "integer"},
+                        },
+                        "required": ["method", "minutes_before"],
+                    },
+                },
+            },
+            # attendees and reminders are optional; only these four must be present.
+            "required": ["title", "date", "duration_minutes", "priority"],
+        },
+    },
+}
+# The one user message sent on every trial. It supplies a value for each schema
+# field (including the optional attendees and reminders) and names the tool.
+PROMPT = (
+    "Schedule a 30 minute high-priority meeting titled 'Q3 planning' on "
+    "2026-07-14 with alice@example.com and bob@example.com, and remind me by "
+    "email 10 minutes before. Use the create_event tool."
+)
+# The tool's parameter schema; classify() validates call arguments against it.
+SCHEMA = TOOL["function"]["parameters"]
+def classify(msg: dict) -> tuple[str, str]:
+    """Return (outcome, detail) for one response message."""
+    calls = msg.get("tool_calls")
+    if not calls:
+        return "no_tool_call", (msg.get("content") or "")[:70]
+    fn = calls[0].get("function", {})
+    if fn.get("name") != "create_event":
+        return "wrong_name", str(fn.get("name"))
+    raw = fn.get("arguments")
+    try:
+        args = raw if isinstance(raw, dict) else json.loads(raw)
+    except (json.JSONDecodeError, TypeError):
+        return "unparseable_args", str(raw)[:70]
+    try:
+        jsonschema.validate(args, SCHEMA)
+    except jsonschema.ValidationError as e:
+        return "schema_invalid", e.message[:70]
+    return "valid", ""
+def run(url: str, model: str, trials: int) -> list[tuple[str, str]]:
+    out = []
+    for _ in range(trials):
+        body = {"model": model, "messages": [{"role": "user", "content": PROMPT}],
+                "tools": [TOOL], "stream": False}
+        try:
+            r = httpx.post(url, json=body, timeout=300)
+            r.raise_for_status()
+            out.append(classify(r.json()["choices"][0]["message"]))
+        except Exception as e:  # noqa: BLE001 - a hard failure is still a failure
+            out.append(("request_error", str(e)[:70]))
+    return out
+def report(label: str, results: list[tuple[str, str]]) -> None:
+    n = len(results)
+    valid = sum(1 for o, _ in results if o == "valid")
+    print(f"\n{label}: {valid}/{n} valid tool calls ({100*valid//n}%)")
+    buckets: dict[str, int] = {}
+    for o, _ in results:
+        buckets[o] = buckets.get(o, 0) + 1
+    for o, c in sorted(buckets.items(), key=lambda x: -x[1]):
+        if o != "valid":
+            print(f"    {c:>2} × {o}")
+    for o, d in results:
+        if o != "valid":
+            print(f"       e.g. {o}: {d}")
+            break
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", default="gemma3:4b")
+    ap.add_argument("--trials", type=int, default=12)
+    ap.add_argument("--ollama", default="http://localhost:11434/v1/chat/completions")
+    ap.add_argument("--toolrails", default="http://localhost:11500/v1/chat/completions")
+    args = ap.parse_args()
+    print(f"model={args.model}  trials={args.trials}")
+    report(f"raw Ollama   ({args.model})", run(args.ollama, args.model, args.trials))
+    report(f"via toolrails ({args.model})", run(args.toolrails, args.model, args.trials))
+if __name__ == "__main__":
+    main()