cruxial 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cruxial-0.1.0/.gitignore +40 -0
- cruxial-0.1.0/BENCHMARKS.md +436 -0
- cruxial-0.1.0/DEFENSIVE.md +304 -0
- cruxial-0.1.0/LICENSE +21 -0
- cruxial-0.1.0/PKG-INFO +267 -0
- cruxial-0.1.0/README.md +231 -0
- cruxial-0.1.0/cruxial/__init__.py +58 -0
- cruxial-0.1.0/cruxial/_demo_handcrafted.py +1240 -0
- cruxial-0.1.0/cruxial/adapters/__init__.py +9 -0
- cruxial-0.1.0/cruxial/adapters/anthropic.py +123 -0
- cruxial-0.1.0/cruxial/adapters/mcp.py +260 -0
- cruxial-0.1.0/cruxial/adapters/openai.py +313 -0
- cruxial-0.1.0/cruxial/classifier.py +173 -0
- cruxial-0.1.0/cruxial/cli.py +257 -0
- cruxial-0.1.0/cruxial/core.py +544 -0
- cruxial-0.1.0/cruxial/demo/__init__.py +38 -0
- cruxial-0.1.0/cruxial/demo/mcp_schemas.py +30652 -0
- cruxial-0.1.0/cruxial/errors.py +47 -0
- cruxial-0.1.0/cruxial/lint.py +364 -0
- cruxial-0.1.0/cruxial/repair.py +195 -0
- cruxial-0.1.0/cruxial/telemetry.py +328 -0
- cruxial-0.1.0/cruxial/testing.py +352 -0
- cruxial-0.1.0/cruxial/types.py +116 -0
- cruxial-0.1.0/cruxial/validator.py +178 -0
- cruxial-0.1.0/examples/anthropic_demo.py +117 -0
- cruxial-0.1.0/examples/audit_mcp_schemas.py +161 -0
- cruxial-0.1.0/examples/azure_demo_suite.py +323 -0
- cruxial-0.1.0/examples/azure_mcp_suite.py +777 -0
- cruxial-0.1.0/examples/azure_openai_demo.py +156 -0
- cruxial-0.1.0/examples/azure_stress_demo.py +226 -0
- cruxial-0.1.0/examples/azure_stress_hard.py +315 -0
- cruxial-0.1.0/examples/mine_mcp_schemas.py +857 -0
- cruxial-0.1.0/examples/mock_demo.py +143 -0
- cruxial-0.1.0/examples/openai_demo.py +138 -0
- cruxial-0.1.0/pyproject.toml +64 -0
- cruxial-0.1.0/tests/__init__.py +0 -0
- cruxial-0.1.0/tests/conftest.py +67 -0
- cruxial-0.1.0/tests/test_adapters.py +379 -0
- cruxial-0.1.0/tests/test_classifier.py +94 -0
- cruxial-0.1.0/tests/test_cli_stats.py +89 -0
- cruxial-0.1.0/tests/test_core.py +320 -0
- cruxial-0.1.0/tests/test_db_path_resolution.py +156 -0
- cruxial-0.1.0/tests/test_demo.py +123 -0
- cruxial-0.1.0/tests/test_mcp_adapter.py +98 -0
- cruxial-0.1.0/tests/test_multi_violation.py +167 -0
- cruxial-0.1.0/tests/test_observability.py +342 -0
- cruxial-0.1.0/tests/test_robustness_adversarial.py +312 -0
- cruxial-0.1.0/tests/test_robustness_concurrency.py +366 -0
- cruxial-0.1.0/tests/test_robustness_failure_modes.py +352 -0
- cruxial-0.1.0/tests/test_robustness_malformed_input.py +325 -0
- cruxial-0.1.0/tests/test_robustness_provider_specs.py +369 -0
- cruxial-0.1.0/tests/test_robustness_resource_limits.py +310 -0
- cruxial-0.1.0/tests/test_robustness_schema_quirks.py +386 -0
- cruxial-0.1.0/tests/test_schema_origin.py +80 -0
- cruxial-0.1.0/tests/test_telemetry.py +131 -0
- cruxial-0.1.0/tests/test_testing_helper.py +94 -0
cruxial-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*$py.class
|
|
4
|
+
*.so
|
|
5
|
+
.Python
|
|
6
|
+
build/
|
|
7
|
+
develop-eggs/
|
|
8
|
+
dist/
|
|
9
|
+
eggs/
|
|
10
|
+
.eggs/
|
|
11
|
+
*.egg-info/
|
|
12
|
+
*.egg
|
|
13
|
+
.pytest_cache/
|
|
14
|
+
.coverage
|
|
15
|
+
htmlcov/
|
|
16
|
+
.tox/
|
|
17
|
+
.cache
|
|
18
|
+
.mypy_cache/
|
|
19
|
+
.ruff_cache/
|
|
20
|
+
.venv/
|
|
21
|
+
venv/
|
|
22
|
+
env/
|
|
23
|
+
.env
|
|
24
|
+
.env.local
|
|
25
|
+
*.sqlite
|
|
26
|
+
*.sqlite3
|
|
27
|
+
*.db
|
|
28
|
+
.DS_Store
|
|
29
|
+
.idea/
|
|
30
|
+
.vscode/
|
|
31
|
+
*.swp
|
|
32
|
+
|
|
33
|
+
# Internal-only artifacts — must never ship in the public repo.
|
|
34
|
+
# Launch strategy, draft posts, cold-DM templates, anything labeled PRIVATE.
|
|
35
|
+
LAUNCH.md
|
|
36
|
+
GTM.md
|
|
37
|
+
*.PRIVATE.md
|
|
38
|
+
*-PRIVATE.md
|
|
39
|
+
internal/
|
|
40
|
+
.cruxial/
|
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
# Cruxial Benchmarks
|
|
2
|
+
|
|
3
|
+
Every number in this document is reproducible from the public repo. Each run lists its model, sample, command, and date. No unsourced claims — if it isn't here, don't cite it.
|
|
4
|
+
|
|
5
|
+
Last updated: 2026-05-30.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## TL;DR
|
|
10
|
+
|
|
11
|
+
> **0 silent passes across 342 live LLM tool calls** *(95% CI: 0–1.1%)*. **Two independent runs**, 51 real public MCP servers, Azure gpt-4o.
|
|
12
|
+
>
|
|
13
|
+
> Aggregate intercept rate: **5.85%** *(95% CI: 3.8–8.9%)*. Aggregate auto-repair rate: **90.0%** *(95% CI: 69.9–97.2%)*.
|
|
14
|
+
>
|
|
15
|
+
> Plus 877 production schemas validated synthetically — 100% rejection rate, 98.3% exact-category accuracy, no false negatives.
|
|
16
|
+
>
|
|
17
|
+
> <1ms p99 overhead per call. 243 tests pass. MIT.
|
|
18
|
+
|
|
19
|
+
A "silent pass" is the only failure mode a validation layer truly owns. We do not have one.
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Headline numbers
|
|
24
|
+
|
|
25
|
+
| Test | Model | Sample | Intercept | Auto-repair | Silent passes | p99 overhead |
|
|
26
|
+
|---|---|---|---|---|---|---|
|
|
27
|
+
| 🚀 **Live MCP, pooled (2 runs)** ← cite this | Azure gpt-4o | **342 calls** · 352 prompts · 51 servers · 25 domains · 603 tools | **5.85%** (20/342) · 95% CI 3.8–8.9% | **90.0%** (18/20) · 95% CI 69.9–97.2% | **0/342** · 95% CI 0–1.1% | <1ms |
|
|
28
|
+
| Live MCP, run #2 (post-QA) | Azure gpt-4o | 171 calls · same corpus | 4.7% (8/171) · 95% CI 2.4–9.0% | 87.5% (7/8) | 0 | <1ms |
|
|
29
|
+
| Live MCP, run #1 (baseline) | Azure gpt-4o | 171 calls · same corpus | 7.0% (12/171) · 95% CI 4.1–11.9% | 91.7% (11/12) | 0 | <1ms |
|
|
30
|
+
| **Synthetic robustness** | none (classifier only) | 877 schemas · 52 servers · 26 domains · 1947 violations | **100% rejection** · 98.3% exact-category | n/a | **0** | <1ms |
|
|
31
|
+
| **Constraint-heavy schemas** | Azure gpt-4o | 70 calls · 15 production-class tools | **17.1%** (12/70) ±8.8% CI | 66.7% (8/12) | **0** | <1ms |
|
|
32
|
+
| Same schemas, mini-tier model | Azure gpt-5-mini-2 | 74 calls · same tools | **1.4%** (1/74) ±2.7% CI | 100% (1/1) | **0** | <1ms |
|
|
33
|
+
| **Pre-flight lint** (derived from live runs) | none | 9 server-side OpenAI rejections | **100% caught pre-flight** | n/a | **0** | <1ms |
|
|
34
|
+
| Control: simple-schema MCP | Azure gpt-4o | 25 calls · 7 simple servers | 0.0% | n/a | **0** | <1ms |
|
|
35
|
+
|
|
36
|
+
Ranges written as `a–b%` are Wilson 95% intervals; figures written as `±x%` are normal-approximation (Wald) 95% half-widths.
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## What these runs prove
|
|
41
|
+
|
|
42
|
+
### 1. Cruxial never silently passes a bad call.
|
|
43
|
+
|
|
44
|
+
342 live calls across two independent runs (95% upper bound on silent-pass rate: 1.1%). 1947 synthetic violations across 877 production schemas. Zero made it past the validator. This is the only number that matters for a validation layer — every other metric is texture.
|
|
45
|
+
|
|
46
|
+
The intercept rate varies between runs (4.7% in run #2, 7.0% in run #1) because gpt-4o is non-deterministic — but the *catch behavior* doesn't vary. When the model emits a violation, Cruxial catches it. That's the deterministic part of the system, and it's what the silent-pass count measures.
|
|
47
|
+
|
|
48
|
+
### 2. For a fixed model, intercept rate scales with schema complexity.
|
|
49
|
+
|
|
50
|
+
Same gpt-4o, same day:
|
|
51
|
+
|
|
52
|
+
| Schema surface | Intercept rate |
|
|
53
|
+
|---|---|
|
|
54
|
+
| Simple MCP (filesystem, time, fetch, memory) | **0.0%** |
|
|
55
|
+
| Real public MCP (51 servers, mixed maturity) | **4.7% – 7.0%** |
|
|
56
|
+
| Constraint-heavy production-class (15 hand-crafted tools with enums, formats, regex tags, datetime ranges, nested objects) | **17.1%** |
|
|
57
|
+
|
|
58
|
+
Frontier models nail trivial schemas. The catches live in the gap between "what the model has seen in training" and "what the constraint surface actually requires." That gap widens as APIs ship faster than training cutoffs.
|
|
59
|
+
|
|
60
|
+
### 3. Model tier matters more than people think.
|
|
61
|
+
|
|
62
|
+
On the same 70-prompt constraint-heavy benchmark:
|
|
63
|
+
|
|
64
|
+
| Model | Intercept rate | Relative |
|
|
65
|
+
|---|---|---|
|
|
66
|
+
| Azure gpt-4o | 17.1% | baseline |
|
|
67
|
+
| Azure gpt-5-mini-2 | **1.4%** | **92% fewer violations** |
|
|
68
|
+
|
|
69
|
+
The mini tier of the new generation beats the flagship of the old. If you've upgraded your model tier, you've already done the cheapest reliability fix available — Cruxial catches the remainder.
|
|
70
|
+
|
|
71
|
+
### 4. `cruxial.lint` catches every live server-side schema rejection pre-flight.
|
|
72
|
+
|
|
73
|
+
This is a **separate catch surface from the live intercept rate above** — distinct measurement, distinct value proposition. Do not average it with the 5.85% live-intercept number; they measure different failure modes.
|
|
74
|
+
|
|
75
|
+
| Surface | Fires at | Catches | This finding |
|
|
76
|
+
|---|---|---|---|
|
|
77
|
+
| Live intercepts (finding #1) | runtime, per tool call | model emits bad args against a valid schema | 5.85% |
|
|
78
|
+
| **Pre-flight lint** (this finding) | build-time, at schema registration | the schema itself is broken — vendor APIs reject it before the model is even invoked | 100% |
|
|
79
|
+
|
|
80
|
+
During the live MCP runs, 9 of the 176 prompts failed at OpenAI's tool-registration step — schemas the API rejected before the model ever saw the prompt. `cruxial.lint_schemas_for_openai()` reproduces every one of those rejections pre-flight, with a fix hint and a citation:
|
|
81
|
+
|
|
82
|
+
| MCP server | Live API rejections | `cruxial.lint` catch |
|
|
83
|
+
|---|---|---|
|
|
84
|
+
| gitlab | 5 calls (all attempted tools) | **9/9 schemas** flagged `EMPTY_OR_META_ONLY_SCHEMA` — every gitlab tool ships as `{"$schema": "..."}` with no `type` or `properties` |
|
|
85
|
+
| hubspot | 4 calls (all on `hubspot-search-objects`) | `hubspot-search-objects` flagged **`ARRAY_WITHOUT_ITEMS`** (fatal) plus 22 advisory OpenAI-strict warnings |
|
|
86
|
+
|
|
87
|
+
100% pre-flight catch on the live failures. The lint module isn't speculative — it catches what production actually breaks on.
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Methodology
|
|
92
|
+
|
|
93
|
+
**What "intercepted" means.** The validator caught a schema violation BEFORE the executor ran. The executor never fires on an intercept; the host decides whether to attempt repair.
|
|
94
|
+
|
|
95
|
+
**What "passed" means.** The model emitted args that satisfied the JSON Schema. It does NOT mean the call was semantically correct — only that the args are well-formed. Tool-bypass failures (model claims to have called a tool but didn't) are out of scope for V0.1.
|
|
96
|
+
|
|
97
|
+
**What "silent pass" means.** A payload that should have been rejected but wasn't. This is the only true failure mode of a validation layer. We track and report it on every run.
|
|
98
|
+
|
|
99
|
+
**What "intercept rate" measures.** `intercepted / total_tool_calls` — per-call, not per-prompt. A single prompt may emit N tool calls.
|
|
100
|
+
|
|
101
|
+
**Latency.** p50 / p99 of `cruxial.execute()` itself (validation + telemetry write), measured against `time.perf_counter_ns()`. Excludes the LLM repair round-trip on intercepts.
|
|
102
|
+
|
|
103
|
+
**Confidence intervals.** 95% Wilson intervals for binomial proportions wherever a range (`a–b%`) is given; figures written as `±x%` are normal-approximation (Wald) 95% half-widths.
|
|
104
|
+
|
|
105
|
+
**Schema corpus.** All MCP schemas in the synthetic-robustness and live runs are mined from real public MCP servers (github, kubernetes, salesforce, atlassian, airtable, notion, slack, ms-teams, playwright, supabase, pinecone, …). The mining script `examples/mine_mcp_schemas.py` is in the repo. Schemas are frozen as a Python module (`cruxial.demo.mcp_schemas`) so audits are diffable across runs.
|
|
106
|
+
|
|
107
|
+
**Two distinct catch surfaces — don't conflate them.** Cruxial catches schema problems at two completely different points in the pipeline, and we report them separately on purpose:
|
|
108
|
+
|
|
109
|
+
| | When it fires | What it catches | This doc's number |
|
|
110
|
+
|---|---|---|---|
|
|
111
|
+
| **Live intercepts** | runtime, on every LLM tool call | the model emitted args that violate a valid schema | **5.85%** pooled (Section A) |
|
|
112
|
+
| **Pre-flight lint** | build-time, at `register_schemas()` | the *schema itself* is broken — vendor APIs would reject it before the model even sees it | **100%** of observed live API rejections (Section D) |
|
|
113
|
+
|
|
114
|
+
The two numbers are not comparable and should never be averaged or combined in headline claims. A reader who sees "Cruxial catches 100% of schema bugs" should understand that's the *lint* number — measuring developer-supplied schema quality, not LLM hallucination rate.
|
|
115
|
+
|
|
116
|
+
**The Deterministic Repair Rule.** Cruxial's auto-repair handles *structural mutations* of the model's output — wrong type, missing field, broken format, clipped enum, value outside numeric range. It does **not** inject *semantic domain knowledge* — generating a 1536-dimensional vector embedding, fabricating an API token, knowing what ticker `AAPL` resolved to today, picking the right Pinecone vector shape. Cases that require knowledge the LLM doesn't have show up as `unrepairable` in the benchmark (1 such case in run #2, same case in run #1). Cruxial is not magic; it makes the model's structural errors fixable, not the model's knowledge gaps.
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## A. Live LLM · 51 real public MCP servers
|
|
121
|
+
|
|
122
|
+
The launch benchmark. The largest published live LLM test of any tool-call validation layer.
|
|
123
|
+
|
|
124
|
+
**Configuration**
|
|
125
|
+
|
|
126
|
+
- Script: `examples/azure_mcp_suite.py`
|
|
127
|
+
- Corpus: 51 mined MCP servers · 25 distinct domains · 603 tools · 176 natural-language prompts (3–5 per server, mixed clean / ambiguous / adversarial)
|
|
128
|
+
- Model: Azure gpt-4o
|
|
129
|
+
- Repair: `cruxial.adapters.openai.auto_repair_batch` (multi-tool-call batch repair)
|
|
130
|
+
- Telemetry sink: user-level `~/.cruxial/telemetry.sqlite` in the home directory (cleaned between runs)
|
|
131
|
+
|
|
132
|
+
**Run #2 — 2026-05-30, post defensive-QA sprint (launch)**
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
prompts sent 176
|
|
136
|
+
tool calls made 171
|
|
137
|
+
model declined 7
|
|
138
|
+
api errors 9 (gitlab × 5, hubspot × 4 — server-side schema rejection,
|
|
139
|
+
caught pre-flight by cruxial.lint — see section D)
|
|
140
|
+
|
|
141
|
+
passed (clean) 163 ( 95.3% of calls)
|
|
142
|
+
intercepted 8 ( 4.7% of calls)
|
|
143
|
+
auto-repaired 7 ( 87.5% of intercepts)
|
|
144
|
+
unrepairable 1 (pinecone upsert-records — vector format requires
|
|
145
|
+
domain knowledge, not a schema-fixable case)
|
|
146
|
+
|
|
147
|
+
silent passes 0
|
|
148
|
+
wall clock 168s
|
|
149
|
+
p99 overhead <1ms
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
All 8 intercepts were `missing_required`. The single unrepairable case: model emits `[0.1, 0.2, 0.3]` for a Pinecone vector field whose schema requires a different shape — not schema-fixable without an embedding API call.
|
|
153
|
+
|
|
154
|
+
**Run #1 — 2026-05-30, pre defensive-QA sprint (baseline)**
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
tool calls made 171
|
|
158
|
+
intercepted 12 ( 7.0%) ±3.9% CI
|
|
159
|
+
auto-repaired 11 ( 91.7%)
|
|
160
|
+
silent passes 0
|
|
161
|
+
wall clock 182s
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
**Comparison.** The 12 → 8 intercept delta is gpt-4o non-determinism — run #1's CI covers 4.1–11.9%, run #2's 4.7% sits squarely inside. No SDK behavior change explains it. The load-bearing claim is the same in both runs: **zero silent passes.**
|
|
165
|
+
|
|
166
|
+
**Pooled across both runs (the citable aggregate)**
|
|
167
|
+
|
|
168
|
+
```
|
|
169
|
+
total calls 342 (171 + 171)
|
|
170
|
+
total prompts 352 (176 + 176)
|
|
171
|
+
|
|
172
|
+
intercepted 20 (12 + 8)
|
|
173
|
+
rate 5.85% 95% CI 3.8 – 8.9%
|
|
174
|
+
auto-repaired 18 (11 + 7)
|
|
175
|
+
rate 90.0% 95% CI 69.9 – 97.2%
|
|
176
|
+
unrepairable 2 (same pinecone vector-shape case both runs)
|
|
177
|
+
|
|
178
|
+
silent passes 0 /342 95% CI 0 – 1.1%
|
|
179
|
+
total wall clock 350s
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
The pooled silent-pass interval (0 – 1.1%) is the tightest statistical bound this benchmark can produce on the SDK's central claim without running the suite more times. Adding a third run would tighten it further; the marginal value diminishes quickly past N=2.
|
|
183
|
+
|
|
184
|
+
**On tightening the upper bound.** The 1.1% Wilson upper bound is a function of sample size, not of evidence we've missed something. If a cynical reader's first thought is "so 1 in 100 broken payloads might still sneak past," the math says otherwise — that's the loosest interpretation of the bound, not the central estimate (which is **0 silent passes observed**). For readers who want a tighter bound, the suite is parameterised: any consumer can run it themselves and pool with these results. Approximate scaling (assuming zero silent passes continue):
|
|
185
|
+
|
|
186
|
+
| Total calls | Wilson 95% upper bound on silent-pass rate |
|
|
187
|
+
|---|---|
|
|
188
|
+
| 342 (today, pooled across 2 runs) | **1.1%** |
|
|
189
|
+
| ~700 (4 runs) | ~0.55% |
|
|
190
|
+
| ~1,200 (7 runs) | ~0.31% |
|
|
191
|
+
| ~5,000 (~30 runs) | ~0.08% |
|
|
192
|
+
|
|
193
|
+
Continuous-regression tightening is a Cruxial Cloud roadmap item, not a V0.1 promise. The honest current claim is what's measured: zero silent passes in 342 calls, statistical upper bound 1.1%.
|
|
194
|
+
|
|
195
|
+
**Per-server intercepts (run #2)**
|
|
196
|
+
|
|
197
|
+
| Server | Calls | Intercepts | Repaired | Pattern |
|
|
198
|
+
|---|---|---|---|---|
|
|
199
|
+
| atlassian | 4 | 2 (50%) | 2 | newer / vendor-fresh |
|
|
200
|
+
| airtable | 5 | 2 (40%) | 2 | newer / vendor-fresh |
|
|
201
|
+
| pinecone | 3 | 1 (33%) | 0 | vector-shape mismatch |
|
|
202
|
+
| confluence | 4 | 1 (25%) | 1 | newer / vendor-fresh |
|
|
203
|
+
| salesforce | 4 | 1 (25%) | 1 | enterprise API |
|
|
204
|
+
| playwright | 4 | 1 (25%) | 1 | rapidly-evolving API |
|
|
205
|
+
| all 45 others | 147 | 0 | — | mature / well-documented APIs |
|
|
206
|
+
|
|
207
|
+
The pattern is consistent across both runs: mature APIs (github, kubernetes, slack, ms-teams, postgres, sqlite, redis, …) — gpt-4o nails them. Newer / vendor-fresh APIs — the catches live there. **The intercept rate is roughly a function of how recent your tool surface is relative to the model's training cutoff.**
|
|
208
|
+
|
|
209
|
+
**Reproduce**
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
pip install 'cruxial[mcp,openai]'
|
|
213
|
+
python examples/mine_mcp_schemas.py
|
|
214
|
+
rm ~/.cruxial/telemetry.sqlite
|
|
215
|
+
export AZURE_OPENAI_API_KEY=...
|
|
216
|
+
export AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com/
|
|
217
|
+
export AZURE_OPENAI_DEPLOYMENT=gpt-4o
|
|
218
|
+
python examples/azure_mcp_suite.py
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
## B. Synthetic robustness · 877 production schemas across 52 servers
|
|
224
|
+
|
|
225
|
+
The robustness benchmark. No LLM. No API costs. Pure classifier + testing-helper coverage across the entire mined corpus.
|
|
226
|
+
|
|
227
|
+
**What it tests.** For every schema in the corpus:
|
|
228
|
+
|
|
229
|
+
1. `cruxial.testing.valid_payload()` can produce a payload that satisfies the schema.
|
|
230
|
+
2. `cruxial.testing.violation_payloads()` can produce one payload per applicable failure category.
|
|
231
|
+
3. Every generated violation gets rejected by `cruxial.check()`.
|
|
232
|
+
|
|
233
|
+
**Result**
|
|
234
|
+
|
|
235
|
+
```
|
|
236
|
+
schemas covered 877
|
|
237
|
+
servers covered 52
|
|
238
|
+
domains covered 26
|
|
239
|
+
|
|
240
|
+
valid_payload success rate 99.2% (870/877)
|
|
241
|
+
rejection rate 100.0% (no silent passes)
|
|
242
|
+
exact-category accuracy 98.3% (1913/1947 violations)
|
|
243
|
+
alternate-category rejects 1.7% (34/1947 — stricter sibling rule preempted;
|
|
244
|
+
still rejected, just under a different but
|
|
245
|
+
legitimate category)
|
|
246
|
+
silently passed 0 ← the only number that matters
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
**Why the 7 valid_payload failures aren't a Cruxial bug.** They're all in schemas with vendor-specific `oneOf` branches where the generator picks a branch the schema's own examples don't satisfy (e.g. pinecone `create-index` requires either `dimension` OR `metric` but not both, depending on branch). The generator falls back to the simplest branch; the harder branches are tracked as known limitations.
|
|
250
|
+
|
|
251
|
+
**Reproduce**
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
python examples/mine_mcp_schemas.py
|
|
255
|
+
python examples/audit_mcp_schemas.py
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
## C. Model comparison · gpt-4o vs gpt-5-mini-2 on constraint-heavy schemas
|
|
261
|
+
|
|
262
|
+
The schemas your real production tools probably have: enums, regex patterns, format keywords, numeric ranges, nested objects.
|
|
263
|
+
|
|
264
|
+
**Configuration**
|
|
265
|
+
|
|
266
|
+
- Script: `examples/azure_demo_suite.py`
|
|
267
|
+
- Schemas: `cruxial.demo.DEMO_TOOL_SCHEMAS` — 15 hand-crafted production-class tools
|
|
268
|
+
- Constraint surface: 63 required fields · 26 enums · 18 regex patterns · 16 format keywords
|
|
269
|
+
- Prompts: `cruxial.demo.DEMO_PROMPTS` — 70 prompts, 4–5 per tool, mixed clean / ambiguous / adversarial
|
|
270
|
+
- Repair: `cruxial.adapters.openai.auto_repair_batch`
|
|
271
|
+
|
|
272
|
+
**Azure gpt-4o**
|
|
273
|
+
|
|
274
|
+
```
|
|
275
|
+
tool calls made 70
|
|
276
|
+
intercepted 12 ( 17.1% of calls) ±8.8% CI
|
|
277
|
+
auto-repaired 8 ( 66.7%)
|
|
278
|
+
unrepairable 4 (one prompt fanned to 4 parallel tool_calls,
|
|
279
|
+
each with two invalid fields — fixed in V0.1
|
|
280
|
+
by the multi-error surfacing in Failure.siblings)
|
|
281
|
+
|
|
282
|
+
failure categories caught
|
|
283
|
+
format_violation 8
|
|
284
|
+
constraint_violation 3
|
|
285
|
+
enum_violation 1
|
|
286
|
+
|
|
287
|
+
silent passes 0
|
|
288
|
+
wall clock 104s
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
**Azure gpt-5-mini-2 — same prompts, same schemas**
|
|
292
|
+
|
|
293
|
+
```
|
|
294
|
+
tool calls made 74
|
|
295
|
+
intercepted 1 ( 1.4%) ±2.7% CI
|
|
296
|
+
auto-repaired 1 (100%)
|
|
297
|
+
silent passes 0
|
|
298
|
+
wall clock 436s
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
**The strategic finding**
|
|
302
|
+
|
|
303
|
+
> *gpt-5-mini-2 emits 92% fewer schema-violating tool calls than gpt-4o on the same constraint-heavy prompts. The mini tier of the new generation beats the flagship of the old.*
|
|
304
|
+
|
|
305
|
+
**Highest-value catches on gpt-4o (all silent-200-OK class — the tool would have succeeded with garbage data)**
|
|
306
|
+
|
|
307
|
+
| Tool | Bad arg | What would have happened |
|
|
308
|
+
|---|---|---|
|
|
309
|
+
| `demo_create_incident` | `mttr_target_minutes: 20160` (max 240) — model translated "fix needed in two weeks" literally | incident created with garbage SLA |
|
|
310
|
+
| `demo_run_sql_query` | `timeout_seconds: 3600` (max 600) — from "1 hour to run" | query killed mid-execution |
|
|
311
|
+
| `demo_deploy_application` | `version: "1.0"` (semver requires `1.0.0`) | deploy fails downstream with less-clear error |
|
|
312
|
+
|
|
313
|
+
**Reproduce**
|
|
314
|
+
|
|
315
|
+
```bash
|
|
316
|
+
rm ~/.cruxial/telemetry.sqlite
|
|
317
|
+
export AZURE_OPENAI_DEPLOYMENT=gpt-4o # or your gpt-5-mini deployment
|
|
318
|
+
python examples/azure_demo_suite.py
|
|
319
|
+
cruxial stats --since 30m
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
---
|
|
323
|
+
|
|
324
|
+
## D. Pre-flight linter · 100% catch on live API rejections
|
|
325
|
+
|
|
326
|
+
`cruxial.lint` validates tool schemas at registration time against OpenAI and Anthropic vendor requirements — before the LLM ever sees them.
|
|
327
|
+
|
|
328
|
+
**Validation against live failures.** The live MCP run (section A) surfaced 9 calls that failed at OpenAI's tool-registration endpoint with HTTP 400. We verified empirically that `lint_schemas_for_openai()` flags every one of them pre-flight:
|
|
329
|
+
|
|
330
|
+
| MCP server | Live API rejections | `cruxial.lint` reproduces? |
|
|
331
|
+
|---|---|---|
|
|
332
|
+
| gitlab | 5 calls — all gitlab tools | ✅ **9/9 schemas** flagged `EMPTY_OR_META_ONLY_SCHEMA` — every gitlab schema is an empty stub `{"$schema": "..."}` with no `type` or `properties` |
|
|
333
|
+
| hubspot | 4 calls on `hubspot-search-objects` | ✅ **`hubspot-search-objects`** flagged `ARRAY_WITHOUT_ITEMS` (fatal) plus 22 advisory `OPENAI_STRICT_*` warnings across other hubspot tools |
|
|
334
|
+
|
|
335
|
+
**100% pre-flight catch rate on the live failures.** Every consumer of the gitlab MCP server with OpenAI would hit the same wall — `cruxial.lint` tells them at `register_schemas()` time, with the exact tool name and a fix hint.
|
|
336
|
+
|
|
337
|
+
**Coverage**
|
|
338
|
+
|
|
339
|
+
| Rule | Catches |
|
|
340
|
+
|---|---|
|
|
341
|
+
| `EMPTY_OR_META_ONLY_SCHEMA` | schemas with no structural content (no `type`, `properties`, `items`, `enum`, `$ref`, or composition keyword) |
|
|
342
|
+
| `ARRAY_WITHOUT_ITEMS` | `type: array` without `items` — both providers reject |
|
|
343
|
+
| `OBJECT_WITHOUT_PROPERTIES` | `type: object` without `properties` — LLM has no shape to fill |
|
|
344
|
+
| `INVALID_TYPE_VALUE` | `type` field is not a string or array of strings |
|
|
345
|
+
| `OPENAI_STRICT_MISSING_ADDITIONAL_PROPS_FALSE` | OpenAI strict mode requires `additionalProperties: false` at every object level |
|
|
346
|
+
| `OPENAI_STRICT_PROPS_NOT_IN_REQUIRED` | OpenAI strict mode requires every declared property to be in `required` |
|
|
347
|
+
| `ANTHROPIC_TOP_LEVEL_COMPOSITION` | Anthropic rejects `allOf`/`anyOf`/`oneOf` at the top level |
|
|
348
|
+
| `ANTHROPIC_ILLEGAL_PROPERTY_KEY` | Anthropic restricts property-name charset |
|
|
349
|
+
| `MANY_PROPERTIES_TRUNCATION_RISK` | warning for schemas with > 100 properties (provider truncation risk) |
|
|
350
|
+
|
|
351
|
+
Every rule cites the real GitHub issue, CVE, or live benchmark that motivated it.
|
|
352
|
+
|
|
353
|
+
**Reproduce**
|
|
354
|
+
|
|
355
|
+
```python
|
|
356
|
+
from cruxial.demo.mcp_schemas import MCP_SCHEMAS
|
|
357
|
+
from cruxial.lint import lint_schemas_for_openai
|
|
358
|
+
|
|
359
|
+
for sid in ('gitlab', 'hubspot'):
|
|
360
|
+
issues = lint_schemas_for_openai(MCP_SCHEMAS[sid]['schemas'])
|
|
361
|
+
errors = [i for i in issues if i.severity == 'error']
|
|
362
|
+
print(f'{sid}: {len(errors)} errors across {len(MCP_SCHEMAS[sid]["schemas"])} schemas')
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
---
|
|
366
|
+
|
|
367
|
+
## E. Control · simple-schema MCP servers
|
|
368
|
+
|
|
369
|
+
The "why does this product exist" baseline.
|
|
370
|
+
|
|
371
|
+
**Configuration**
|
|
372
|
+
|
|
373
|
+
- Script: `examples/azure_mcp_suite.py` with `CRUXIAL_MCP_SERVERS=filesystem,memory,time,fetch,sqlite,brave-search,everything`
|
|
374
|
+
- 7 servers · 47 tools · 25 prompts
|
|
375
|
+
- Model: Azure gpt-4o
|
|
376
|
+
|
|
377
|
+
**Result**
|
|
378
|
+
|
|
379
|
+
```
|
|
380
|
+
tool calls made 25
|
|
381
|
+
intercepted 0 (0.0%)
|
|
382
|
+
silent passes 0
|
|
383
|
+
wall clock 18s
|
|
384
|
+
```
|
|
385
|
+
|
|
386
|
+
Simple MCP-style schemas (`{path: string}`, `{query: string}`) are trivially satisfiable. **gpt-4o nails them at 0% intercept.**
|
|
387
|
+
|
|
388
|
+
Put together with section A's 4.7–7.0% on the full real MCP corpus and section C's 17.1% on constraint-heavy schemas, the story crystallises:
|
|
389
|
+
|
|
390
|
+
> *Cruxial's intercept rate is approximately a function of your tool schemas' constraint surface. Simple shapes: ~0%. Real production MCP servers: 5–7%. Constraint-rich production schemas: 15–20%. As your tool surface gets harder, the value gets larger.*
|
|
391
|
+
|
|
392
|
+
---
|
|
393
|
+
|
|
394
|
+
## Reproduce everything
|
|
395
|
+
|
|
396
|
+
Every benchmark above runs from a clone of the repo with `pip install -e '.[mcp,openai]'` and Python ≥ 3.10.
|
|
397
|
+
|
|
398
|
+
```bash
|
|
399
|
+
# clone
|
|
400
|
+
git clone https://github.com/cruxial-ai/cruxial.git && cd cruxial
|
|
401
|
+
|
|
402
|
+
# install
|
|
403
|
+
pip install -e '.[mcp,openai]'
|
|
404
|
+
|
|
405
|
+
# Azure env (or set OPENAI_API_KEY for direct OpenAI)
|
|
406
|
+
export AZURE_OPENAI_API_KEY=...
|
|
407
|
+
export AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com/
|
|
408
|
+
export AZURE_OPENAI_DEPLOYMENT=gpt-4o
|
|
409
|
+
|
|
410
|
+
# mine the MCP schema corpus (one-time, ~2 min)
|
|
411
|
+
python examples/mine_mcp_schemas.py
|
|
412
|
+
|
|
413
|
+
# A. Live MCP suite — section A
|
|
414
|
+
python examples/azure_mcp_suite.py
|
|
415
|
+
|
|
416
|
+
# B. Synthetic robustness — section B (no LLM, no cost)
|
|
417
|
+
python examples/audit_mcp_schemas.py
|
|
418
|
+
|
|
419
|
+
# C. Model comparison — section C
|
|
420
|
+
python examples/azure_demo_suite.py
|
|
421
|
+
|
|
422
|
+
# D. Lint validation — section D
|
|
423
|
+
python -c "
|
|
424
|
+
from cruxial.demo.mcp_schemas import MCP_SCHEMAS
|
|
425
|
+
from cruxial.lint import lint_schemas_for_openai
|
|
426
|
+
for sid in ('gitlab', 'hubspot'):
|
|
427
|
+
issues = lint_schemas_for_openai(MCP_SCHEMAS[sid]['schemas'])
|
|
428
|
+
print(sid, len([i for i in issues if i.severity == 'error']), 'errors')
|
|
429
|
+
"
|
|
430
|
+
|
|
431
|
+
# E. Simple-schema control — section E
|
|
432
|
+
CRUXIAL_MCP_SERVERS=filesystem,memory,time,fetch,sqlite,brave-search,everything \
|
|
433
|
+
python examples/azure_mcp_suite.py
|
|
434
|
+
```
|
|
435
|
+
|
|
436
|
+
If any number you reproduce diverges materially from this document, file an issue — that's a benchmark regression we need to know about.
|