grounded-reasoning 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- grounded_reasoning-0.1.0/LICENSE +21 -0
- grounded_reasoning-0.1.0/PKG-INFO +255 -0
- grounded_reasoning-0.1.0/README.md +235 -0
- grounded_reasoning-0.1.0/grounded_reasoning/__init__.py +38 -0
- grounded_reasoning-0.1.0/grounded_reasoning.egg-info/PKG-INFO +255 -0
- grounded_reasoning-0.1.0/grounded_reasoning.egg-info/SOURCES.txt +53 -0
- grounded_reasoning-0.1.0/grounded_reasoning.egg-info/dependency_links.txt +1 -0
- grounded_reasoning-0.1.0/grounded_reasoning.egg-info/entry_points.txt +2 -0
- grounded_reasoning-0.1.0/grounded_reasoning.egg-info/requires.txt +9 -0
- grounded_reasoning-0.1.0/grounded_reasoning.egg-info/top_level.txt +2 -0
- grounded_reasoning-0.1.0/pyproject.toml +42 -0
- grounded_reasoning-0.1.0/setup.cfg +4 -0
- grounded_reasoning-0.1.0/src/__init__.py +3 -0
- grounded_reasoning-0.1.0/src/agent/__init__.py +18 -0
- grounded_reasoning-0.1.0/src/agent/mcp_server.py +37 -0
- grounded_reasoning-0.1.0/src/agent/tool.py +93 -0
- grounded_reasoning-0.1.0/src/agent/verifier.py +185 -0
- grounded_reasoning-0.1.0/src/experiments/__init__.py +0 -0
- grounded_reasoning-0.1.0/src/experiments/agent_demo.py +106 -0
- grounded_reasoning-0.1.0/src/experiments/clutrr_eval.py +290 -0
- grounded_reasoning-0.1.0/src/experiments/conformal_llm_eval.py +188 -0
- grounded_reasoning-0.1.0/src/experiments/guard_cost_eval.py +98 -0
- grounded_reasoning-0.1.0/src/experiments/guard_llm_eval.py +151 -0
- grounded_reasoning-0.1.0/src/experiments/inference_eval.py +114 -0
- grounded_reasoning-0.1.0/src/experiments/nl_ontology_eval.py +239 -0
- grounded_reasoning-0.1.0/src/experiments/self_grounded_eval.py +163 -0
- grounded_reasoning-0.1.0/src/reasoning/__init__.py +24 -0
- grounded_reasoning-0.1.0/src/reasoning/abstract_inference.py +176 -0
- grounded_reasoning-0.1.0/src/reasoning/composition_algebra.py +70 -0
- grounded_reasoning-0.1.0/src/reasoning/conformal_reasoning.py +73 -0
- grounded_reasoning-0.1.0/src/reasoning/horn.py +59 -0
- grounded_reasoning-0.1.0/src/reasoning/llm_client.py +122 -0
- grounded_reasoning-0.1.0/src/reasoning/operator_algebra.py +131 -0
- grounded_reasoning-0.1.0/src/reasoning/relation_spectrum.py +118 -0
- grounded_reasoning-0.1.0/src/theory/__init__.py +0 -0
- grounded_reasoning-0.1.0/src/theory/theorems.py +681 -0
- grounded_reasoning-0.1.0/tests/test_abstract_inference.py +96 -0
- grounded_reasoning-0.1.0/tests/test_agent.py +97 -0
- grounded_reasoning-0.1.0/tests/test_clutrr_solver.py +100 -0
- grounded_reasoning-0.1.0/tests/test_composition_conformal_regressions.py +74 -0
- grounded_reasoning-0.1.0/tests/test_conformal_llm.py +39 -0
- grounded_reasoning-0.1.0/tests/test_examples.py +34 -0
- grounded_reasoning-0.1.0/tests/test_fuzz_regressions.py +165 -0
- grounded_reasoning-0.1.0/tests/test_guard_llm.py +93 -0
- grounded_reasoning-0.1.0/tests/test_horn_composition.py +54 -0
- grounded_reasoning-0.1.0/tests/test_llm_client.py +55 -0
- grounded_reasoning-0.1.0/tests/test_llm_client_regressions.py +91 -0
- grounded_reasoning-0.1.0/tests/test_operator_algebra.py +40 -0
- grounded_reasoning-0.1.0/tests/test_public_api.py +20 -0
- grounded_reasoning-0.1.0/tests/test_relation_spectrum.py +51 -0
- grounded_reasoning-0.1.0/tests/test_robustness.py +87 -0
- grounded_reasoning-0.1.0/tests/test_self_grounded.py +55 -0
- grounded_reasoning-0.1.0/tests/test_spectrum_regressions.py +98 -0
- grounded_reasoning-0.1.0/tests/test_theorems.py +69 -0
- grounded_reasoning-0.1.0/tests/test_world_builder_consistency.py +73 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 grounded-reasoning contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: grounded-reasoning
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Zero-token, precision-guaranteed verifier for LLM/agent multi-hop relational reasoning
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/ALEXaquarius/grounded-reasoning
|
|
7
|
+
Project-URL: Repository, https://github.com/ALEXaquarius/grounded-reasoning
|
|
8
|
+
Project-URL: Paper, https://github.com/ALEXaquarius/grounded-reasoning/blob/main/PAPER.md
|
|
9
|
+
Requires-Python: >=3.11
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: numpy>=1.24
|
|
13
|
+
Provides-Extra: mcp
|
|
14
|
+
Requires-Dist: mcp>=1.0; extra == "mcp"
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
17
|
+
Requires-Dist: pytest-cov>=6; extra == "dev"
|
|
18
|
+
Requires-Dist: ruff>=0.6; extra == "dev"
|
|
19
|
+
Dynamic: license-file
|
|
20
|
+
|
|
21
|
+
# grounded-reasoning — Grounded, Guaranteed Reasoning for LLMs & Agents
|
|
22
|
+
|
|
23
|
+
[CI status](https://github.com/ALEXaquarius/grounded-reasoning/actions/workflows/ci.yml)
|
|
24
|
+
[License: MIT](LICENSE)
|
|
25
|
+
[Python ≥ 3.11](pyproject.toml)
|
|
26
|
+
[Open in Colab](https://colab.research.google.com/github/ALEXaquarius/grounded-reasoning/blob/main/examples/quickstart.ipynb)
|
|
27
|
+
|
|
28
|
+
> **TL;DR.** LLMs hallucinate on multi-hop relational reasoning. This is a
|
|
29
|
+
> **relation-algebra verifier** an agent calls to check a claim *before* asserting it:
|
|
30
|
+
> **zero model tokens**, **precision-guaranteed** (accepts a claim iff a grounded proof
|
|
31
|
+
> path exists), language-agnostic, and provider-agnostic. Plugs in as a **library**, a
|
|
32
|
+
> **function-calling tool**, or an **MCP server**. Validated on **real LLMs** (DeepSeek
|
|
33
|
+
> et al.) and the public **CLUTRR** benchmark. See [docs/integration.md](docs/integration.md).
|
|
34
|
+
|
|
35
|
+
📄 Full paper: **[PAPER.md](PAPER.md)** · Integration guide: **[docs/integration.md](docs/integration.md)** · Try it in 30 seconds: **[quickstart notebook](https://colab.research.google.com/github/ALEXaquarius/grounded-reasoning/blob/main/examples/quickstart.ipynb)**
|
|
36
|
+
|
|
37
|
+
Đọc bằng tiếng Việt: **[README.vi.md](README.vi.md)**
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Why this exists
|
|
42
|
+
|
|
43
|
+
LLMs are solid on one-hop facts but **collapse on composition** — chaining several
|
|
44
|
+
correct facts into a multi-step conclusion. On CLUTRR (kinship reasoning), DeepSeek's
|
|
45
|
+
accuracy **falls off with depth**, while a grounded operator-composition solver holds
|
|
46
|
+
**~100% flat — at zero tokens**:
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
acc
|
|
50
|
+
100% ●─────●─────●─────●─────●─────●─────● ● Grounded solver (algebra, 0 tokens)
|
|
51
|
+
90% |
|
|
52
|
+
80% ○
|
|
53
|
+
70% | ╲
|
|
54
|
+
60% | ╲
|
|
55
|
+
50% | ╲
|
|
56
|
+
40% | ○ ○ ○ DeepSeek (LLM)
|
|
57
|
+
30% | ╲ ╱ ╲
|
|
58
|
+
20% | ○─────○ ╲
|
|
59
|
+
10% | ○─────○
|
|
60
|
+
0% +──┴─────┴─────┴─────┴─────┴─────┴─────┴─
|
|
61
|
+
hop 2 3 4 5 6 7 8 (composition steps)
|
|
62
|
+
|
|
63
|
+
hop: 2 3 4 5 6 7 8
|
|
64
|
+
DeepSeek: 83% 42% 25% 25% 42% 17% 8%
|
|
65
|
+
Solver: 100% 100% 100% 100% 100% 100% 100%
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
*(CLUTRR/v1 gen_train234_test2to10, clean-chain, n=12/hop; full test set n=635: solver
|
|
69
|
+
covers 99.5%, accuracy 99.2%. `src/experiments/clutrr_eval.py`.)*
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## What it is / is NOT (honestly)
|
|
74
|
+
|
|
75
|
+
**Is:** a guaranteed reasoning-verification layer built on relation operator algebra.
|
|
76
|
+
- **Precision = 1.0, guaranteed** (Theorem G) — accepts a claim only if a grounded proof path exists.
|
|
77
|
+
- **Zero extra tokens** — local matrix multiplication, no LLM call. Compare to
|
|
78
|
+
"have the LLM self-verify," which costs +110% tokens for 34% precision.
|
|
79
|
+
- **Two-sided guarantee** (Theorem I) — precision *and* recall both have tight bounds.
|
|
80
|
+
- **No external KB required** (SGDC) — uses the LLM's own internal consistency.
|
|
81
|
+
|
|
82
|
+
**Is not:** an "unprecedented breakthrough." The Katz index, the Neumann series,
|
|
83
|
+
graph reachability, and neuro-symbolic grounding are all classical math and
|
|
84
|
+
technique. The contribution here is unification, a measured guarantee, and
|
|
85
|
+
benchmark numbers — not a new primitive. The guard needs a relation graph
|
|
86
|
+
(supplied, or extracted from LLM facts); flexibility is bounded (see
|
|
87
|
+
[PAPER §5](PAPER.md)).
|
|
88
|
+
|
|
89
|
+
### How this differs from the usual fixes
|
|
90
|
+
|
|
91
|
+
| Approach | Extra tokens | Guarantee | Needs an external KB |
|
|
92
|
+
|---|---|---|---|
|
|
93
|
+
| LLM self-verification (2nd call) | +110% | none (measured 34% precision) | no |
|
|
94
|
+
| Self-consistency / majority vote | multiplies with sample count | none, statistical only | no |
|
|
95
|
+
| RAG / external KG grounding | varies | only as good as retrieval | yes |
|
|
96
|
+
| **This guard** | **+0** | **precision = 1.0** (Theorem G) | no |
|
|
97
|
+
| **This guard, self-grounded (SGDC)** | **+0** | precision = 1.0 given sound atomic facts (Theorem I) | no |
|
|
98
|
+
| **This guard, conformal** | **+0** | coverage ≥ 1−α, distribution-free (Theorem K) | no |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Three theorems, one operator (F = G = H)
|
|
103
|
+
|
|
104
|
+
The reasoning core rests on a single unification (numerically verified, zero error):
|
|
105
|
+
|
|
106
|
+
| View | Theorem | Content |
|
|
107
|
+
|------|---------|---------|
|
|
108
|
+
| Fuzzy diffusion inference | **F** | conf(a→b) = Σ_{k≥1} αᵏ(Pᵏ)[a,b], calibrated + grounded |
|
|
109
|
+
| Relation operator algebra | **G** | composition = operator product, transitive closure = Σ powers |
|
|
110
|
+
| Spectral analysis (Katz) | **H** | `engine.infer` = resolvent (I−αP)⁻¹−I (matches **0.0** error) |
|
|
111
|
+
|
|
112
|
+
⟹ fuzzy inference **is** spectral analysis of the relation operator. `src/reasoning/`.
|
|
113
|
+
|
|
114
|
+
Four further theorems extend this core: **I** (two-sided precision/recall guarantee
|
|
115
|
+
for a self-grounded, no-external-KB variant), **J** (closure-learning completeness,
|
|
116
|
+
validated on CLUTRR), **K** (conformal reasoning — distribution-free coverage under a
|
|
117
|
+
*noisy* relation graph, including one extracted by an LLM from raw text), and **L**
|
|
118
|
+
(Horn forward-chaining, generalizing transitive closure to conjunctive rules). All
|
|
119
|
+
seven are stated, proved, and numerically verified in [PAPER.md](PAPER.md).
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Evidence on real LLMs (DeepSeek)
|
|
124
|
+
|
|
125
|
+
| Experiment | Result |
|
|
126
|
+
|------------|--------|
|
|
127
|
+
| Hallucination guard (kinship) | precision **33% → 100%**, catches 94/94, 0 false rejects |
|
|
128
|
+
| Guard token cost | **+0 tokens** (vs. LLM self-verify: +110% tokens, 34% precision) |
|
|
129
|
+
| SGDC (self-grounded, no external KB) | precision **78% → 100%** from internal consistency alone |
|
|
130
|
+
| CLUTRR (public benchmark) | solver **~100% at every hop** vs. DeepSeek 83%→8% |
|
|
131
|
+
| Hard passage (9-step chain) | DeepSeek **fabricates 2/10** (wrong direction); grounded system **10/10**, with proofs — [`examples/hallucination_demo.py`](examples/hallucination_demo.py) |
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## Guaranteed reasoning over a graph an LLM extracted from raw text
|
|
136
|
+
|
|
137
|
+
The guard/solver needs a **clean** graph. But if you let an **LLM extract** relations
|
|
138
|
+
from natural-language text, the graph is **noisy** (missing/spurious edges).
|
|
139
|
+
**Conformal Reasoning** (Theorem K) fixes exactly that: use operator confidence as a
|
|
140
|
+
score, calibrate a threshold ⟹ **distribution-free coverage ≥ 1−α**, even on a noisy
|
|
141
|
+
graph.
|
|
142
|
+
|
|
143
|
+
End-to-end demo: **DeepSeek extracts an "is a" graph from text** → conformal runs on
|
|
144
|
+
that extracted graph (ground truth is used only for scoring):
|
|
145
|
+
|
|
146
|
+
| Text | LLM extraction (P / R) | Coverage (target ≥90%) | Efficiency (FPR) |
|
|
147
|
+
|------|------------------------:|----------------------------:|------------------:|
|
|
148
|
+
| Easy | 100% / 99.7% | **91.3%** | 0.0 |
|
|
149
|
+
| Hard (nested clauses + near-miss distractors) | 99.5% / **68.5%** | **93.0%** | 0.77 |
|
|
150
|
+
|
|
151
|
+
> The LLM's extraction **drops ~31.5% of the edges** (recall 68.5% — a genuinely noisy graph) →
|
|
152
|
+
> **the coverage guarantee still holds** (93% ≥ 90%), only efficiency degrades.
|
|
153
|
+
> *Validity always holds; efficiency scales with graph quality.*
|
|
154
|
+
|
|
155
|
+
⟹ A path to guaranteed reasoning over **natural-language relations** — where the hard
|
|
156
|
+
guard can't reach. `src/experiments/conformal_llm_eval.py`.
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Quickstart
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
git clone https://github.com/ALEXaquarius/grounded-reasoning
|
|
164
|
+
cd grounded-reasoning && pip install -e ".[dev]" # not yet on PyPI — install from source
|
|
165
|
+
pytest tests/ # every theorem + offline-locked logic, no network needed
|
|
166
|
+
|
|
167
|
+
# Use it right now (no LLM/network needed):
|
|
168
|
+
python -c "from grounded_reasoning import GroundedReasoner as G; r=G(); r.add_facts([('a','p','b'),('b','p','c')]); print(r.verify('a','c',via='p'))"
|
|
169
|
+
|
|
170
|
+
# Real-LLM experiments (need a key — read from an env var, NEVER hardcoded):
|
|
171
|
+
export DEEPSEEK_API_KEY=sk-... # bring your own; .env is gitignored
|
|
172
|
+
python -m src.experiments.guard_llm_eval # hallucination guard
|
|
173
|
+
python -m src.experiments.self_grounded_eval # SGDC
|
|
174
|
+
python -m src.experiments.clutrr_eval # public CLUTRR benchmark
|
|
175
|
+
python -m src.experiments.conformal_llm_eval # end-to-end conformal (LLM-extracted graph)
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Integrating with an Agent / LLM (`src/agent/`)
|
|
181
|
+
|
|
182
|
+
A **relation-reasoning verifier** for agents: check a multi-hop claim **before
|
|
183
|
+
asserting it** — zero model tokens, precision guaranteed (accepts iff a grounded proof
|
|
184
|
+
path exists).
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
from grounded_reasoning import GroundedReasoner
|
|
188
|
+
gr = GroundedReasoner()
|
|
189
|
+
gr.add_facts([("alice","parent","bob"),("bob","parent","carol")])
|
|
190
|
+
gr.verify("alice","carol", via="parent") # Verdict(grounded=True, proof=['alice','bob','carol'])
|
|
191
|
+
gr.verify("alice","zed", via="parent") # Verdict(grounded=False, proof=None) ← hallucination blocked
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
Three integration paths (details: [docs/integration.md](docs/integration.md)):
|
|
195
|
+
- **Library**: `GroundedReasoner.verify / filter_claims / contradictions`.
|
|
196
|
+
- **Function-calling**: `TOOL_SPEC` (Anthropic) / `openai_tool_spec()` (OpenAI) + `run_tool` — a stateless `verify_relation` tool.
|
|
197
|
+
- **MCP server**: `python -m src.agent.mcp_server` — plugs into Claude or any MCP-compatible agent.
|
|
198
|
+
|
|
199
|
+
**Multi-provider** (not just DeepSeek): `LLMClient(provider=...)` for DeepSeek / OpenAI /
|
|
200
|
+
Groq / OpenRouter / Together / Mistral / Ollama (local) — all OpenAI-compatible, switch
|
|
201
|
+
providers without changing code. **Multilingual**: entities/relations are opaque
|
|
202
|
+
Unicode strings ⟹ works with any language (`cha`, `父`, `والد`…) with zero configuration.
|
|
203
|
+
|
|
204
|
+
A real function-calling demo (agent verifies itself, blocks hallucination):
|
|
205
|
+
`python -m src.experiments.agent_demo`. When the graph is **noisy** (relations
|
|
206
|
+
extracted by an LLM from text), use `ConformalReasoner` for a **coverage ≥1−α**
|
|
207
|
+
guarantee instead of hard precision.
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
## Source map
|
|
212
|
+
|
|
213
|
+
| Path | Content |
|
|
214
|
+
|------|---------|
|
|
215
|
+
| `grounded_reasoning/` | Public package — `GroundedReasoner`, `verify_relation`, `TOOL_SPEC`, `ConformalReasoner`, `LLMClient` |
|
|
216
|
+
| `src/agent/{verifier,tool,mcp_server}.py` | Public API implementation — HallucinationGuard, function-calling tool, MCP server |
|
|
217
|
+
| `src/reasoning/abstract_inference.py` | FuzzyInferenceEngine, TypedInferenceEngine, HallucinationGuard (Theorem F) |
|
|
218
|
+
| `src/reasoning/operator_algebra.py` | Relation operator algebra (Theorem G) |
|
|
219
|
+
| `src/reasoning/relation_spectrum.py` | Spectrum, nilpotency, Katz resolvent (Theorem H) |
|
|
220
|
+
| `src/reasoning/conformal_reasoning.py` | Conformal — coverage guarantee under noise (Theorem K) |
|
|
221
|
+
| `src/reasoning/composition_algebra.py` | Composition-table learning, validated on CLUTRR (Theorem J) |
|
|
222
|
+
| `src/reasoning/horn.py` | Horn forward-chaining, least-model semantics (Theorem L) |
|
|
223
|
+
| `src/reasoning/llm_client.py` | Provider-agnostic LLM client (key read from an env var) |
|
|
224
|
+
| `src/theory/theorems.py` | **Seven theorems (F–L)** with numerical verification |
|
|
225
|
+
| `src/experiments/{guard_llm,self_grounded,nl_ontology,guard_cost,clutrr,conformal_llm,inference}_eval.py` | Real-LLM and benchmark experiments backing every claim above |
|
|
226
|
+
| `examples/hallucination_demo.py` | End-to-end function-calling demo |
|
|
227
|
+
| `examples/quickstart.ipynb` | Runnable tour of the library (offline, Colab-ready) |
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
## Origin story
|
|
232
|
+
|
|
233
|
+
This project began as an attempt to invent an embedding-free retrieval algorithm that
|
|
234
|
+
could compete with dense/RAG retrieval. That research question reached a rigorous,
|
|
235
|
+
fully honest **negative** conclusion (ties BM25, loses significantly to dense
|
|
236
|
+
embeddings — with a proof of why). The same mathematical toolkit — operator algebra,
|
|
237
|
+
spectral analysis — turned out to have real, measurable value on a different problem:
|
|
238
|
+
**guaranteeing** multi-hop relational reasoning. This repository ships only that
|
|
239
|
+
validated, tested reasoning system; the full retrieval research trail (including every
|
|
240
|
+
failed attempt, honestly recorded) lives in a separate research repository and is not
|
|
241
|
+
part of this package. See [PAPER.md §1](PAPER.md) for the full framing.
|
|
242
|
+
|
|
243
|
+
---
|
|
244
|
+
|
|
245
|
+
## Contributing & Community
|
|
246
|
+
|
|
247
|
+
- How to contribute + research principles: [CONTRIBUTING.md](CONTRIBUTING.md)
|
|
248
|
+
- Code of conduct: [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) · Security: [SECURITY.md](SECURITY.md)
|
|
249
|
+
- Version history: [CHANGELOG.md](CHANGELOG.md) · Citation: [CITATION.cff](CITATION.cff)
|
|
250
|
+
- License: **MIT** ([LICENSE](LICENSE))
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
*Principle: proof before code, formal definitions, falsifiability, and honest
|
|
255
|
+
reporting of negative results — see [CONTRIBUTING.md](CONTRIBUTING.md).*
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
# grounded-reasoning — Grounded, Guaranteed Reasoning for LLMs & Agents
|
|
2
|
+
|
|
3
|
+
[CI status](https://github.com/ALEXaquarius/grounded-reasoning/actions/workflows/ci.yml)
|
|
4
|
+
[License: MIT](LICENSE)
|
|
5
|
+
[Python ≥ 3.11](pyproject.toml)
|
|
6
|
+
[Open in Colab](https://colab.research.google.com/github/ALEXaquarius/grounded-reasoning/blob/main/examples/quickstart.ipynb)
|
|
7
|
+
|
|
8
|
+
> **TL;DR.** LLMs hallucinate on multi-hop relational reasoning. This is a
|
|
9
|
+
> **relation-algebra verifier** an agent calls to check a claim *before* asserting it:
|
|
10
|
+
> **zero model tokens**, **precision-guaranteed** (accepts a claim iff a grounded proof
|
|
11
|
+
> path exists), language-agnostic, and provider-agnostic. Plugs in as a **library**, a
|
|
12
|
+
> **function-calling tool**, or an **MCP server**. Validated on **real LLMs** (DeepSeek
|
|
13
|
+
> et al.) and the public **CLUTRR** benchmark. See [docs/integration.md](docs/integration.md).
|
|
14
|
+
|
|
15
|
+
📄 Full paper: **[PAPER.md](PAPER.md)** · Integration guide: **[docs/integration.md](docs/integration.md)** · Try it in 30 seconds: **[quickstart notebook](https://colab.research.google.com/github/ALEXaquarius/grounded-reasoning/blob/main/examples/quickstart.ipynb)**
|
|
16
|
+
|
|
17
|
+
Đọc bằng tiếng Việt: **[README.vi.md](README.vi.md)**
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Why this exists
|
|
22
|
+
|
|
23
|
+
LLMs are solid on one-hop facts but **collapse on composition** — chaining several
|
|
24
|
+
correct facts into a multi-step conclusion. On CLUTRR (kinship reasoning), DeepSeek's
|
|
25
|
+
accuracy **falls off with depth**, while a grounded operator-composition solver holds
|
|
26
|
+
**~100% flat — at zero tokens**:
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
acc
|
|
30
|
+
100% ●─────●─────●─────●─────●─────●─────● ● Grounded solver (algebra, 0 tokens)
|
|
31
|
+
90% |
|
|
32
|
+
80% ○
|
|
33
|
+
70% | ╲
|
|
34
|
+
60% | ╲
|
|
35
|
+
50% | ╲
|
|
36
|
+
40% | ○ ○ ○ DeepSeek (LLM)
|
|
37
|
+
30% | ╲ ╱ ╲
|
|
38
|
+
20% | ○─────○ ╲
|
|
39
|
+
10% | ○─────○
|
|
40
|
+
0% +──┴─────┴─────┴─────┴─────┴─────┴─────┴─
|
|
41
|
+
hop 2 3 4 5 6 7 8 (composition steps)
|
|
42
|
+
|
|
43
|
+
hop: 2 3 4 5 6 7 8
|
|
44
|
+
DeepSeek: 83% 42% 25% 25% 42% 17% 8%
|
|
45
|
+
Solver: 100% 100% 100% 100% 100% 100% 100%
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
*(CLUTRR/v1 gen_train234_test2to10, clean-chain, n=12/hop; full test set n=635: solver
|
|
49
|
+
covers 99.5%, accuracy 99.2%. `src/experiments/clutrr_eval.py`.)*
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## What it is / is NOT (honestly)
|
|
54
|
+
|
|
55
|
+
**Is:** a guaranteed reasoning-verification layer built on relation operator algebra.
|
|
56
|
+
- **Precision = 1.0, guaranteed** (Theorem G) — accepts a claim only if a grounded proof path exists.
|
|
57
|
+
- **Zero extra tokens** — local matrix multiplication, no LLM call. Compare to
|
|
58
|
+
"have the LLM self-verify," which costs +110% tokens for 34% precision.
|
|
59
|
+
- **Two-sided guarantee** (Theorem I) — precision *and* recall both have tight bounds.
|
|
60
|
+
- **No external KB required** (SGDC) — uses the LLM's own internal consistency.
|
|
61
|
+
|
|
62
|
+
**Is not:** an "unprecedented breakthrough." The Katz index, the Neumann series,
|
|
63
|
+
graph reachability, and neuro-symbolic grounding are all classical math and
|
|
64
|
+
technique. The contribution here is unification, a measured guarantee, and
|
|
65
|
+
benchmark numbers — not a new primitive. The guard needs a relation graph
|
|
66
|
+
(supplied, or extracted from LLM facts); flexibility is bounded (see
|
|
67
|
+
[PAPER §5](PAPER.md)).
|
|
68
|
+
|
|
69
|
+
### How this differs from the usual fixes
|
|
70
|
+
|
|
71
|
+
| Approach | Extra tokens | Guarantee | Needs an external KB |
|
|
72
|
+
|---|---|---|---|
|
|
73
|
+
| LLM self-verification (2nd call) | +110% | none (measured 34% precision) | no |
|
|
74
|
+
| Self-consistency / majority vote | multiplies with sample count | none, statistical only | no |
|
|
75
|
+
| RAG / external KG grounding | varies | only as good as retrieval | yes |
|
|
76
|
+
| **This guard** | **+0** | **precision = 1.0** (Theorem G) | no |
|
|
77
|
+
| **This guard, self-grounded (SGDC)** | **+0** | precision = 1.0 given sound atomic facts (Theorem I) | no |
|
|
78
|
+
| **This guard, conformal** | **+0** | coverage ≥ 1−α, distribution-free (Theorem K) | no |
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## Three theorems, one operator (F = G = H)
|
|
83
|
+
|
|
84
|
+
The reasoning core rests on a single unification (numerically verified, zero error):
|
|
85
|
+
|
|
86
|
+
| View | Theorem | Content |
|
|
87
|
+
|------|---------|---------|
|
|
88
|
+
| Fuzzy diffusion inference | **F** | conf(a→b) = Σ_{k≥1} αᵏ(Pᵏ)[a,b], calibrated + grounded |
|
|
89
|
+
| Relation operator algebra | **G** | composition = operator product, transitive closure = Σ powers |
|
|
90
|
+
| Spectral analysis (Katz) | **H** | `engine.infer` = resolvent (I−αP)⁻¹−I (matches **0.0** error) |
|
|
91
|
+
|
|
92
|
+
⟹ fuzzy inference **is** spectral analysis of the relation operator. `src/reasoning/`.
|
|
93
|
+
|
|
94
|
+
Four further theorems extend this core: **I** (two-sided precision/recall guarantee
|
|
95
|
+
for a self-grounded, no-external-KB variant), **J** (closure-learning completeness,
|
|
96
|
+
validated on CLUTRR), **K** (conformal reasoning — distribution-free coverage under a
|
|
97
|
+
*noisy* relation graph, including one extracted by an LLM from raw text), and **L**
|
|
98
|
+
(Horn forward-chaining, generalizing transitive closure to conjunctive rules). All
|
|
99
|
+
seven are stated, proved, and numerically verified in [PAPER.md](PAPER.md).
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Evidence on real LLMs (DeepSeek)
|
|
104
|
+
|
|
105
|
+
| Experiment | Result |
|
|
106
|
+
|------------|--------|
|
|
107
|
+
| Hallucination guard (kinship) | precision **33% → 100%**, catches 94/94, 0 false rejects |
|
|
108
|
+
| Guard token cost | **+0 tokens** (vs. LLM self-verify: +110% tokens, 34% precision) |
|
|
109
|
+
| SGDC (self-grounded, no external KB) | precision **78% → 100%** from internal consistency alone |
|
|
110
|
+
| CLUTRR (public benchmark) | solver **~100% at every hop** vs. DeepSeek 83%→8% |
|
|
111
|
+
| Hard passage (9-step chain) | DeepSeek **fabricates 2/10** (wrong direction); grounded system **10/10**, with proofs — [`examples/hallucination_demo.py`](examples/hallucination_demo.py) |
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## Guaranteed reasoning over a graph an LLM extracted from raw text
|
|
116
|
+
|
|
117
|
+
The guard/solver needs a **clean** graph. But if you let an **LLM extract** relations
|
|
118
|
+
from natural-language text, the graph is **noisy** (missing/spurious edges).
|
|
119
|
+
**Conformal Reasoning** (Theorem K) fixes exactly that: use operator confidence as a
|
|
120
|
+
score, calibrate a threshold ⟹ **distribution-free coverage ≥ 1−α**, even on a noisy
|
|
121
|
+
graph.
|
|
122
|
+
|
|
123
|
+
End-to-end demo: **DeepSeek extracts an "is a" graph from text** → conformal runs on
|
|
124
|
+
that extracted graph (ground truth is used only for scoring):
|
|
125
|
+
|
|
126
|
+
| Text | LLM extraction (P / R) | Coverage (target ≥90%) | Efficiency (FPR) |
|
|
127
|
+
|------|------------------------:|----------------------------:|------------------:|
|
|
128
|
+
| Easy | 100% / 99.7% | **91.3%** | 0.0 |
|
|
129
|
+
| Hard (nested clauses + near-miss distractors) | 99.5% / **68.5%** | **93.0%** | 0.77 |
|
|
130
|
+
|
|
131
|
+
> The LLM's extraction **drops ~31.5% of the edges** (recall 68.5% — a genuinely noisy graph) →
|
|
132
|
+
> **the coverage guarantee still holds** (93% ≥ 90%), only efficiency degrades.
|
|
133
|
+
> *Validity always holds; efficiency scales with graph quality.*
|
|
134
|
+
|
|
135
|
+
⟹ A path to guaranteed reasoning over **natural-language relations** — where the hard
|
|
136
|
+
guard can't reach. `src/experiments/conformal_llm_eval.py`.
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## Quickstart
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
git clone https://github.com/ALEXaquarius/grounded-reasoning
|
|
144
|
+
cd grounded-reasoning && pip install -e ".[dev]" # not yet on PyPI — install from source
|
|
145
|
+
pytest tests/ # every theorem + offline-locked logic, no network needed
|
|
146
|
+
|
|
147
|
+
# Use it right now (no LLM/network needed):
|
|
148
|
+
python -c "from grounded_reasoning import GroundedReasoner as G; r=G(); r.add_facts([('a','p','b'),('b','p','c')]); print(r.verify('a','c',via='p'))"
|
|
149
|
+
|
|
150
|
+
# Real-LLM experiments (need a key — read from an env var, NEVER hardcoded):
|
|
151
|
+
export DEEPSEEK_API_KEY=sk-... # bring your own; .env is gitignored
|
|
152
|
+
python -m src.experiments.guard_llm_eval # hallucination guard
|
|
153
|
+
python -m src.experiments.self_grounded_eval # SGDC
|
|
154
|
+
python -m src.experiments.clutrr_eval # public CLUTRR benchmark
|
|
155
|
+
python -m src.experiments.conformal_llm_eval # end-to-end conformal (LLM-extracted graph)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Integrating with an Agent / LLM (`src/agent/`)
|
|
161
|
+
|
|
162
|
+
A **relation-reasoning verifier** for agents: check a multi-hop claim **before
|
|
163
|
+
asserting it** — zero model tokens, precision guaranteed (accepts iff a grounded proof
|
|
164
|
+
path exists).
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
from grounded_reasoning import GroundedReasoner
|
|
168
|
+
gr = GroundedReasoner()
|
|
169
|
+
gr.add_facts([("alice","parent","bob"),("bob","parent","carol")])
|
|
170
|
+
gr.verify("alice","carol", via="parent") # Verdict(grounded=True, proof=['alice','bob','carol'])
|
|
171
|
+
gr.verify("alice","zed", via="parent") # Verdict(grounded=False, proof=None) ← hallucination blocked
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
Three integration paths (details: [docs/integration.md](docs/integration.md)):
|
|
175
|
+
- **Library**: `GroundedReasoner.verify / filter_claims / contradictions`.
|
|
176
|
+
- **Function-calling**: `TOOL_SPEC` (Anthropic) / `openai_tool_spec()` (OpenAI) + `run_tool` — a stateless `verify_relation` tool.
|
|
177
|
+
- **MCP server**: `python -m src.agent.mcp_server` — plugs into Claude or any MCP-compatible agent.
|
|
178
|
+
|
|
179
|
+
**Multi-provider** (not just DeepSeek): `LLMClient(provider=...)` for DeepSeek / OpenAI /
|
|
180
|
+
Groq / OpenRouter / Together / Mistral / Ollama (local) — all OpenAI-compatible, switch
|
|
181
|
+
providers without changing code. **Multilingual**: entities/relations are opaque
|
|
182
|
+
Unicode strings ⟹ works with any language (`cha`, `父`, `والد`…) with zero configuration.
|
|
183
|
+
|
|
184
|
+
A real function-calling demo (agent verifies itself, blocks hallucination):
|
|
185
|
+
`python -m src.experiments.agent_demo`. When the graph is **noisy** (relations
|
|
186
|
+
extracted by an LLM from text), use `ConformalReasoner` for a **coverage ≥1−α**
|
|
187
|
+
guarantee instead of hard precision.
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## Source map
|
|
192
|
+
|
|
193
|
+
| Path | Content |
|
|
194
|
+
|------|---------|
|
|
195
|
+
| `grounded_reasoning/` | Public package — `GroundedReasoner`, `verify_relation`, `TOOL_SPEC`, `ConformalReasoner`, `LLMClient` |
|
|
196
|
+
| `src/agent/{verifier,tool,mcp_server}.py` | Public API implementation — `GroundedReasoner` verifier, function-calling tool, MCP server |
|
|
197
|
+
| `src/reasoning/abstract_inference.py` | FuzzyInferenceEngine, TypedInferenceEngine, HallucinationGuard (Theorem F) |
|
|
198
|
+
| `src/reasoning/operator_algebra.py` | Relation operator algebra (Theorem G) |
|
|
199
|
+
| `src/reasoning/relation_spectrum.py` | Spectrum, nilpotency, Katz resolvent (Theorem H) |
|
|
200
|
+
| `src/reasoning/conformal_reasoning.py` | Conformal — coverage guarantee under noise (Theorem K) |
|
|
201
|
+
| `src/reasoning/composition_algebra.py` | Composition-table learning, validated on CLUTRR (Theorem J) |
|
|
202
|
+
| `src/reasoning/horn.py` | Horn forward-chaining, least-model semantics (Theorem L) |
|
|
203
|
+
| `src/reasoning/llm_client.py` | Provider-agnostic LLM client (key read from an env var) |
|
|
204
|
+
| `src/theory/theorems.py` | **Seven theorems (F–L)** with numerical verification |
|
|
205
|
+
| `src/experiments/{guard_llm,self_grounded,nl_ontology,guard_cost,clutrr,conformal_llm,inference}_eval.py` | Real-LLM and benchmark experiments backing every claim above |
|
|
206
|
+
| `examples/hallucination_demo.py` | End-to-end function-calling demo |
|
|
207
|
+
| `examples/quickstart.ipynb` | Runnable tour of the library (offline, Colab-ready) |
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
## Origin story
|
|
212
|
+
|
|
213
|
+
This project began as an attempt to invent an embedding-free retrieval algorithm that
|
|
214
|
+
could compete with dense/RAG retrieval. That research question reached a rigorous,
|
|
215
|
+
fully honest **negative** conclusion (ties BM25, loses significantly to dense
|
|
216
|
+
embeddings — with a proof of why). The same mathematical toolkit — operator algebra,
|
|
217
|
+
spectral analysis — turned out to have real, measurable value on a different problem:
|
|
218
|
+
**guaranteeing** multi-hop relational reasoning. This repository ships only that
|
|
219
|
+
validated, tested reasoning system; the full retrieval research trail (including every
|
|
220
|
+
failed attempt, honestly recorded) lives in a separate research repository and is not
|
|
221
|
+
part of this package. See [PAPER.md §1](PAPER.md) for the full framing.
|
|
222
|
+
|
|
223
|
+
---
|
|
224
|
+
|
|
225
|
+
## Contributing & Community
|
|
226
|
+
|
|
227
|
+
- How to contribute + research principles: [CONTRIBUTING.md](CONTRIBUTING.md)
|
|
228
|
+
- Code of conduct: [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) · Security: [SECURITY.md](SECURITY.md)
|
|
229
|
+
- Version history: [CHANGELOG.md](CHANGELOG.md) · Citation: [CITATION.cff](CITATION.cff)
|
|
230
|
+
- License: **MIT** ([LICENSE](LICENSE))
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
*Principles: proof before code, formal definitions, falsifiability, and honest
|
|
235
|
+
reporting of negative results — see [CONTRIBUTING.md](CONTRIBUTING.md).*
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
grounded-reasoning — a relation-reasoning verifier for LLMs and agents.
|
|
3
|
+
|
|
4
|
+
Public API (clean import surface):
|
|
5
|
+
|
|
6
|
+
from grounded_reasoning import GroundedReasoner, verify_relation, TOOL_SPEC
|
|
7
|
+
|
|
8
|
+
- GroundedReasoner : load facts, then verify / filter_claims / contradictions.
|
|
9
|
+
- verify_relation : a stateless function-calling tool (0 tokens, returns a proof).
|
|
10
|
+
- TOOL_SPEC / openai_tool_spec : tool schemas (Anthropic / OpenAI).
|
|
11
|
+
- ConformalReasoner : distribution-free coverage guarantee >=1-alpha under a noisy graph.
|
|
12
|
+
- LLMClient : a multi-provider (OpenAI-compatible) client for demos/experiments.
|
|
13
|
+
"""
|
|
14
|
+
from src.agent import (
|
|
15
|
+
TOOL_SPEC,
|
|
16
|
+
GroundedReasoner,
|
|
17
|
+
Verdict,
|
|
18
|
+
openai_tool_spec,
|
|
19
|
+
run_tool,
|
|
20
|
+
verify_relation,
|
|
21
|
+
)
|
|
22
|
+
from src.reasoning.conformal_reasoning import ConformalReasoner, conformal_threshold
|
|
23
|
+
from src.reasoning.llm_client import LLMClient
|
|
24
|
+
|
|
25
|
+
__version__ = "0.1.0"
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"GroundedReasoner",
|
|
29
|
+
"Verdict",
|
|
30
|
+
"verify_relation",
|
|
31
|
+
"run_tool",
|
|
32
|
+
"TOOL_SPEC",
|
|
33
|
+
"openai_tool_spec",
|
|
34
|
+
"ConformalReasoner",
|
|
35
|
+
"conformal_threshold",
|
|
36
|
+
"LLMClient",
|
|
37
|
+
"__version__",
|
|
38
|
+
]
|