groundlens 2026.4.22__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groundlens-2026.4.22/.gitignore +47 -0
- groundlens-2026.4.22/LICENSE +21 -0
- groundlens-2026.4.22/PKG-INFO +437 -0
- groundlens-2026.4.22/README.md +357 -0
- groundlens-2026.4.22/pyproject.toml +207 -0
- groundlens-2026.4.22/src/groundlens/__init__.py +61 -0
- groundlens-2026.4.22/src/groundlens/_internal/__init__.py +1 -0
- groundlens-2026.4.22/src/groundlens/_internal/csv_loader.py +132 -0
- groundlens-2026.4.22/src/groundlens/_internal/embeddings.py +110 -0
- groundlens-2026.4.22/src/groundlens/_internal/geometry.py +125 -0
- groundlens-2026.4.22/src/groundlens/_internal/thresholds.py +88 -0
- groundlens-2026.4.22/src/groundlens/_version.py +3 -0
- groundlens-2026.4.22/src/groundlens/calibrate.py +186 -0
- groundlens-2026.4.22/src/groundlens/cli/__init__.py +0 -0
- groundlens-2026.4.22/src/groundlens/cli/main.py +498 -0
- groundlens-2026.4.22/src/groundlens/data/__init__.py +1 -0
- groundlens-2026.4.22/src/groundlens/data/reference_pairs.csv +20 -0
- groundlens-2026.4.22/src/groundlens/dgi.py +306 -0
- groundlens-2026.4.22/src/groundlens/evaluate.py +151 -0
- groundlens-2026.4.22/src/groundlens/integrations/__init__.py +3 -0
- groundlens-2026.4.22/src/groundlens/integrations/autogen/__init__.py +12 -0
- groundlens-2026.4.22/src/groundlens/integrations/autogen/checker.py +192 -0
- groundlens-2026.4.22/src/groundlens/integrations/crewai/__init__.py +12 -0
- groundlens-2026.4.22/src/groundlens/integrations/crewai/tool.py +155 -0
- groundlens-2026.4.22/src/groundlens/integrations/langchain/__init__.py +15 -0
- groundlens-2026.4.22/src/groundlens/integrations/langchain/callback.py +156 -0
- groundlens-2026.4.22/src/groundlens/integrations/langchain/evaluator.py +166 -0
- groundlens-2026.4.22/src/groundlens/integrations/semantic_kernel/__init__.py +13 -0
- groundlens-2026.4.22/src/groundlens/integrations/semantic_kernel/filter.py +152 -0
- groundlens-2026.4.22/src/groundlens/providers/__init__.py +15 -0
- groundlens-2026.4.22/src/groundlens/providers/_base.py +86 -0
- groundlens-2026.4.22/src/groundlens/providers/anthropic.py +172 -0
- groundlens-2026.4.22/src/groundlens/providers/google.py +162 -0
- groundlens-2026.4.22/src/groundlens/providers/openai.py +169 -0
- groundlens-2026.4.22/src/groundlens/py.typed +0 -0
- groundlens-2026.4.22/src/groundlens/score.py +118 -0
- groundlens-2026.4.22/src/groundlens/sgi.py +168 -0
- groundlens-2026.4.22/tests/__init__.py +0 -0
- groundlens-2026.4.22/tests/conftest.py +156 -0
- groundlens-2026.4.22/tests/integration/__init__.py +0 -0
- groundlens-2026.4.22/tests/integration/test_dgi.py +108 -0
- groundlens-2026.4.22/tests/integration/test_evaluate.py +166 -0
- groundlens-2026.4.22/tests/integration/test_sgi.py +136 -0
- groundlens-2026.4.22/tests/integrations/__init__.py +0 -0
- groundlens-2026.4.22/tests/integrations/test_autogen.py +114 -0
- groundlens-2026.4.22/tests/integrations/test_crewai.py +96 -0
- groundlens-2026.4.22/tests/integrations/test_langchain.py +138 -0
- groundlens-2026.4.22/tests/integrations/test_semantic_kernel.py +102 -0
- groundlens-2026.4.22/tests/providers/__init__.py +0 -0
- groundlens-2026.4.22/tests/providers/test_anthropic.py +107 -0
- groundlens-2026.4.22/tests/providers/test_google.py +126 -0
- groundlens-2026.4.22/tests/providers/test_openai.py +120 -0
- groundlens-2026.4.22/tests/unit/__init__.py +0 -0
- groundlens-2026.4.22/tests/unit/test_calibrate.py +151 -0
- groundlens-2026.4.22/tests/unit/test_cli.py +299 -0
- groundlens-2026.4.22/tests/unit/test_csv_loader.py +199 -0
- groundlens-2026.4.22/tests/unit/test_dgi_sgi_validation.py +319 -0
- groundlens-2026.4.22/tests/unit/test_embeddings.py +73 -0
- groundlens-2026.4.22/tests/unit/test_evaluate.py +121 -0
- groundlens-2026.4.22/tests/unit/test_geometry.py +240 -0
- groundlens-2026.4.22/tests/unit/test_score.py +190 -0
- groundlens-2026.4.22/tests/unit/test_thresholds.py +135 -0
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
*.egg
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
*.whl
|
|
10
|
+
.eggs/
|
|
11
|
+
|
|
12
|
+
# Virtual environments
|
|
13
|
+
.venv/
|
|
14
|
+
venv/
|
|
15
|
+
env/
|
|
16
|
+
.env
|
|
17
|
+
|
|
18
|
+
# IDE
|
|
19
|
+
.vscode/
|
|
20
|
+
.idea/
|
|
21
|
+
*.swp
|
|
22
|
+
*.swo
|
|
23
|
+
*~
|
|
24
|
+
|
|
25
|
+
# Testing / Coverage
|
|
26
|
+
.pytest_cache/
|
|
27
|
+
.mypy_cache/
|
|
28
|
+
.ruff_cache/
|
|
29
|
+
htmlcov/
|
|
30
|
+
.coverage
|
|
31
|
+
.coverage.*
|
|
32
|
+
coverage.xml
|
|
33
|
+
|
|
34
|
+
# Documentation
|
|
35
|
+
site/
|
|
36
|
+
|
|
37
|
+
# OS
|
|
38
|
+
.DS_Store
|
|
39
|
+
Thumbs.db
|
|
40
|
+
|
|
41
|
+
# Embedding model cache
|
|
42
|
+
.cache/
|
|
43
|
+
|
|
44
|
+
# Secrets
|
|
45
|
+
*.key
|
|
46
|
+
*.pem
|
|
47
|
+
.env.local
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Javier Marin
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: groundlens
|
|
3
|
+
Version: 2026.4.22
|
|
4
|
+
Summary: Geometric LLM hallucination detection. No second LLM. Deterministic. Auditable.
|
|
5
|
+
Project-URL: Homepage, https://groundlens.dev
|
|
6
|
+
Project-URL: Documentation, https://docs.groundlens.dev
|
|
7
|
+
Project-URL: Repository, https://github.com/groundlens-dev/groundlens
|
|
8
|
+
Project-URL: Issues, https://github.com/groundlens-dev/groundlens/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/groundlens-dev/groundlens/blob/main/CHANGELOG.md
|
|
10
|
+
Project-URL: Research (SGI), https://arxiv.org/abs/2512.13771
|
|
11
|
+
Project-URL: Research (DGI), https://arxiv.org/pdf/2602.13224v3
|
|
12
|
+
Project-URL: Research (Benchmark), https://arxiv.org/abs/2603.13259
|
|
13
|
+
Author-email: Javier Marin <javier@jmarin.info>
|
|
14
|
+
License-Expression: MIT
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Keywords: ai-safety,dgi,embedding-geometry,eu-ai-act,factual-accuracy,grounding,hallucination-detection,llm-evaluation,rag,sgi
|
|
17
|
+
Classifier: Development Status :: 4 - Beta
|
|
18
|
+
Classifier: Intended Audience :: Developers
|
|
19
|
+
Classifier: Intended Audience :: Science/Research
|
|
20
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
21
|
+
Classifier: Programming Language :: Python :: 3
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
27
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
28
|
+
Classifier: Typing :: Typed
|
|
29
|
+
Requires-Python: >=3.10
|
|
30
|
+
Requires-Dist: numpy>=1.24.0
|
|
31
|
+
Requires-Dist: sentence-transformers<4.0.0,>=2.7.0
|
|
32
|
+
Provides-Extra: all
|
|
33
|
+
Requires-Dist: anthropic>=0.25.0; extra == 'all'
|
|
34
|
+
Requires-Dist: autogen-agentchat>=0.4.0; extra == 'all'
|
|
35
|
+
Requires-Dist: crewai>=0.80.0; extra == 'all'
|
|
36
|
+
Requires-Dist: google-generativeai>=0.5.0; extra == 'all'
|
|
37
|
+
Requires-Dist: langchain-core>=0.3.0; extra == 'all'
|
|
38
|
+
Requires-Dist: langsmith>=0.1.0; extra == 'all'
|
|
39
|
+
Requires-Dist: openai>=1.0.0; extra == 'all'
|
|
40
|
+
Requires-Dist: semantic-kernel>=1.0.0; extra == 'all'
|
|
41
|
+
Provides-Extra: anthropic
|
|
42
|
+
Requires-Dist: anthropic>=0.25.0; extra == 'anthropic'
|
|
43
|
+
Provides-Extra: autogen
|
|
44
|
+
Requires-Dist: autogen-agentchat>=0.4.0; extra == 'autogen'
|
|
45
|
+
Provides-Extra: crewai
|
|
46
|
+
Requires-Dist: crewai>=0.80.0; extra == 'crewai'
|
|
47
|
+
Provides-Extra: dev
|
|
48
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
49
|
+
Requires-Dist: pip-audit>=2.7; extra == 'dev'
|
|
50
|
+
Requires-Dist: pre-commit>=3.7; extra == 'dev'
|
|
51
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: pytest-mock>=3.12; extra == 'dev'
|
|
53
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
54
|
+
Requires-Dist: ruff>=0.5.0; extra == 'dev'
|
|
55
|
+
Provides-Extra: docs
|
|
56
|
+
Requires-Dist: mkdocs-gen-files>=0.5; extra == 'docs'
|
|
57
|
+
Requires-Dist: mkdocs-literate-nav>=0.6; extra == 'docs'
|
|
58
|
+
Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
|
|
59
|
+
Requires-Dist: mkdocstrings[python]>=0.25; extra == 'docs'
|
|
60
|
+
Provides-Extra: google
|
|
61
|
+
Requires-Dist: google-generativeai>=0.5.0; extra == 'google'
|
|
62
|
+
Provides-Extra: integrations
|
|
63
|
+
Requires-Dist: autogen-agentchat>=0.4.0; extra == 'integrations'
|
|
64
|
+
Requires-Dist: crewai>=0.80.0; extra == 'integrations'
|
|
65
|
+
Requires-Dist: langchain-core>=0.3.0; extra == 'integrations'
|
|
66
|
+
Requires-Dist: langsmith>=0.1.0; extra == 'integrations'
|
|
67
|
+
Requires-Dist: semantic-kernel>=1.0.0; extra == 'integrations'
|
|
68
|
+
Provides-Extra: langchain
|
|
69
|
+
Requires-Dist: langchain-core>=0.3.0; extra == 'langchain'
|
|
70
|
+
Requires-Dist: langsmith>=0.1.0; extra == 'langchain'
|
|
71
|
+
Provides-Extra: openai
|
|
72
|
+
Requires-Dist: openai>=1.0.0; extra == 'openai'
|
|
73
|
+
Provides-Extra: providers
|
|
74
|
+
Requires-Dist: anthropic>=0.25.0; extra == 'providers'
|
|
75
|
+
Requires-Dist: google-generativeai>=0.5.0; extra == 'providers'
|
|
76
|
+
Requires-Dist: openai>=1.0.0; extra == 'providers'
|
|
77
|
+
Provides-Extra: semantic-kernel
|
|
78
|
+
Requires-Dist: semantic-kernel>=1.0.0; extra == 'semantic-kernel'
|
|
79
|
+
Description-Content-Type: text/markdown
|
|
80
|
+
|
|
81
|
+
<div align="center">
|
|
82
|
+
<img src="docs/assets/Logo_groundlens_new-05.png" alt="groundlens" width="200">
|
|
83
|
+
</div>
|
|
84
|
+
|
|
85
|
+
# Geometric LLM hallucination detection. No second LLM. Deterministic. Auditable.
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
[](https://github.com/groundlens-dev/groundlens)
|
|
90
|
+
[](https://opensource.org/licenses/MIT)
|
|
91
|
+
[](https://github.com/groundlens-dev/groundlens/actions)
|
|
92
|
+
[](https://codecov.io/gh/groundlens-dev/groundlens)
|
|
93
|
+
[](https://docs.groundlens.dev)
|
|
94
|
+
[](https://github.com/groundlens-dev/groundlens/releases)
|
|
95
|
+
|
|
96
|
+
[Documentation](https://docs.groundlens.dev) | [Research Papers](#research) | [Examples](examples/) | [Vision](VISION.md) | [Contributing](CONTRIBUTING.md)
|
|
97
|
+
|
|
98
|
+
</div>
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
***groundlens*** detects LLM hallucinations using embedding geometry instead of a second LLM. It computes deterministic, auditable scores from the spatial relationships between questions, responses, and source context in an embedding space. The result is a verification signal you can explain in an audit, reproduce on demand, and run in regulated environments.
|
|
103
|
+
|
|
104
|
+
## Why ***groundlens***?
|
|
105
|
+
|
|
106
|
+
| Problem | How groundlens solves it |
|
|
107
|
+
|---|---|
|
|
108
|
+
| Second-LLM judges are non-deterministic and expensive | Single embedding model (`all-MiniLM-L6-v2`), deterministic output, sub-second latency |
|
|
109
|
+
| Probabilistic scores cannot be audited | Geometric ratios and angular measurements with clear mathematical definitions |
|
|
110
|
+
| Regulatory compliance requires explainability | Every score traces to Euclidean distances and cosine similarities in $\mathbf{R}^n$ (the n-dimensional real vector space of query/answer embeddings) |
|
|
111
|
+
| One method does not fit all use cases | SGI for RAG/context verification, DGI for context-free chat, `evaluate()` auto-selects |
|
|
112
|
+
|
|
113
|
+
`SGI`: Semantic Grounding Index | `DGI`: Directional Grounding Index
|
|
114
|
+
|
|
115
|
+
## I want to...
|
|
116
|
+
|
|
117
|
+
| Goal | Start here |
|
|
118
|
+
|---|---|
|
|
119
|
+
| **Verify my RAG pipeline outputs** | [SGI quick start](#sgi----with-context-rag-verification) · [RAG verification guide](https://docs.groundlens.dev/guides/rag-verification/) |
|
|
120
|
+
| **Score chat responses without context** | [DGI quick start](#dgi----without-context) · [DGI deep dive](https://docs.groundlens.dev/concepts/dgi/) |
|
|
121
|
+
| **Evaluate a batch of outputs** | [Batch evaluation](#batch-evaluation) · [Batch guide](https://docs.groundlens.dev/guides/batch-evaluation/) |
|
|
122
|
+
| **Wrap my LLM provider with auto-scoring** | [Provider guard](#llm-provider-guard) · [Providers docs](https://docs.groundlens.dev/providers/openai/) |
|
|
123
|
+
| **Integrate with LangChain / CrewAI / etc.** | [Integrations](#providers-and-integrations) · [Integration docs](https://docs.groundlens.dev/integrations/langchain/) |
|
|
124
|
+
| **Improve accuracy for my domain** | [Domain calibration](#domain-calibration) · [Calibration guide](https://docs.groundlens.dev/guides/domain-calibration/) |
|
|
125
|
+
| **Comply with the EU AI Act** | [EU AI Act guide](https://docs.groundlens.dev/guides/eu-ai-act/) |
|
|
126
|
+
| **Understand the math** | [How it works](https://docs.groundlens.dev/concepts/how-it-works/) · [Research papers](#research) |
|
|
127
|
+
| **Understand what it can and cannot detect** | [Hallucination taxonomy](#taxonomy-of-llm-hallucinations) |
|
|
128
|
+
| **Check my environment is set up correctly** | [`groundlens doctor`](#cli) |
|
|
129
|
+
| **Contribute** | [CONTRIBUTING.md](CONTRIBUTING.md) · [CLAUDE.md](CLAUDE.md) · [AGENTS.md](AGENTS.md) |
|
|
130
|
+
|
|
131
|
+
## Installation
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
pip install groundlens
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
With LLM provider support:
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
pip install "groundlens[openai]" # OpenAI
|
|
141
|
+
pip install "groundlens[anthropic]" # Anthropic
|
|
142
|
+
pip install "groundlens[google]" # Google Generative AI
|
|
143
|
+
pip install "groundlens[providers]" # All providers
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
With framework integrations:
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
pip install "groundlens[langchain]" # LangChain
|
|
150
|
+
pip install "groundlens[crewai]" # CrewAI
|
|
151
|
+
pip install "groundlens[semantic-kernel]" # Semantic Kernel
|
|
152
|
+
pip install "groundlens[autogen]" # AutoGen
|
|
153
|
+
pip install "groundlens[all]" # Everything
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
**Requirements:** Python 3.10+, numpy, sentence-transformers.
|
|
157
|
+
|
|
158
|
+
## Quick start
|
|
159
|
+
|
|
160
|
+
### SGI -- with context (RAG verification)
|
|
161
|
+
|
|
162
|
+
SGI (Semantic Grounding Index) measures whether a response engaged with the provided context or stayed anchored to the question. It requires three inputs.
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
from groundlens import compute_sgi
|
|
166
|
+
|
|
167
|
+
result = compute_sgi(
|
|
168
|
+
question="What is the capital of France?",
|
|
169
|
+
context="France is in Western Europe. Its capital is Paris.",
|
|
170
|
+
response="The capital of France is Paris.",
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
print(result.value) # 1.23 — ratio of distances
|
|
174
|
+
print(result.normalized) # 0.61 — mapped to [0, 1]
|
|
175
|
+
print(result.flagged) # False — above review threshold
|
|
176
|
+
print(result.explanation) # "SGI=1.230 — strong context engagement (pass)"
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
**Interpretation:** `SGI > 1.0` means the response is closer to the context than to the question in embedding space. The response engaged with the source material.
|
|
180
|
+
|
|
181
|
+
### DGI -- without context
|
|
182
|
+
|
|
183
|
+
DGI (Directional Grounding Index) detects hallucinations without requiring source context. It checks whether the question-to-response displacement vector aligns with the characteristic direction of verified grounded responses.
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from groundlens import compute_dgi
|
|
187
|
+
|
|
188
|
+
result = compute_dgi(
|
|
189
|
+
question="What causes seasons on Earth?",
|
|
190
|
+
response="Seasons are caused by Earth's 23.5-degree axial tilt.",
|
|
191
|
+
)
|
|
192
|
+
|
|
193
|
+
print(result.value) # 0.42 — cosine similarity to reference direction
|
|
194
|
+
print(result.normalized) # 0.71 — mapped to [0, 1]
|
|
195
|
+
print(result.flagged) # False — above pass threshold (0.30)
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
**Domain calibration** improves DGI accuracy from AUROC ~0.8 with a basic calibration to 0.90-0.99 with domain-specific calibration:
|
|
199
|
+
|
|
200
|
+
```python
|
|
201
|
+
from groundlens import compute_dgi
|
|
202
|
+
|
|
203
|
+
result = compute_dgi(
|
|
204
|
+
question="What is the statute of limitations for breach of contract in California?",
|
|
205
|
+
response="Four years under California Code of Civil Procedure Section 337.",
|
|
206
|
+
reference_csv="legal_calibration_pairs.csv",
|
|
207
|
+
)
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
### evaluate() -- auto-select
|
|
211
|
+
|
|
212
|
+
The `evaluate()` function picks the right method automatically: SGI when context is provided, DGI when it is not.
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
from groundlens import evaluate
|
|
216
|
+
|
|
217
|
+
# With context -> SGI
|
|
218
|
+
score = evaluate(
|
|
219
|
+
question="What is X?",
|
|
220
|
+
response="X is Y.",
|
|
221
|
+
context="According to the manual, X is Y.",
|
|
222
|
+
)
|
|
223
|
+
assert score.method == "sgi"
|
|
224
|
+
|
|
225
|
+
# Without context -> DGI
|
|
226
|
+
score = evaluate(
|
|
227
|
+
question="What is X?",
|
|
228
|
+
response="X is Y.",
|
|
229
|
+
)
|
|
230
|
+
assert score.method == "dgi"
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
### Batch evaluation
|
|
234
|
+
|
|
235
|
+
```python
|
|
236
|
+
from groundlens import evaluate_batch
|
|
237
|
+
|
|
238
|
+
items = [
|
|
239
|
+
{"question": "Q1?", "response": "A1.", "context": "Source."},
|
|
240
|
+
{"question": "Q2?", "response": "A2."},
|
|
241
|
+
{"question": "Q3?", "response": "A3.", "context": "Reference."},
|
|
242
|
+
]
|
|
243
|
+
|
|
244
|
+
results = evaluate_batch(items)
|
|
245
|
+
flagged = [r for r in results if r.flagged]
|
|
246
|
+
print(f"{len(flagged)}/{len(results)} flagged for review")
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
### CLI
|
|
250
|
+
|
|
251
|
+
```bash
|
|
252
|
+
# Check environment health
|
|
253
|
+
groundlens doctor
|
|
254
|
+
|
|
255
|
+
# Single response check
|
|
256
|
+
groundlens check \
|
|
257
|
+
--question "What is the capital of France?" \
|
|
258
|
+
--response "The capital of France is Paris." \
|
|
259
|
+
--context "France is in Western Europe. Its capital is Paris."
|
|
260
|
+
|
|
261
|
+
# Batch CSV evaluation
|
|
262
|
+
groundlens evaluate input.csv --output results.csv
|
|
263
|
+
|
|
264
|
+
# Domain calibration
|
|
265
|
+
groundlens calibrate --pairs domain_pairs.csv --output calibration.json
|
|
266
|
+
|
|
267
|
+
# Run the confabulation benchmark
|
|
268
|
+
groundlens benchmark
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
### LLM provider guard
|
|
272
|
+
|
|
273
|
+
```python
|
|
274
|
+
from groundlens.providers.openai import OpenAIProvider
|
|
275
|
+
|
|
276
|
+
provider = OpenAIProvider(model="gpt-4o")
|
|
277
|
+
response = provider.complete(
|
|
278
|
+
prompt="Summarize this document.",
|
|
279
|
+
context="The document text here...",
|
|
280
|
+
)
|
|
281
|
+
|
|
282
|
+
if response.groundlens_score and response.groundlens_score.flagged:
|
|
283
|
+
print("Hallucination risk detected — review recommended.")
|
|
284
|
+
else:
|
|
285
|
+
print(response.text)
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
## Taxonomy of LLM hallucinations
|
|
289
|
+
|
|
290
|
+
Not all hallucinations are the same. Groundlens is built on a [geometric taxonomy](https://docs.groundlens.dev/theory/hallucination-taxonomy/) ([arXiv:2602.13224](https://arxiv.org/pdf/2602.13224v3)) that classifies hallucinations by their geometric signature in embedding space — which determines whether they are detectable and which scoring method applies.
|
|
291
|
+
|
|
292
|
+
<div align="center">
|
|
293
|
+
<img src="docs/assets/taxonomy.png" alt="groundlens" width="400">
|
|
294
|
+
<br>
|
|
295
|
+
<sub>Every text maps to a point on the hypersphere S<sup>d−1</sup>. The question <b>q</b> and context <b>c</b> define a geodesic arc. Grounded responses (blue) fall inside the plausibility region 𝒫<sub>q</sub>. <b>Type I</b> (purple) stays near q — the response ignored the context. <b>Type II</b> (red) deviates far from both q and c — invented content. <b>Type III</b> (pink) lands inside 𝒫<sub>q</sub> alongside the correct answer — same vocabulary and structure, wrong facts, geometrically indistinguishable.</sub>
|
|
296
|
+
</div>
|
|
297
|
+
<br>
|
|
298
|
+
|
|
299
|
+
| Type | What happens | Example | Detection |
|
|
300
|
+
|---|---|---|---|
|
|
301
|
+
| **Type I — Unfaithfulness** | Response ignores the provided source and defaults to the question | RAG system returns an answer from memory instead of from the retrieved document | **SGI** (distance ratio) |
|
|
302
|
+
| **Type II — Confabulation** | Response invents content outside the topic's vocabulary | Asked about CRISPR gene editing, the model describes protein-folding correction instead | **DGI** (displacement direction) |
|
|
303
|
+
| **Type III — Within-frame error** | Response uses the right vocabulary and structure but gets the facts wrong | "The capital of Australia is Canberra" vs. "The capital of Australia is Sydney" — same frame, wrong city | **Undetectable by geometry** |
|
|
304
|
+
|
|
305
|
+
**Why Type III is undetectable:** Sentence embeddings encode distributional similarity (vocabulary, syntax, co-occurrence), not truth value. Two responses that share the same words, entities, and syntactic frame land in the same region of embedding space regardless of which one is correct. This is not a limitation of groundlens — it is a property of the distributional hypothesis (Harris, 1954) that constrains every embedding-based method, including NLI (which *inverts* to AUROC 0.311 on TruthfulQA, actively favoring false answers over truthful ones).
|
|
306
|
+
|
|
307
|
+
**Implications:** Groundlens is **verification triage** — it detects the hallucination types that leave geometric traces (Types I and II), which are the most common and most damaging in production. For Type III errors in high-stakes domains (medical, legal, financial), complement groundlens with claim-level fact-checking tools on the outputs that pass geometric verification. See [Complementary Tools for Type III](https://docs.groundlens.dev/theory/confabulation-boundary/#complementary-tools-for-type-iii-detection).
|
|
308
|
+
|
|
309
|
+
## Scoring methods
|
|
310
|
+
|
|
311
|
+
Each scoring method targets a specific hallucination type from the taxonomy above.
|
|
312
|
+
|
|
313
|
+
### SGI (Semantic Grounding Index) — detects Type I
|
|
314
|
+
|
|
315
|
+
When context is available, SGI measures whether the response engaged with the source or stayed anchored to the question:
|
|
316
|
+
|
|
317
|
+
```
|
|
318
|
+
SGI = dist(phi(response), phi(question)) / dist(phi(response), phi(context))
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
| Score | Interpretation |
|
|
322
|
+
|---|---|
|
|
323
|
+
| SGI > 1.20 | Strong context engagement (pass) |
|
|
324
|
+
| 0.95 < SGI < 1.20 | Partial engagement (review recommended) |
|
|
325
|
+
| SGI < 0.95 | Weak engagement (flagged — possible Type I) |
|
|
326
|
+
|
|
327
|
+
### DGI (Directional Grounding Index) — detects Type II
|
|
328
|
+
|
|
329
|
+
When no context is available, DGI checks whether the question-to-response displacement aligns with a learned "grounded direction":
|
|
330
|
+
|
|
331
|
+
```
|
|
332
|
+
delta = phi(response) - phi(question)
|
|
333
|
+
DGI = dot(delta / ||delta||, mu_hat)
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
| Score | Interpretation |
|
|
337
|
+
|---|---|
|
|
338
|
+
| DGI > 0.30 $^1$ | Aligns with grounded patterns (pass) |
|
|
339
|
+
| 0.00 < DGI < 0.30 | Weak alignment (flagged — possible Type II) |
|
|
340
|
+
| DGI < 0.00 | Opposes grounded direction (high risk) |
|
|
341
|
+
|
|
342
|
+
$^1$ This score corresponds to a general calibration. In domain-specific calibrations the score can vary.
|
|
343
|
+
|
|
344
|
+
## Providers and integrations
|
|
345
|
+
|
|
346
|
+
| Component | Install extra | Description |
|
|
347
|
+
|---|---|---|
|
|
348
|
+
| OpenAI | `openai` | Wraps `openai` SDK with automatic scoring |
|
|
349
|
+
| Anthropic | `anthropic` | Wraps `anthropic` SDK with automatic scoring |
|
|
350
|
+
| Google | `google` | Wraps `google-generativeai` with automatic scoring |
|
|
351
|
+
| LangChain | `langchain` | Evaluator + callback handler |
|
|
352
|
+
| CrewAI | `crewai` | Tool for agent pipelines |
|
|
353
|
+
| Semantic Kernel | `semantic-kernel` | Function calling filter |
|
|
354
|
+
| AutoGen | `autogen` | Agent chat checker |
|
|
355
|
+
|
|
356
|
+
## Domain calibration
|
|
357
|
+
|
|
358
|
+
Generic DGI uses a bundled reference direction that achieves AUROC ~0.8 with a basic calibration. For production use, a domain-specific calibration can be applied (a minimum of 200 queries recommended):
|
|
359
|
+
|
|
360
|
+
```python
|
|
361
|
+
from groundlens import calibrate
|
|
362
|
+
|
|
363
|
+
result = calibrate(csv_path="my_domain_pairs.csv")
|
|
364
|
+
print(f"Concentration: {result.concentration:.2f}")
|
|
365
|
+
result.save("calibration.json")
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
Domain-specific calibration typically reaches AUROC 0.90-0.99. The confabulation benchmark (arXiv:2603.13259) reports DGI AUROC 0.958 with domain calibration.
|
|
369
|
+
|
|
370
|
+
## Architecture
|
|
371
|
+
|
|
372
|
+
```
|
|
373
|
+
┌─────────────────────────────────────────────┐
|
|
374
|
+
│             Public API (evaluate)           │
|
|
375
|
+
├──────────────────┬──────────────────────────┤
|
|
376
|
+
│  SGI (sgi.py)    │  DGI (dgi.py)            │
|
|
377
|
+
├──────────────────┴──────────────────────────┤
|
|
378
|
+
│      _internal (geometry, embeddings)       │
|
|
379
|
+
├─────────────────────────────────────────────┤
|
|
380
|
+
│  sentence-transformers (all-MiniLM-L6-v2)   │
|
|
381
|
+
└─────────────────────────────────────────────┘
|
|
382
|
+
        ▲                    ▲
|
|
383
|
+
        │                    │
|
|
384
|
+
āāāāāāā“āāāāāāā āāāāāāāāā“āāāāāāā
|
|
385
|
+
  │ Providers  │     │ Integrations │
|
|
386
|
+
  │ (OpenAI,   │     │ (LangChain,  │
|
|
387
|
+
  │ Anthropic, │     │  CrewAI,     │
|
|
388
|
+
  │ Google)    │     │  SK, AutoGen │
|
|
389
|
+
  └────────────┘     └──────────────┘
|
|
390
|
+
```
|
|
391
|
+
|
|
392
|
+
See [AGENTS.md](AGENTS.md) for detailed file-by-file documentation. See [CLAUDE.md](CLAUDE.md) for AI-assisted development guidelines.
|
|
393
|
+
|
|
394
|
+
## Research
|
|
395
|
+
|
|
396
|
+
groundlens implements the methods described in three research papers:
|
|
397
|
+
|
|
398
|
+
1. **Semantic Grounding Index (SGI)**
|
|
399
|
+
Marin, J. (2025). *Semantic Grounding Index for LLM Hallucination Detection.*
|
|
400
|
+
[arXiv:2512.13771](https://arxiv.org/abs/2512.13771)
|
|
401
|
+
|
|
402
|
+
2. **Directional Grounding Index (DGI)**
|
|
403
|
+
Marin, J. (2026). *A Geometric Taxonomy of Hallucinations in Large Language Models.*
|
|
404
|
+
[arXiv:2602.13224](https://arxiv.org/pdf/2602.13224v3)
|
|
405
|
+
|
|
406
|
+
3. **Mechanistic Interpretability**
|
|
407
|
+
Marin, J. (2026). *Rotational Dynamics of Factual Constraint Processing in Large Language Models.*
|
|
408
|
+
[arXiv:2603.13259](https://arxiv.org/abs/2603.13259)
|
|
409
|
+
|
|
410
|
+
4. **Hallucination Benchmark**
|
|
411
|
+
https://github.com/groundlens-dev/grounding-benchmark/blob/4abf98ec5d2f846850a44f713115323659c2a793/paper/A_Methodology_for_Building_Human_Confabulated_Hallucination_Benchmarks.pdf
|
|
412
|
+
|
|
413
|
+
## Security
|
|
414
|
+
|
|
415
|
+
See [SECURITY.md](SECURITY.md) for vulnerability reporting, scope, and response timelines.
|
|
416
|
+
|
|
417
|
+
## Contributing
|
|
418
|
+
|
|
419
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, code standards, and PR process.
|
|
420
|
+
|
|
421
|
+
```bash
|
|
422
|
+
# Quick start for contributors
|
|
423
|
+
git clone https://github.com/groundlens-dev/groundlens.git
|
|
424
|
+
cd groundlens
|
|
425
|
+
pip install -e ".[dev]"
|
|
426
|
+
pre-commit install
|
|
427
|
+
groundlens doctor # verify your environment
|
|
428
|
+
pytest tests/unit/ # run fast tests
|
|
429
|
+
```
|
|
430
|
+
|
|
431
|
+
## About
|
|
432
|
+
|
|
433
|
+
Groundlens is built and maintained by [Javier Marin](https://jmarin.info) -- an engineer who has reinvented himself more times than most people change jobs. The math comes from engineering, the skepticism from regulated industries, and the stubbornness from experience. Read the [origin story](VISION.md#origin).
|
|
434
|
+
|
|
435
|
+
## License
|
|
436
|
+
|
|
437
|
+
[MIT](LICENSE) -- Javier Marin (javier@jmarin.info)
|