groundguard 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groundguard-0.1.0/.coverage +0 -0
- groundguard-0.1.0/.gitattributes +2 -0
- groundguard-0.1.0/.github/workflows/ci.yml +63 -0
- groundguard-0.1.0/.github/workflows/release.yml +68 -0
- groundguard-0.1.0/.gitignore +8 -0
- groundguard-0.1.0/CLAUDE.md +276 -0
- groundguard-0.1.0/GROUNDGUARD_CONTRACT.md +87 -0
- groundguard-0.1.0/LICENSE +21 -0
- groundguard-0.1.0/PKG-INFO +404 -0
- groundguard-0.1.0/README.md +363 -0
- groundguard-0.1.0/examples/__init__.py +1 -0
- groundguard-0.1.0/examples/bedrock_rag.py +71 -0
- groundguard-0.1.0/examples/cohere_rag.py +53 -0
- groundguard-0.1.0/examples/full_output_verification.py +46 -0
- groundguard-0.1.0/examples/langchain_retrieval_qa.py +63 -0
- groundguard-0.1.0/examples/llamaindex_citation.py +54 -0
- groundguard-0.1.0/examples/openai_assistants.py +80 -0
- groundguard-0.1.0/groundguard/__init__.py +37 -0
- groundguard-0.1.0/groundguard/_constants.py +14 -0
- groundguard-0.1.0/groundguard/_log.py +4 -0
- groundguard-0.1.0/groundguard/adapters/__init__.py +4 -0
- groundguard-0.1.0/groundguard/adapters/registry.py +298 -0
- groundguard-0.1.0/groundguard/circuit_breaker.py +32 -0
- groundguard-0.1.0/groundguard/core/__init__.py +0 -0
- groundguard-0.1.0/groundguard/core/claim_extractor.py +85 -0
- groundguard-0.1.0/groundguard/core/classifier.py +50 -0
- groundguard-0.1.0/groundguard/core/result_builder.py +60 -0
- groundguard-0.1.0/groundguard/core/verifier.py +758 -0
- groundguard-0.1.0/groundguard/cost_estimate.py +137 -0
- groundguard-0.1.0/groundguard/exceptions.py +55 -0
- groundguard-0.1.0/groundguard/integrations/__init__.py +0 -0
- groundguard-0.1.0/groundguard/integrations/langchain.py +64 -0
- groundguard-0.1.0/groundguard/loaders/__init__.py +0 -0
- groundguard-0.1.0/groundguard/loaders/accumulator.py +108 -0
- groundguard-0.1.0/groundguard/loaders/chunker.py +121 -0
- groundguard-0.1.0/groundguard/loaders/helpers.py +54 -0
- groundguard-0.1.0/groundguard/loaders/legal.py +124 -0
- groundguard-0.1.0/groundguard/loaders/structured.py +104 -0
- groundguard-0.1.0/groundguard/models/__init__.py +0 -0
- groundguard-0.1.0/groundguard/models/builder.py +162 -0
- groundguard-0.1.0/groundguard/models/internal.py +154 -0
- groundguard-0.1.0/groundguard/models/result.py +178 -0
- groundguard-0.1.0/groundguard/models/tier3.py +96 -0
- groundguard-0.1.0/groundguard/profiles.py +39 -0
- groundguard-0.1.0/groundguard/tiers/__init__.py +0 -0
- groundguard-0.1.0/groundguard/tiers/tier1_authenticity.py +63 -0
- groundguard-0.1.0/groundguard/tiers/tier25_preprocessing.py +244 -0
- groundguard-0.1.0/groundguard/tiers/tier2_semantic.py +83 -0
- groundguard-0.1.0/groundguard/tiers/tier3_evaluation.py +401 -0
- groundguard-0.1.0/pyproject.toml +75 -0
- groundguard-0.1.0/tests/__init__.py +0 -0
- groundguard-0.1.0/tests/conftest.py +244 -0
- groundguard-0.1.0/tests/integration/__init__.py +0 -0
- groundguard-0.1.0/tests/integration/compat_models.py +194 -0
- groundguard-0.1.0/tests/integration/test_compat_suite.py +194 -0
- groundguard-0.1.0/tests/integration/test_deterministic.py +472 -0
- groundguard-0.1.0/tests/integration/test_real_suite.py +431 -0
- groundguard-0.1.0/tests/test_accumulator.py +142 -0
- groundguard-0.1.0/tests/test_adapters.py +215 -0
- groundguard-0.1.0/tests/test_batch.py +599 -0
- groundguard-0.1.0/tests/test_builder.py +242 -0
- groundguard-0.1.0/tests/test_chunker.py +89 -0
- groundguard-0.1.0/tests/test_circuit_breaker.py +86 -0
- groundguard-0.1.0/tests/test_claim_extractor.py +93 -0
- groundguard-0.1.0/tests/test_classifier.py +68 -0
- groundguard-0.1.0/tests/test_clause_preprocessor.py +109 -0
- groundguard-0.1.0/tests/test_cost_estimate.py +68 -0
- groundguard-0.1.0/tests/test_evaluation.py +415 -0
- groundguard-0.1.0/tests/test_examples.py +47 -0
- groundguard-0.1.0/tests/test_exceptions.py +31 -0
- groundguard-0.1.0/tests/test_exports.py +51 -0
- groundguard-0.1.0/tests/test_faithfulness.py +274 -0
- groundguard-0.1.0/tests/test_flattener.py +52 -0
- groundguard-0.1.0/tests/test_helpers.py +27 -0
- groundguard-0.1.0/tests/test_langchain.py +273 -0
- groundguard-0.1.0/tests/test_log.py +9 -0
- groundguard-0.1.0/tests/test_logging.py +439 -0
- groundguard-0.1.0/tests/test_models.py +85 -0
- groundguard-0.1.0/tests/test_models_v2.py +178 -0
- groundguard-0.1.0/tests/test_profiles.py +43 -0
- groundguard-0.1.0/tests/test_structured_loader.py +98 -0
- groundguard-0.1.0/tests/test_tier1.py +95 -0
- groundguard-0.1.0/tests/test_tier2.py +134 -0
- groundguard-0.1.0/tests/test_tier25.py +187 -0
- groundguard-0.1.0/tests/test_verifier.py +539 -0
- groundguard-0.1.0/tests/test_verify_analysis.py +143 -0
- groundguard-0.1.0/tests/test_verify_answer.py +152 -0
- groundguard-0.1.0/tests/test_verify_clause.py +94 -0
|
Binary file
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: ["**"]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: ["main"]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
fast-suite:
|
|
11
|
+
name: Fast Suite (no LLM)
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
19
|
+
uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: ${{ matrix.python-version }}
|
|
22
|
+
- name: Install core deps
|
|
23
|
+
run: pip install -e ".[dev]"
|
|
24
|
+
- name: Run fast suite
|
|
25
|
+
run: pytest -m "not llm and not loaders and not langchain and not compat" -x -q --tb=short
|
|
26
|
+
|
|
27
|
+
loaders-suite:
|
|
28
|
+
name: Optional Loaders Tests
|
|
29
|
+
runs-on: ubuntu-latest
|
|
30
|
+
steps:
|
|
31
|
+
- uses: actions/checkout@v4
|
|
32
|
+
- uses: actions/setup-python@v5
|
|
33
|
+
with:
|
|
34
|
+
python-version: "3.11"
|
|
35
|
+
- run: pip install -e ".[loaders,dev]"
|
|
36
|
+
- run: pytest -m loaders -x -q
|
|
37
|
+
|
|
38
|
+
langchain-suite:
|
|
39
|
+
name: Optional LangChain Tests
|
|
40
|
+
runs-on: ubuntu-latest
|
|
41
|
+
steps:
|
|
42
|
+
- uses: actions/checkout@v4
|
|
43
|
+
- uses: actions/setup-python@v5
|
|
44
|
+
with:
|
|
45
|
+
python-version: "3.11"
|
|
46
|
+
- run: pip install -e ".[langchain,dev]"
|
|
47
|
+
- run: pytest -m langchain -x -q
|
|
48
|
+
|
|
49
|
+
real-suite:
|
|
50
|
+
name: Real Suite (LLM endpoints)
|
|
51
|
+
runs-on: ubuntu-latest
|
|
52
|
+
if: github.event_name == 'pull_request'
|
|
53
|
+
steps:
|
|
54
|
+
- uses: actions/checkout@v4
|
|
55
|
+
- uses: actions/setup-python@v5
|
|
56
|
+
with:
|
|
57
|
+
python-version: "3.11"
|
|
58
|
+
- run: pip install -e ".[dev]"
|
|
59
|
+
- name: Run real suite
|
|
60
|
+
run: pytest -m llm --tb=short --timeout=120 -q
|
|
61
|
+
env:
|
|
62
|
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
|
63
|
+
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- 'v*'
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
build:
|
|
10
|
+
name: Build distribution
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- name: Set up Python
|
|
15
|
+
uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: "3.11"
|
|
18
|
+
- name: Install pypa/build
|
|
19
|
+
run: python -m pip install build --user
|
|
20
|
+
- name: Build wheel and source tarball
|
|
21
|
+
run: python -m build
|
|
22
|
+
- name: Store distribution packages
|
|
23
|
+
uses: actions/upload-artifact@v4
|
|
24
|
+
with:
|
|
25
|
+
name: python-package-distributions
|
|
26
|
+
path: dist/
|
|
27
|
+
|
|
28
|
+
publish-to-testpypi:
|
|
29
|
+
name: Publish to TestPyPI (pre-release only)
|
|
30
|
+
# Runs only on pre-release tags (v0.1.0a1, v0.2.0b2, v1.0.0rc1, etc.), detected by the tag name containing 'a', 'b', or 'rc'.
|
|
31
|
+
if: contains(github.ref_name, 'a') || contains(github.ref_name, 'b') || contains(github.ref_name, 'rc')
|
|
32
|
+
needs: build
|
|
33
|
+
runs-on: ubuntu-latest
|
|
34
|
+
environment:
|
|
35
|
+
name: testpypi
|
|
36
|
+
url: https://test.pypi.org/p/groundguard
|
|
37
|
+
permissions:
|
|
38
|
+
id-token: write
|
|
39
|
+
steps:
|
|
40
|
+
- name: Download dists
|
|
41
|
+
uses: actions/download-artifact@v4
|
|
42
|
+
with:
|
|
43
|
+
name: python-package-distributions
|
|
44
|
+
path: dist/
|
|
45
|
+
- name: Publish to TestPyPI
|
|
46
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
47
|
+
with:
|
|
48
|
+
repository-url: https://test.pypi.org/legacy/
|
|
49
|
+
|
|
50
|
+
publish-to-pypi:
|
|
51
|
+
name: Publish to PyPI (stable release only)
|
|
52
|
+
# Runs only on stable tags (v0.1.0, v1.0.0), i.e. tag names containing none of 'a', 'b', or 'rc' — not on pre-releases.
|
|
53
|
+
if: "!contains(github.ref_name, 'a') && !contains(github.ref_name, 'b') && !contains(github.ref_name, 'rc')"
|
|
54
|
+
needs: build
|
|
55
|
+
runs-on: ubuntu-latest
|
|
56
|
+
environment:
|
|
57
|
+
name: pypi
|
|
58
|
+
url: https://pypi.org/p/groundguard
|
|
59
|
+
permissions:
|
|
60
|
+
id-token: write
|
|
61
|
+
steps:
|
|
62
|
+
- name: Download dists
|
|
63
|
+
uses: actions/download-artifact@v4
|
|
64
|
+
with:
|
|
65
|
+
name: python-package-distributions
|
|
66
|
+
path: dist/
|
|
67
|
+
- name: Publish to PyPI
|
|
68
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## What This Project Is
|
|
8
|
+
|
|
9
|
+
**groundguard** is a Python middleware library (MIT, LLM-agnostic) that verifies AI-generated text is factually grounded in developer-provided source documents. It is not a RAG pipeline, web scraper, or agentic framework — it is a deterministic assert layer for document-intensive workflows.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Execution Process Rules
|
|
14
|
+
|
|
15
|
+
These rules were established after a code review in session 5 found bugs caused by skipping them. **Do not shortcut these.**
|
|
16
|
+
|
|
17
|
+
### Strict Role Separation (ORCHESTRATOR.md §3)
|
|
18
|
+
|
|
19
|
+
Every implementation task requires **separate agent calls** for each role. Never combine into one:
|
|
20
|
+
|
|
21
|
+
```text
|
|
22
|
+
1. Test Writer Agent → writes RED test file, commits to main (isolation: none)
|
|
23
|
+
2. Coder Agent → implements in worktree until GREEN (isolation: "worktree")
|
|
24
|
+
3. Code Reviewer Agent → reviews diff against spec (subagent_type: "Explore")
|
|
25
|
+
4. Fix Agent → applies reviewer fixes if needed (isolation: "worktree")
|
|
26
|
+
5. Test Runner → confirms GREEN after fixes
|
|
27
|
+
6. Git Commit Agent → merges worktree to main
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
**Why:** Session 5 collapsed all 6 roles into 1 combined agent per module. The Code Reviewer was skipped entirely. This allowed 5 bugs to reach main undetected (wrong default values, incorrect routing logic, missing exception types, hardcoded field values).
|
|
31
|
+
|
|
32
|
+
### Code Reviewer Is Mandatory
|
|
33
|
+
|
|
34
|
+
After every Coder Agent, dispatch a Code Reviewer (`subagent_type: "Explore"`) with:
|
|
35
|
+
|
|
36
|
+
- The `git diff` of the branch
|
|
37
|
+
- The relevant spec section from `plan/engineering_design_update.md`
|
|
38
|
+
- The critical constraints checklist from CLAUDE.md
|
|
39
|
+
|
|
40
|
+
Do not proceed to the next task if the reviewer returns `approved: false`.
|
|
41
|
+
|
|
42
|
+
### Code Reviewer Prompt — Required Elements (session 6 correction)
|
|
43
|
+
|
|
44
|
+
The reviewer prompt **must** include all four of these or the review is invalid:
|
|
45
|
+
|
|
46
|
+
1. **Verbatim `git diff`** — run `git diff HEAD~1` (or `git diff main...<branch>`) and paste the full output inline. Do not ask the reviewer to read files instead.
|
|
47
|
+
2. **Verbatim spec section** — paste the exact section from `plan/engineering_design_update.md` for the module. Do not paraphrase or summarise it.
|
|
48
|
+
3. **Role 4 output format exactly** — the reviewer must return:
|
|
49
|
+
|
|
50
|
+
```json
|
|
51
|
+
{
|
|
52
|
+
"approved": true,
|
|
53
|
+
"issues": [
|
|
54
|
+
{"severity": "blocking"|"advisory", "file": "...", "line_hint": "...", "description": "...", "fix": "..."}
|
|
55
|
+
]
|
|
56
|
+
}
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Do not use alternate schemas (`findings`, `constraint_checks`, etc.) — the `severity` field is load-bearing: `"blocking"` triggers a Fix Agent, `"advisory"` does not.
|
|
60
|
+
4. **No test execution** — the reviewer is read-only (`Explore`). Do not ask it to run `pytest`. That is the Test Runner's job (Role 5).
|
|
61
|
+
|
|
62
|
+
**Why:** In Phase 21 the reviewer prompt omitted the git diff, paraphrased the spec instead of pasting it verbatim, used a non-standard output schema, and asked the reviewer to run tests. The review still passed because the implementation happened to be correct — but the process was non-compliant and would have failed to catch spec deviations reliably.
|
|
63
|
+
|
|
64
|
+
### Worktree Isolation for Coders
|
|
65
|
+
|
|
66
|
+
Every Coder Agent call must use `isolation: "worktree"`. The Test Writer commits to `main` first; the Coder's worktree branches from that commit automatically.
|
|
67
|
+
|
|
68
|
+
### Parallel Dispatch — Count Before Sending
|
|
69
|
+
|
|
70
|
+
Before sending a message with multiple parallel `Agent` calls, count the expected agents and verify all are present. Session 5 missed Worker F (Result Builder) in the first parallel wave, causing it to run solo later instead of in parallel.
|
|
71
|
+
|
|
72
|
+
### Context Injection — No Placeholder Left Behind
|
|
73
|
+
|
|
74
|
+
Every agent prompt must have all `[paste ...]` markers replaced with verbatim spec content before dispatching. An unresolved placeholder causes the agent to hallucinate silently (ORCHESTRATOR.md §9).
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Token Efficiency Protocols
|
|
79
|
+
|
|
80
|
+
### Before reading any file
|
|
81
|
+
|
|
82
|
+
Check `CODEBASE_ANALYSIS.md` (same directory as this file) first.
|
|
83
|
+
It contains the full dependency graph, module contracts, implementation
|
|
84
|
+
status, and red flags. Read it instead of source files to orient yourself.
|
|
85
|
+
Only read a source file when you are about to act on it.
|
|
86
|
+
|
|
87
|
+
### Native tool rules
|
|
88
|
+
|
|
89
|
+
- **Read tool:** State which file and which section before reading.
|
|
90
|
+
Use line ranges when the section is known. If the file was already read this
|
|
91
|
+
session, use what is in context — do not re-read.
|
|
92
|
+
- **Grep tool:** Use `output_mode: "files_with_matches"` first.
|
|
93
|
+
Only switch to content mode when you need the actual lines.
|
|
94
|
+
- **Agent tool:** Only spawn an agent when all of these hold: single named role, all
|
|
95
|
+
context injected inline, mutually exclusive file scope.
|
|
96
|
+
|
|
97
|
+
### Bash command rules
|
|
98
|
+
|
|
99
|
+
Always use the quiet flags below. Never use the verbose form.
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
pytest (fast suite) → pytest -m "not llm and not loaders and not langchain and not compat" -x -q --tb=short --no-header
|
|
103
|
+
pytest (single test) → pytest <test_path> -x -q --tb=short
|
|
104
|
+
pytest (compat) → pytest -m compat -v --timeout=300 -p no:cov
|
|
105
|
+
pytest (llm) → pytest -m llm --timeout=120 -q
|
|
106
|
+
pip install → pip install -q -e ".[dev,loaders,langchain]"
|
|
107
|
+
git log → git log --oneline -10
|
|
108
|
+
git diff → git diff --stat
|
|
109
|
+
git status → git status -s
|
|
110
|
+
grep → grep -rn "term" . | head -20
|
|
111
|
+
find → find . -name "*.py" | grep -v __pycache__ | head -20
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Document updates after a discussion
|
|
115
|
+
|
|
116
|
+
Do NOT immediately read all plan documents.
|
|
117
|
+
|
|
118
|
+
1. Extract decisions from the conversation (they are already in context)
|
|
119
|
+
2. Run `grep -n "^#" <filename>` to get section headings cheaply
|
|
120
|
+
3. Read only the specific sections that need updating
|
|
121
|
+
4. Edit targeted sections — never rewrite whole documents
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## Commands
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
# Install for development (all optional extras)
|
|
129
|
+
pip install -e ".[dev,loaders,langchain]"
|
|
130
|
+
|
|
131
|
+
# Run fast suite (zero LLM calls — default for all dev work)
|
|
132
|
+
pytest -m "not llm and not loaders and not langchain and not compat" -x -q
|
|
133
|
+
|
|
134
|
+
# Run a single test
|
|
135
|
+
pytest tests/test_tier2.py::test_all_zero_scores_triggers_escalate_all_low_score -x -q
|
|
136
|
+
|
|
137
|
+
# Run real LLM integration tests (requires OPENAI_API_KEY or GOOGLE_API_KEY)
|
|
138
|
+
pytest -m llm --timeout=120 -q
|
|
139
|
+
|
|
140
|
+
# Run loaders tests (requires pip install -e ".[loaders]")
|
|
141
|
+
pytest -m loaders -q
|
|
142
|
+
|
|
143
|
+
# Run with coverage report
|
|
144
|
+
pytest -m "not llm and not loaders and not langchain and not compat" --cov=groundguard --cov-report=term-missing
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Pipeline Architecture
|
|
150
|
+
|
|
151
|
+
The pipeline is a sequential 4-tier chain (Tier 0 → Tier 1 → Tier 2 → Tier 3, with chunking between Tier 0 and Tier 1). `core/verifier.py` is the orchestrator (initially a stub; its implementation landed in Phase 9).
|
|
152
|
+
|
|
153
|
+
```text
|
|
154
|
+
verify(claim, sources)
|
|
155
|
+
│
|
|
156
|
+
├── Tier 0 core/classifier.py parse_and_classify(claim)
|
|
157
|
+
│ → list[ClassifiedAtom] zero-cost rules, decimal-safe regex
|
|
158
|
+
│
|
|
159
|
+
├── chunker loaders/chunker.py chunk_sources(ctx)
|
|
160
|
+
│ → list[Chunk] sliding window if source > max_source_tokens
|
|
161
|
+
│
|
|
162
|
+
├── Tier 1 tiers/tier1_authenticity.py check_fuzzy(evidence, chunks, threshold)
|
|
163
|
+
│ gate only — raises or passes rapidfuzz partial_token_set_ratio
|
|
164
|
+
│ NEVER produces a terminal result
|
|
165
|
+
│
|
|
166
|
+
├── Tier 2 tiers/tier2_semantic.py route_claim(ctx, chunks)
|
|
167
|
+
│ → Tier2Result BM25Okapi — 3 routing branches (see below)
|
|
168
|
+
│
|
|
169
|
+
└── Tier 3 tiers/tier3_evaluation.py evaluate(ctx, chunks)
|
|
170
|
+
→ adapters/registry.py get_adapter(model) → pre_call_kwargs + post_process
|
|
171
|
+
→ Tier3ResponseModel litellm.completion, 2-attempt retry
|
|
172
|
+
→ ResultBuilder → VerificationResult (public output)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### Tier 2 Routing Branches
|
|
176
|
+
|
|
177
|
+
| Condition | Branch | Action |
|
|
178
|
+
| --- | --- | --- |
|
|
179
|
+
| `score >= 0.85` | A — `SKIP_LLM_HIGH_CONFIDENCE` | Return VERIFIED, no LLM call |
|
|
180
|
+
| `0.01 < score < 0.85` | B — `ESCALATE_TO_LLM` | Send top-k chunks to Tier 3 |
|
|
181
|
+
| `score <= 0.01` and `raw_score >= 0` | C — `ESCALATE_ALL_LOW_SCORE` | Send all chunks (capped at `top_k * 3`, document order) |
|
|
182
|
+
|
|
183
|
+
The `raw_score >= 0` guard in Branch C is intentional: BM25Okapi returns negative scores on very small corpora (IDF artefact) — those fall through to Branch B instead.
|
|
184
|
+
|
|
185
|
+
### Key Mapping Rules (ResultBuilder)
|
|
186
|
+
|
|
187
|
+
- `Tier3ResponseModel.factual_consistency_score` is **0–100**; `VerificationResult.factual_consistency_score` is **0.0–1.0**. `ResultBuilder` divides by 100.
|
|
188
|
+
- `Neutral` entailment → **always** `"UNVERIFIABLE"`, `is_valid=False`. Never promoted regardless of coverage %.
|
|
189
|
+
- `sources_used` is filtered against `ctx.original_sources` — hallucinated `source_id`s are scrubbed.
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
## Module Ownership Rules
|
|
194
|
+
|
|
195
|
+
These prevent circular imports and are load-bearing:
|
|
196
|
+
|
|
197
|
+
| Rule | Detail |
|
|
198
|
+
| --- | --- |
|
|
199
|
+
| `Chunk` defined in `loaders/chunker.py` | NOT in `models/internal.py` — would create circular import |
|
|
200
|
+
| `models/internal.py` imports `Chunk` only under `TYPE_CHECKING` | Use string annotations at runtime |
|
|
201
|
+
| All tier modules import `Chunk` under `TYPE_CHECKING` | Same pattern throughout |
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Data Model Hierarchy
|
|
206
|
+
|
|
207
|
+
```text
|
|
208
|
+
Public (models/result.py): Source, AtomicClaimResult, VerificationResult
|
|
209
|
+
Internal (models/internal.py): VerificationContext, SharedCostTracker, ClassifiedAtom,
|
|
210
|
+
RoutingDecision, Tier2Result, ClaimInput
|
|
211
|
+
Tier 3 (models/tier3.py): Tier3ResponseModel + sub-models (Pydantic v2 BaseModel)
|
|
212
|
+
Chunker (loaders/chunker.py): Chunk ← defined here, not in models/
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
`VerificationContext` is the per-call state bag passed through all tiers, constructed once per `verify()` call.
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Critical Implementation Details
|
|
220
|
+
|
|
221
|
+
**`SharedCostTracker`** — soft cap enforced *after* LLM call completes; triggering call is already billed. Cap check runs inside `threading.Lock`. Default `max_spend=float('inf')` — **do not change to a numeric default**.
|
|
222
|
+
|
|
223
|
+
**`_boundary_id`** — `secrets.token_hex(6)`, 12 hex chars, 48-bit entropy. Set at `VerificationContext` construction. `render_prompt` reads `ctx._boundary_id` — it never generates a new one.
|
|
224
|
+
|
|
225
|
+
**Tier 1 algorithm** — uses `rapidfuzz.fuzz.partial_token_set_ratio` (deliberate deviation from spec's `partial_ratio`). This is a known, preserved decision: `partial_ratio` scored too low on dropped-filler-word cases. Do not revert to `partial_ratio` without re-running the calibration benchmark (Phase 14).
|
|
226
|
+
|
|
227
|
+
**`parse_response(response, model)`** — routes via `get_adapter(model).post_process()`. OLLAMA_ADAPTER strips `<think>` tags (rfind-based; regex fallback), falls back to `reasoning_content` if content is empty. DEFAULT adapter strips markdown fences. Retry catches `pydantic.ValidationError`, `ValueError`, and `IndexError` (the last covers `choices=[]` edge case).
|
|
228
|
+
|
|
229
|
+
**`auto_chunk=False`** — recommended for large-context models (Gemini 1.5 Pro, Claude 3.5+). BM25 can silently drop low-scoring chunks that contain negating context ("Lost Context Problem").
|
|
230
|
+
|
|
231
|
+
**Exception contract** — `verify()` / `averify()` are fail-loud (all exceptions propagate except `ParseError` → returned as `status="PARSE_ERROR"`). `verify_batch_async()` is fail-contained (all exceptions absorbed per item). This asymmetry is intentional — see `plan/engineering_design_update.md` §8.
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## Test Structure
|
|
236
|
+
|
|
237
|
+
```text
|
|
238
|
+
tests/
|
|
239
|
+
├── test_exceptions.py # Phase 1
|
|
240
|
+
├── test_log.py # Phase 1
|
|
241
|
+
├── test_models.py # Phase 2 — TDD #15 (VerificationContext defaults) + Phase 21 (profile fields)
|
|
242
|
+
├── test_classifier.py # Phase 3 — decimal-safe split, inferential signals
|
|
243
|
+
├── test_chunker.py # Phase 4 — char offsets, overlap guard, sliding window
|
|
244
|
+
├── test_helpers.py # Phase 4 — @pytest.mark.loaders (skipped by default)
|
|
245
|
+
├── test_tier1.py # Phase 5 — fuzzy match boundaries
|
|
246
|
+
├── test_tier2.py # Phase 6 — BM25 routing branches, Branch C cap
|
|
247
|
+
├── test_evaluation.py # Phase 7 — render_prompt, parse_response, retry loop
|
|
248
|
+
├── test_builder.py # Phase 8 — score division, Neutral mapping, page_hint
|
|
249
|
+
├── test_adapters.py # Phase 16 (T-60) — adapter prefix routing, post_process, pre_call_kwargs
|
|
250
|
+
├── fixtures/ # Phase 16 (T-65) — sample.pdf, sample.docx (generated, gitignored)
|
|
251
|
+
│ └── scaffold_fixtures.py # Run once to generate fixture files
|
|
252
|
+
└── integration/ # Phase 12+ — @pytest.mark.llm (requires real API keys)
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
Fast Suite runs in ~5 seconds. `litellm.completion` and `litellm.acompletion` are always mocked in Fast Suite tests via `pytest-mock`.
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
## Using a Local LLM
|
|
260
|
+
|
|
261
|
+
The library uses `litellm` which supports Ollama natively. No code changes needed:
|
|
262
|
+
|
|
263
|
+
```bash
|
|
264
|
+
ollama pull qwen3:14b # ~9GB Q4_K_M, fits in 15GB RAM — recommended for integration tests
|
|
265
|
+
ollama serve # starts on http://localhost:11434
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
result = verify(claim="...", sources=[...], model="ollama/qwen3:14b", max_spend=0.0)
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
**Ollama thinking mode (Phase 16 complete)** — Thinking-capable Ollama models (qwen3, DeepSeek-R1, Gemma 4, Kimi K2, LFM2.5, GPT-OSS, etc.) emit `<think>...</think>` tags or drop `content` when thinking is active. `OLLAMA_ADAPTER` in `adapters/registry.py` handles this automatically: strips `<think>` tags (rfind-based, regex fallback), falls back to `reasoning_content` if content is empty. Models reason freely — `think=False` was removed in T-62.
|
|
273
|
+
|
|
274
|
+
Ollama supports `response_format` (structured output via JSON schema grammar). `parse_response()` uses the fence-stripping + `<think>` tag stripping fallback for any remaining edge cases.
|
|
275
|
+
|
|
276
|
+
When running the Real Suite integration tests against a local Ollama model, use `--timeout=300` instead of the default `--timeout=120` (qwen3:30b takes 30–90s per call).
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# GROUNDGUARD_CONTRACT.md
|
|
2
|
+
|
|
3
|
+
This document defines the execution contract for the `groundguard` library. It describes what the library guarantees unconditionally, what callers can configure, how invariants hold under composition, and what is explicitly out of scope.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## 1. Guarantees
|
|
8
|
+
|
|
9
|
+
These invariants always hold. If any of them would be violated, an exception is raised instead of returning a result.
|
|
10
|
+
|
|
11
|
+
**Citation on VERIFIED results**
|
|
12
|
+
Every `AtomicClaimResult` with `status="VERIFIED"` has a non-null `citation`. This is enforced by `_assert_citation_invariant` in `ResultBuilder` — a `VERIFIED` result with a null citation raises `InvariantError` before it can be returned.
|
|
13
|
+
|
|
14
|
+
**Non-negative cost**
|
|
15
|
+
`cost_usd` is never negative on any result. The `max_spend` cap is a *soft* cap: the triggering LLM call is allowed to complete and is billed before the cap fires. Subsequent calls in the same context are blocked.
|
|
16
|
+
|
|
17
|
+
**Per-call boundary ID**
|
|
18
|
+
Every `verify()` and `averify()` call generates a unique `boundary_id` of exactly 12 hex characters (48-bit entropy via `secrets.token_hex(6)`). The boundary ID is embedded in the prompt to prevent prompt-injection attacks that splice content across the boundary. It is never reused across calls.
|
|
19
|
+
|
|
20
|
+
**Majority vote call count**
|
|
21
|
+
When majority vote is active (triggered by a profile with `majority_vote=True` and a result score below `majority_vote_confidence_threshold`), exactly 3 LLM calls are made — never 1 or 2. The vote cannot complete with fewer.
|
|
22
|
+
|
|
23
|
+
**Tie-break conservatism**
|
|
24
|
+
A 1-1-1 split across 3 majority vote calls always yields `is_grounded=False` and `status="NOT_GROUNDED"`. Ties are never silently promoted to a positive verdict.
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## 2. Configurables
|
|
29
|
+
|
|
30
|
+
These parameters are caller-controlled. They change behaviour but do not affect the invariants above.
|
|
31
|
+
|
|
32
|
+
**`profile`** — a `VerificationProfile` dataclass (preset: `GENERAL_PROFILE`, `STRICT_PROFILE`, or `RESEARCH_PROFILE`). Sets defaults for `faithfulness_threshold`, `tier2_lexical_threshold`, `bm25_top_k`, `majority_vote`, and `audit`. Explicit per-call parameters always override profile defaults.
|
|
33
|
+
|
|
34
|
+
**`faithfulness_threshold`** — float, 0.0–1.0. Minimum score for a result to be considered grounded. Explicit value beats `majority_vote_on_borderline` which beats the profile default (precedence enforced in `VerificationContext.__post_init__`).
|
|
35
|
+
|
|
36
|
+
**`max_spend`** — soft USD cap. Default `float('inf')` (no cap). The triggering call is billed; subsequent calls in the batch or context are blocked with `status="SKIPPED_DUE_TO_COST"`.
|
|
37
|
+
|
|
38
|
+
**`model`** — any litellm model string (e.g. `"gpt-4o-mini"`, `"ollama/qwen3:14b"`, `"gemini/gemini-2.0-flash"`). Passed through to litellm without transformation.
|
|
39
|
+
|
|
40
|
+
**`api_base`** — passed to litellm for custom endpoints (e.g. local Ollama, Azure deployments).
|
|
41
|
+
|
|
42
|
+
**`auto_chunk`** — default `True`. When `True`, long sources are split by a sliding-window chunker and BM25 retrieves the top-k chunks for Tier 3. When `False`, full source content is forwarded to Tier 3 without chunking — recommended for large-context models to avoid the Lost Context Problem (negating clauses in low-scoring chunks not reaching the LLM).
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## 3. Invariants Under Composition
|
|
47
|
+
|
|
48
|
+
These invariants hold when combining multiple groundguard APIs in a single workflow.
|
|
49
|
+
|
|
50
|
+
**`SourceAccumulator` is opt-in**
|
|
51
|
+
`SourceAccumulator` is not wired into the pipeline. No verification function accepts a `SourceAccumulator` directly — callers always call `.sources()` explicitly and pass the resulting `list[Source]`:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
acc = SourceAccumulator()
|
|
55
|
+
acc.add(db_source, provenance="database_lookup", agent_id="agent_1")
|
|
56
|
+
acc.add(llm_source, provenance="llm_generated", is_llm_derived=True, agent_id="agent_2")
|
|
57
|
+
result = verify_analysis(agent_output, sources=acc.sources(), model="gpt-4o-mini")
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
This keeps the public API stable and lets callers inspect, filter, or log the source list before verification runs.
|
|
61
|
+
|
|
62
|
+
**`verify_clause` + `TermRegistry`**
|
|
63
|
+
When a `TermRegistry` is provided to `verify_clause`, term definitions are injected as pinned `Source` objects into the sources list *before* `verify()` is called. Term resolution runs before Tier 0, so pinned definitions affect the atom count and can alter routing decisions.
|
|
64
|
+
|
|
65
|
+
**`averify_batch` failure isolation**
|
|
66
|
+
`averify_batch` is fail-contained per item. An exception in one item (including `VerificationCostExceededError`) does not abort the batch. Items that hit the spend cap return `status="SKIPPED_DUE_TO_COST"`. All other per-item exceptions return `status="ERROR"`. The shared `SharedCostTracker` applies across the entire batch.
|
|
67
|
+
|
|
68
|
+
**Parameter precedence**
|
|
69
|
+
`explicit call params > ctx.majority_vote_on_borderline > profile defaults`. This precedence is enforced in `VerificationContext.__post_init__` and cannot be overridden by downstream tiers.
|
|
70
|
+
|
|
71
|
+
**`verify()` / `averify()` vs. `averify_batch`**
|
|
72
|
+
`verify()` and `averify()` are fail-loud: all exceptions propagate to the caller except `ParseError`, which is returned as `status="PARSE_ERROR"`. `averify_batch` is fail-contained. This asymmetry is intentional — single-call users get explicit error signals; batch callers get partial results without a full abort.
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## 4. Undefined Behaviour
|
|
77
|
+
|
|
78
|
+
The following scenarios are not guaranteed to work correctly and may change across versions without a deprecation notice.
|
|
79
|
+
|
|
80
|
+
**Models that ignore `response_format`**
|
|
81
|
+
Groundguard requests structured JSON output via `response_format`. If a model ignores the schema (returning free text instead), the retry loop attempts to parse the output using fence-stripping and Pydantic validation. This is a best-effort fallback — it is not guaranteed to succeed on all model/prompt combinations.
|
|
82
|
+
|
|
83
|
+
**Tier 2.5 on non-English text**
|
|
84
|
+
The numerical conflict detector (`tier25_preprocessing`) uses regex patterns calibrated on English-language text (digits, `%`, `$`, metric suffixes `M/B/K`). Behaviour on non-English numerals or currency formats is undefined.
|
|
85
|
+
|
|
86
|
+
**BM25 on single-sentence sources**
|
|
87
|
+
BM25Okapi can return negative scores on very small corpora due to IDF artefacts. On single-sentence sources, routing may fall through to Branch B (`ESCALATE_TO_LLM`) even when Branch C (`ESCALATE_ALL_LOW_SCORE`) would be more appropriate. Use `auto_chunk=False` on very short sources to avoid this edge case.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Pulkit Jain
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|