groundguard 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. groundguard-0.1.0/.coverage +0 -0
  2. groundguard-0.1.0/.gitattributes +2 -0
  3. groundguard-0.1.0/.github/workflows/ci.yml +63 -0
  4. groundguard-0.1.0/.github/workflows/release.yml +68 -0
  5. groundguard-0.1.0/.gitignore +8 -0
  6. groundguard-0.1.0/CLAUDE.md +276 -0
  7. groundguard-0.1.0/GROUNDGUARD_CONTRACT.md +87 -0
  8. groundguard-0.1.0/LICENSE +21 -0
  9. groundguard-0.1.0/PKG-INFO +404 -0
  10. groundguard-0.1.0/README.md +363 -0
  11. groundguard-0.1.0/examples/__init__.py +1 -0
  12. groundguard-0.1.0/examples/bedrock_rag.py +71 -0
  13. groundguard-0.1.0/examples/cohere_rag.py +53 -0
  14. groundguard-0.1.0/examples/full_output_verification.py +46 -0
  15. groundguard-0.1.0/examples/langchain_retrieval_qa.py +63 -0
  16. groundguard-0.1.0/examples/llamaindex_citation.py +54 -0
  17. groundguard-0.1.0/examples/openai_assistants.py +80 -0
  18. groundguard-0.1.0/groundguard/__init__.py +37 -0
  19. groundguard-0.1.0/groundguard/_constants.py +14 -0
  20. groundguard-0.1.0/groundguard/_log.py +4 -0
  21. groundguard-0.1.0/groundguard/adapters/__init__.py +4 -0
  22. groundguard-0.1.0/groundguard/adapters/registry.py +298 -0
  23. groundguard-0.1.0/groundguard/circuit_breaker.py +32 -0
  24. groundguard-0.1.0/groundguard/core/__init__.py +0 -0
  25. groundguard-0.1.0/groundguard/core/claim_extractor.py +85 -0
  26. groundguard-0.1.0/groundguard/core/classifier.py +50 -0
  27. groundguard-0.1.0/groundguard/core/result_builder.py +60 -0
  28. groundguard-0.1.0/groundguard/core/verifier.py +758 -0
  29. groundguard-0.1.0/groundguard/cost_estimate.py +137 -0
  30. groundguard-0.1.0/groundguard/exceptions.py +55 -0
  31. groundguard-0.1.0/groundguard/integrations/__init__.py +0 -0
  32. groundguard-0.1.0/groundguard/integrations/langchain.py +64 -0
  33. groundguard-0.1.0/groundguard/loaders/__init__.py +0 -0
  34. groundguard-0.1.0/groundguard/loaders/accumulator.py +108 -0
  35. groundguard-0.1.0/groundguard/loaders/chunker.py +121 -0
  36. groundguard-0.1.0/groundguard/loaders/helpers.py +54 -0
  37. groundguard-0.1.0/groundguard/loaders/legal.py +124 -0
  38. groundguard-0.1.0/groundguard/loaders/structured.py +104 -0
  39. groundguard-0.1.0/groundguard/models/__init__.py +0 -0
  40. groundguard-0.1.0/groundguard/models/builder.py +162 -0
  41. groundguard-0.1.0/groundguard/models/internal.py +154 -0
  42. groundguard-0.1.0/groundguard/models/result.py +178 -0
  43. groundguard-0.1.0/groundguard/models/tier3.py +96 -0
  44. groundguard-0.1.0/groundguard/profiles.py +39 -0
  45. groundguard-0.1.0/groundguard/tiers/__init__.py +0 -0
  46. groundguard-0.1.0/groundguard/tiers/tier1_authenticity.py +63 -0
  47. groundguard-0.1.0/groundguard/tiers/tier25_preprocessing.py +244 -0
  48. groundguard-0.1.0/groundguard/tiers/tier2_semantic.py +83 -0
  49. groundguard-0.1.0/groundguard/tiers/tier3_evaluation.py +401 -0
  50. groundguard-0.1.0/pyproject.toml +75 -0
  51. groundguard-0.1.0/tests/__init__.py +0 -0
  52. groundguard-0.1.0/tests/conftest.py +244 -0
  53. groundguard-0.1.0/tests/integration/__init__.py +0 -0
  54. groundguard-0.1.0/tests/integration/compat_models.py +194 -0
  55. groundguard-0.1.0/tests/integration/test_compat_suite.py +194 -0
  56. groundguard-0.1.0/tests/integration/test_deterministic.py +472 -0
  57. groundguard-0.1.0/tests/integration/test_real_suite.py +431 -0
  58. groundguard-0.1.0/tests/test_accumulator.py +142 -0
  59. groundguard-0.1.0/tests/test_adapters.py +215 -0
  60. groundguard-0.1.0/tests/test_batch.py +599 -0
  61. groundguard-0.1.0/tests/test_builder.py +242 -0
  62. groundguard-0.1.0/tests/test_chunker.py +89 -0
  63. groundguard-0.1.0/tests/test_circuit_breaker.py +86 -0
  64. groundguard-0.1.0/tests/test_claim_extractor.py +93 -0
  65. groundguard-0.1.0/tests/test_classifier.py +68 -0
  66. groundguard-0.1.0/tests/test_clause_preprocessor.py +109 -0
  67. groundguard-0.1.0/tests/test_cost_estimate.py +68 -0
  68. groundguard-0.1.0/tests/test_evaluation.py +415 -0
  69. groundguard-0.1.0/tests/test_examples.py +47 -0
  70. groundguard-0.1.0/tests/test_exceptions.py +31 -0
  71. groundguard-0.1.0/tests/test_exports.py +51 -0
  72. groundguard-0.1.0/tests/test_faithfulness.py +274 -0
  73. groundguard-0.1.0/tests/test_flattener.py +52 -0
  74. groundguard-0.1.0/tests/test_helpers.py +27 -0
  75. groundguard-0.1.0/tests/test_langchain.py +273 -0
  76. groundguard-0.1.0/tests/test_log.py +9 -0
  77. groundguard-0.1.0/tests/test_logging.py +439 -0
  78. groundguard-0.1.0/tests/test_models.py +85 -0
  79. groundguard-0.1.0/tests/test_models_v2.py +178 -0
  80. groundguard-0.1.0/tests/test_profiles.py +43 -0
  81. groundguard-0.1.0/tests/test_structured_loader.py +98 -0
  82. groundguard-0.1.0/tests/test_tier1.py +95 -0
  83. groundguard-0.1.0/tests/test_tier2.py +134 -0
  84. groundguard-0.1.0/tests/test_tier25.py +187 -0
  85. groundguard-0.1.0/tests/test_verifier.py +539 -0
  86. groundguard-0.1.0/tests/test_verify_analysis.py +143 -0
  87. groundguard-0.1.0/tests/test_verify_answer.py +152 -0
  88. groundguard-0.1.0/tests/test_verify_clause.py +94 -0
Binary file
@@ -0,0 +1,2 @@
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
@@ -0,0 +1,63 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: ["**"]
6
+ pull_request:
7
+ branches: ["main"]
8
+
9
+ jobs:
10
+ fast-suite:
11
+ name: Fast Suite (no LLM)
12
+ runs-on: ubuntu-latest
13
+ strategy:
14
+ matrix:
15
+ python-version: ["3.10", "3.11", "3.12"]
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+ - name: Set up Python ${{ matrix.python-version }}
19
+ uses: actions/setup-python@v5
20
+ with:
21
+ python-version: ${{ matrix.python-version }}
22
+ - name: Install core deps
23
+ run: pip install -e ".[dev]"
24
+ - name: Run fast suite
25
+ run: pytest -m "not llm and not loaders and not langchain and not compat" -x -q --tb=short
26
+
27
+ loaders-suite:
28
+ name: Optional Loaders Tests
29
+ runs-on: ubuntu-latest
30
+ steps:
31
+ - uses: actions/checkout@v4
32
+ - uses: actions/setup-python@v5
33
+ with:
34
+ python-version: "3.11"
35
+ - run: pip install -e ".[loaders,dev]"
36
+ - run: pytest -m loaders -x -q
37
+
38
+ langchain-suite:
39
+ name: Optional LangChain Tests
40
+ runs-on: ubuntu-latest
41
+ steps:
42
+ - uses: actions/checkout@v4
43
+ - uses: actions/setup-python@v5
44
+ with:
45
+ python-version: "3.11"
46
+ - run: pip install -e ".[langchain,dev]"
47
+ - run: pytest -m langchain -x -q
48
+
49
+ real-suite:
50
+ name: Real Suite (LLM endpoints)
51
+ runs-on: ubuntu-latest
52
+ if: github.event_name == 'pull_request'
53
+ steps:
54
+ - uses: actions/checkout@v4
55
+ - uses: actions/setup-python@v5
56
+ with:
57
+ python-version: "3.11"
58
+ - run: pip install -e ".[dev]"
59
+ - name: Run real suite
60
+ run: pytest -m llm --tb=short --timeout=120 -q
61
+ env:
62
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
63
+ GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
@@ -0,0 +1,68 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v*'
7
+
8
+ jobs:
9
+ build:
10
+ name: Build distribution
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - name: Set up Python
15
+ uses: actions/setup-python@v5
16
+ with:
17
+ python-version: "3.11"
18
+ - name: Install pypa/build
19
+ run: python -m pip install build --user
20
+ - name: Build wheel and source tarball
21
+ run: python -m build
22
+ - name: Store distribution packages
23
+ uses: actions/upload-artifact@v4
24
+ with:
25
+ name: python-package-distributions
26
+ path: dist/
27
+
28
+ publish-to-testpypi:
29
+ name: Publish to TestPyPI (pre-release only)
30
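+ # Runs only on pre-release tags (v0.1.0a1, v0.2.0b2, v1.0.0rc1, etc.). Detection is a substring match: any tag name containing 'a', 'b', or 'rc' is treated as a pre-release.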
31
+ if: contains(github.ref_name, 'a') || contains(github.ref_name, 'b') || contains(github.ref_name, 'rc')
32
+ needs: build
33
+ runs-on: ubuntu-latest
34
+ environment:
35
+ name: testpypi
36
+ url: https://test.pypi.org/p/groundguard
37
+ permissions:
38
+ id-token: write
39
+ steps:
40
+ - name: Download dists
41
+ uses: actions/download-artifact@v4
42
+ with:
43
+ name: python-package-distributions
44
+ path: dist/
45
+ - name: Publish to TestPyPI
46
+ uses: pypa/gh-action-pypi-publish@release/v1
47
+ with:
48
+ repository-url: https://test.pypi.org/legacy/
49
+
50
+ publish-to-pypi:
51
+ name: Publish to PyPI (stable release only)
52
+ # Runs only on stable tags (v0.1.0, v1.0.0), not on pre-releases. A tag counts as stable when its name contains none of 'a', 'b', or 'rc'.
53
+ if: "!contains(github.ref_name, 'a') && !contains(github.ref_name, 'b') && !contains(github.ref_name, 'rc')"
54
+ needs: build
55
+ runs-on: ubuntu-latest
56
+ environment:
57
+ name: pypi
58
+ url: https://pypi.org/p/groundguard
59
+ permissions:
60
+ id-token: write
61
+ steps:
62
+ - name: Download dists
63
+ uses: actions/download-artifact@v4
64
+ with:
65
+ name: python-package-distributions
66
+ path: dist/
67
+ - name: Publish to PyPI
68
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,8 @@
1
+ # Python virtual environments
2
+ .venv/
3
+ venv/
4
+ ENV/
5
+ env/
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
@@ -0,0 +1,276 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ---
6
+
7
+ ## What This Project Is
8
+
9
+ **groundguard** is a Python middleware library (MIT, LLM-agnostic) that verifies AI-generated text is factually grounded in developer-provided source documents. It is not a RAG pipeline, web scraper, or agentic framework — it is a deterministic assert layer for document-intensive workflows.
10
+
11
+ ---
12
+
13
+ ## Execution Process Rules
14
+
15
+ These rules were established after a code review in session 5 found bugs caused by skipping them. **Do not shortcut these.**
16
+
17
+ ### Strict Role Separation (ORCHESTRATOR.md §3)
18
+
19
+ Every implementation task requires **separate agent calls** for each role. Never combine into one:
20
+
21
+ ```text
22
+ 1. Test Writer Agent → writes RED test file, commits to main (isolation: none)
23
+ 2. Coder Agent → implements in worktree until GREEN (isolation: "worktree")
24
+ 3. Code Reviewer Agent → reviews diff against spec (subagent_type: "Explore")
25
+ 4. Fix Agent → applies reviewer fixes if needed (isolation: "worktree")
26
+ 5. Test Runner → confirms GREEN after fixes
27
+ 6. Git Commit Agent → merges worktree to main
28
+ ```
29
+
30
+ **Why:** Session 5 collapsed all 6 roles into 1 combined agent per module. The Code Reviewer was skipped entirely. This allowed 5 bugs to reach main undetected (wrong default values, incorrect routing logic, missing exception types, hardcoded field values).
31
+
32
+ ### Code Reviewer Is Mandatory
33
+
34
+ After every Coder Agent, dispatch a Code Reviewer (`subagent_type: "Explore"`) with:
35
+
36
+ - The `git diff` of the branch
37
+ - The relevant spec section from `plan/engineering_design_update.md`
38
+ - The critical constraints checklist from CLAUDE.md
39
+
40
+ Do not proceed to the next task if the reviewer returns `approved: false`.
41
+
42
+ ### Code Reviewer Prompt — Required Elements (session 6 correction)
43
+
44
+ The reviewer prompt **must** include all four of these or the review is invalid:
45
+
46
+ 1. **Verbatim `git diff`** — run `git diff HEAD~1` (or `git diff main...<branch>`) and paste the full output inline. Do not ask the reviewer to read files instead.
47
+ 2. **Verbatim spec section** — paste the exact section from `plan/engineering_design_v4.md` for the module. Do not paraphrase or summarise it.
48
+ 3. **Role 4 output format exactly** — the reviewer must return:
49
+
50
+ ```json
51
+ {
52
+ "approved": true,
53
+ "issues": [
54
+ {"severity": "blocking"|"advisory", "file": "...", "line_hint": "...", "description": "...", "fix": "..."}
55
+ ]
56
+ }
57
+ ```
58
+
59
+ Do not use alternate schemas (`findings`, `constraint_checks`, etc.) — the `severity` field is load-bearing: `"blocking"` triggers a Fix Agent, `"advisory"` does not.
60
+ 4. **No test execution** — the reviewer is read-only (`Explore`). Do not ask it to run `pytest`. That is the Test Runner's job (Role 5).
61
+
62
+ **Why:** In Phase 21 the reviewer prompt omitted the git diff, paraphrased the spec instead of pasting it verbatim, used a non-standard output schema, and asked the reviewer to run tests. The review still passed because the implementation happened to be correct — but the process was non-compliant and would have failed to catch spec deviations reliably.
63
+
64
+ ### Worktree Isolation for Coders
65
+
66
+ Every Coder Agent call must use `isolation: "worktree"`. The Test Writer commits to `main` first; the Coder's worktree branches from that commit automatically.
67
+
68
+ ### Parallel Dispatch — Count Before Sending
69
+
70
+ Before sending a message with multiple parallel `Agent` calls, count the expected agents and verify all are present. Session 5 missed Worker F (Result Builder) in the first parallel wave, causing it to run solo later instead of in parallel.
71
+
72
+ ### Context Injection — No Placeholder Left Behind
73
+
74
+ Every agent prompt must have all `[paste ...]` markers replaced with verbatim spec content before dispatching. An unresolved placeholder causes the agent to hallucinate silently (ORCHESTRATOR.md §9).
75
+
76
+ ---
77
+
78
+ ## Token Efficiency Protocols
79
+
80
+ ### Before reading any file
81
+
82
+ Check `CODEBASE_ANALYSIS.md` (same directory as this file) first.
83
+ It contains the full dependency graph, module contracts, implementation
84
+ status, and red flags. Read it instead of source files to orient yourself.
85
+ Only read a source file when you are about to act on it.
86
+
87
+ ### Native tool rules
88
+
89
+ - **Read tool:** State which file and which section before reading.
90
+ Use line ranges when the section is known. If already read this
91
+ session, use what is in context — do not re-read.
92
+ - **Grep tool:** Use `output_mode: "files_with_matches"` first.
93
+ Only switch to content mode when you need the actual lines.
94
+ - **Agent tool:** Only spawn agents when: single named role, all
95
+ context injected inline, mutually exclusive file scope.
96
+
97
+ ### Bash command rules
98
+
99
+ Always use the quiet flags below. Never use the verbose form.
100
+
101
+ ```text
102
+ pytest (fast suite) → pytest -m "not llm and not loaders and not langchain and not compat" -x -q --tb=short --no-header
103
+ pytest (single test) → pytest <test_path> -x -q --tb=short
104
+ pytest (compat) → pytest -m compat -v --timeout=300 -p no:cov
105
+ pytest (llm) → pytest -m llm --timeout=120 -q
106
+ pip install → pip install -q -e ".[dev,loaders,langchain]"
107
+ git log → git log --oneline -10
108
+ git diff → git diff --stat
109
+ git status → git status -s
110
+ grep → grep -rn "term" . | head -20
111
+ find → find . -name "*.py" | grep -v __pycache__ | head -20
112
+ ```
113
+
114
+ ### Document updates after a discussion
115
+
116
+ Do NOT immediately read all plan documents.
117
+
118
+ 1. Extract decisions from the conversation (they are already in context)
119
+ 2. Run `grep -n "^#" <filename>` to get section headings cheaply
120
+ 3. Read only the specific sections that need updating
121
+ 4. Edit targeted sections — never rewrite whole documents
122
+
123
+ ---
124
+
125
+ ## Commands
126
+
127
+ ```bash
128
+ # Install for development (all optional extras)
129
+ pip install -e ".[dev,loaders,langchain]"
130
+
131
+ # Run fast suite (zero LLM calls — default for all dev work)
132
+ pytest -m "not llm and not loaders and not langchain and not compat" -x -q
133
+
134
+ # Run a single test
135
+ pytest tests/test_tier2.py::test_all_zero_scores_triggers_escalate_all_low_score -x -q
136
+
137
+ # Run real LLM integration tests (requires OPENAI_API_KEY or GOOGLE_API_KEY)
138
+ pytest -m llm --timeout=120 -q
139
+
140
+ # Run loaders tests (requires pip install -e ".[loaders]")
141
+ pytest -m loaders -q
142
+
143
+ # Run with coverage report
144
+ pytest -m "not llm and not loaders and not langchain and not compat" --cov=groundguard --cov-report=term-missing
145
+ ```
146
+
147
+ ---
148
+
149
+ ## Pipeline Architecture
150
+
151
+ The pipeline is a sequential 4-tier chain. `core/verifier.py` (implemented in Phase 9) is the orchestrator.
152
+
153
+ ```text
154
+ verify(claim, sources)
155
+
156
+ ├── Tier 0 core/classifier.py parse_and_classify(claim)
157
+ │ → list[ClassifiedAtom] zero-cost rules, decimal-safe regex
158
+
159
+ ├── chunker loaders/chunker.py chunk_sources(ctx)
160
+ │ → list[Chunk] sliding window if source > max_source_tokens
161
+
162
+ ├── Tier 1 tiers/tier1_authenticity.py check_fuzzy(evidence, chunks, threshold)
163
+ │ gate only — raises or passes rapidfuzz partial_token_set_ratio
164
+ │ NEVER produces a terminal result
165
+
166
+ ├── Tier 2 tiers/tier2_semantic.py route_claim(ctx, chunks)
167
+ │ → Tier2Result BM25Okapi — 3 routing branches (see below)
168
+
169
+ └── Tier 3 tiers/tier3_evaluation.py evaluate(ctx, chunks)
170
+ → adapters/registry.py get_adapter(model) → pre_call_kwargs + post_process
171
+ → Tier3ResponseModel litellm.completion, 2-attempt retry
172
+ → ResultBuilder → VerificationResult (public output)
173
+ ```
174
+
175
+ ### Tier 2 Routing Branches
176
+
177
+ | Condition | Branch | Action |
178
+ | --- | --- | --- |
179
+ | `score >= 0.85` | A — `SKIP_LLM_HIGH_CONFIDENCE` | Return VERIFIED, no LLM call |
180
+ | `0.01 < score < 0.85` | B — `ESCALATE_TO_LLM` | Send top-k chunks to Tier 3 |
181
+ | `score <= 0.01` and `raw_score >= 0` | C — `ESCALATE_ALL_LOW_SCORE` | Send all chunks (capped at `top_k * 3`, document order) |
182
+
183
+ The `raw_score >= 0` guard in Branch C is intentional: BM25Okapi returns negative scores on very small corpora (IDF artefact) — those fall through to Branch B instead.
184
+
185
+ ### Key Mapping Rules (ResultBuilder)
186
+
187
+ - `Tier3ResponseModel.factual_consistency_score` is **0–100**; `VerificationResult.factual_consistency_score` is **0.0–1.0**. `ResultBuilder` divides by 100.
188
+ - `Neutral` entailment → **always** `"UNVERIFIABLE"`, `is_valid=False`. Never promoted regardless of coverage %.
189
+ - `sources_used` is filtered against `ctx.original_sources` — hallucinated `source_id`s are scrubbed.
190
+
191
+ ---
192
+
193
+ ## Module Ownership Rules
194
+
195
+ These prevent circular imports and are load-bearing:
196
+
197
+ | Rule | Detail |
198
+ | --- | --- |
199
+ | `Chunk` defined in `loaders/chunker.py` | NOT in `models/internal.py` — would create circular import |
200
+ | `models/internal.py` imports `Chunk` only under `TYPE_CHECKING` | Use string annotations at runtime |
201
+ | All tier modules import `Chunk` under `TYPE_CHECKING` | Same pattern throughout |
202
+
203
+ ---
204
+
205
+ ## Data Model Hierarchy
206
+
207
+ ```text
208
+ Public (models/result.py): Source, AtomicClaimResult, VerificationResult
209
+ Internal (models/internal.py): VerificationContext, SharedCostTracker, ClassifiedAtom,
210
+ RoutingDecision, Tier2Result, ClaimInput
211
+ Tier 3 (models/tier3.py): Tier3ResponseModel + sub-models (Pydantic v2 BaseModel)
212
+ Chunker (loaders/chunker.py): Chunk ← defined here, not in models/
213
+ ```
214
+
215
+ `VerificationContext` is the per-call state bag passed through all tiers, constructed once per `verify()` call.
216
+
217
+ ---
218
+
219
+ ## Critical Implementation Details
220
+
221
+ **`SharedCostTracker`** — soft cap enforced *after* LLM call completes; triggering call is already billed. Cap check runs inside `threading.Lock`. Default `max_spend=float('inf')` — **do not change to a numeric default**.
222
+
223
+ **`_boundary_id`** — `secrets.token_hex(6)`, 12 hex chars, 48-bit entropy. Set at `VerificationContext` construction. `render_prompt` reads `ctx._boundary_id` — it never generates a new one.
224
+
225
+ **Tier 1 algorithm** — uses `rapidfuzz.fuzz.partial_token_set_ratio` (deliberate deviation from spec's `partial_ratio`). This is a known, preserved decision: `partial_ratio` scored too low on dropped-filler-word cases. Do not revert to `partial_ratio` without re-running the calibration benchmark (Phase 14).
226
+
227
+ **`parse_response(response, model)`** — routes via `get_adapter(model).post_process()`. OLLAMA_ADAPTER strips `<think>` tags (rfind-based; regex fallback), falls back to `reasoning_content` if content is empty. DEFAULT adapter strips markdown fences. Retry catches `pydantic.ValidationError`, `ValueError`, and `IndexError` (the last covers `choices=[]` edge case).
228
+
229
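+ **`auto_chunk=False`** — recommended for large-context models (Gemini 1.5 Pro, Claude 3.5+). With chunking enabled, BM25 can silently drop low-scoring chunks that contain negating context (the "Lost Context Problem"), so that context never reaches the LLM; with `auto_chunk=False` the full source content is forwarded to Tier 3 instead.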
230
+
231
+ **Exception contract** — `verify()` / `averify()` are fail-loud (all exceptions propagate except `ParseError` → returned as `status="PARSE_ERROR"`). `verify_batch_async()` is fail-contained (all exceptions absorbed per item). This asymmetry is intentional — see `plan/engineering_design_update.md` §8.
232
+
233
+ ---
234
+
235
+ ## Test Structure
236
+
237
+ ```text
238
+ tests/
239
+ ├── test_exceptions.py # Phase 1
240
+ ├── test_log.py # Phase 1
241
+ ├── test_models.py # Phase 2 — TDD #15 (VerificationContext defaults) + Phase 21 (profile fields)
242
+ ├── test_classifier.py # Phase 3 — decimal-safe split, inferential signals
243
+ ├── test_chunker.py # Phase 4 — char offsets, overlap guard, sliding window
244
+ ├── test_helpers.py # Phase 4 — @pytest.mark.loaders (skipped by default)
245
+ ├── test_tier1.py # Phase 5 — fuzzy match boundaries
246
+ ├── test_tier2.py # Phase 6 — BM25 routing branches, Branch C cap
247
+ ├── test_evaluation.py # Phase 7 — render_prompt, parse_response, retry loop
248
+ ├── test_builder.py # Phase 8 — score division, Neutral mapping, page_hint
249
+ ├── test_adapters.py # Phase 16 (T-60) — adapter prefix routing, post_process, pre_call_kwargs
250
+ ├── fixtures/ # Phase 16 (T-65) — sample.pdf, sample.docx (generated, gitignored)
251
+ │ └── scaffold_fixtures.py # Run once to generate fixture files
252
+ └── integration/ # Phase 12+ — @pytest.mark.llm (requires real API keys)
253
+ ```
254
+
255
+ Fast Suite runs in ~5 seconds. `litellm.completion` and `litellm.acompletion` are always mocked in Fast Suite tests via `pytest-mock`.
256
+
257
+ ---
258
+
259
+ ## Using a Local LLM
260
+
261
+ The library uses `litellm` which supports Ollama natively. No code changes needed:
262
+
263
+ ```bash
264
+ ollama pull qwen3:14b # ~9GB Q4_K_M, fits in 15GB RAM — recommended for integration tests
265
+ ollama serve # starts on http://localhost:11434
266
+ ```
267
+
268
+ ```python
269
+ result = verify(claim="...", sources=[...], model="ollama/qwen3:14b", max_spend=0.0)
270
+ ```
271
+
272
+ **Ollama thinking mode (Phase 16 complete)** — Thinking-capable Ollama models (qwen3, DeepSeek-R1, Gemma 4, Kimi K2, LFM2.5, GPT-OSS, etc.) emit `<think>...</think>` tags or drop `content` when thinking is active. `OLLAMA_ADAPTER` in `adapters/registry.py` handles this automatically: strips `<think>` tags (rfind-based, regex fallback), falls back to `reasoning_content` if content is empty. Models reason freely — `think=False` was removed in T-62.
273
+
274
+ Ollama supports `response_format` (structured output via JSON schema grammar). `parse_response()` uses the fence-stripping + `<think>` tag stripping fallback for any remaining edge cases.
275
+
276
+ When running the Real Suite integration tests against a local Ollama model, use `--timeout=300` rather than the `--timeout=120` shown in the commands above (qwen3:30b takes 30–90s per call).
@@ -0,0 +1,87 @@
1
+ # GROUNDGUARD_CONTRACT.md
2
+
3
+ This document defines the execution contract for the `groundguard` library. It describes what the library guarantees unconditionally, what callers can configure, how invariants hold under composition, and what is explicitly out of scope.
4
+
5
+ ---
6
+
7
+ ## 1. Guarantees
8
+
9
+ These invariants always hold. If any of them would be violated, an exception is raised instead of returning a result.
10
+
11
+ **Citation on VERIFIED results**
12
+ Every `AtomicClaimResult` with `status="VERIFIED"` has a non-null `citation`. This is enforced by `_assert_citation_invariant` in `ResultBuilder` — a `VERIFIED` result with a null citation raises `InvariantError` before it can be returned.
13
+
14
+ **Non-negative cost**
15
+ `cost_usd` is never negative on any result. The `max_spend` cap is a *soft* cap: the triggering LLM call is allowed to complete and is billed before the cap fires. Subsequent calls in the same context are blocked.
16
+
17
+ **Per-call boundary ID**
18
+ Every `verify()` and `averify()` call generates a unique `boundary_id` of exactly 12 hex characters (48-bit entropy via `secrets.token_hex(6)`). The boundary ID is embedded in the prompt to prevent prompt-injection attacks that splice content across the boundary. It is never reused across calls.
19
+
20
+ **Majority vote call count**
21
+ When majority vote is active (triggered by a profile with `majority_vote=True` and a result score below `majority_vote_confidence_threshold`), exactly 3 LLM calls are made — never 1 or 2. The vote cannot complete with fewer.
22
+
23
+ **Tie-break conservatism**
24
+ A 1-1-1 split across 3 majority vote calls always yields `is_grounded=False` and `status="NOT_GROUNDED"`. Ties are never silently promoted to a positive verdict.
25
+
26
+ ---
27
+
28
+ ## 2. Configurables
29
+
30
+ These parameters are caller-controlled. They change behaviour but do not affect the invariants above.
31
+
32
+ **`profile`** — a `VerificationProfile` dataclass (preset: `GENERAL_PROFILE`, `STRICT_PROFILE`, or `RESEARCH_PROFILE`). Sets defaults for `faithfulness_threshold`, `tier2_lexical_threshold`, `bm25_top_k`, `majority_vote`, and `audit`. Explicit per-call parameters always override profile defaults.
33
+
34
+ **`faithfulness_threshold`** — float, 0.0–1.0. Minimum score for a result to be considered grounded. Precedence: an explicit value beats `majority_vote_on_borderline`, which in turn beats the profile default (enforced in `VerificationContext.__post_init__`).
35
+
36
+ **`max_spend`** — soft USD cap. Default `float('inf')` (no cap). The triggering call is billed; subsequent calls in the batch or context are blocked with `status="SKIPPED_DUE_TO_COST"`.
37
+
38
+ **`model`** — any litellm model string (e.g. `"gpt-4o-mini"`, `"ollama/qwen3:14b"`, `"gemini/gemini-2.0-flash"`). Passed through to litellm without transformation.
39
+
40
+ **`api_base`** — passed to litellm for custom endpoints (e.g. local Ollama, Azure deployments).
41
+
42
+ **`auto_chunk`** — default `True`. When `True`, long sources are split by a sliding-window chunker and BM25 retrieves the top-k chunks for Tier 3. When `False`, full source content is forwarded to Tier 3 without chunking — recommended for large-context models to avoid the Lost Context Problem (negating clauses in low-scoring chunks not reaching the LLM).
43
+
44
+ ---
45
+
46
+ ## 3. Invariants Under Composition
47
+
48
+ These invariants hold when combining multiple groundguard APIs in a single workflow.
49
+
50
+ **`SourceAccumulator` is opt-in**
51
+ `SourceAccumulator` is not wired into the pipeline. No verification function accepts a `SourceAccumulator` directly — callers always call `.sources()` explicitly and pass the resulting `list[Source]`:
52
+
53
+ ```python
54
+ acc = SourceAccumulator()
55
+ acc.add(db_source, provenance="database_lookup", agent_id="agent_1")
56
+ acc.add(llm_source, provenance="llm_generated", is_llm_derived=True, agent_id="agent_2")
57
+ result = verify_analysis(agent_output, sources=acc.sources(), model="gpt-4o-mini")
58
+ ```
59
+
60
+ This keeps the public API stable and lets callers inspect, filter, or log the source list before verification runs.
61
+
62
+ **`verify_clause` + `TermRegistry`**
63
+ When a `TermRegistry` is provided to `verify_clause`, term definitions are injected as pinned `Source` objects into the sources list *before* `verify()` is called. Term resolution runs before Tier 0, so pinned definitions affect the atom count and can alter routing decisions.
64
+
65
+ **`averify_batch` failure isolation**
66
+ `averify_batch` is fail-contained per item. An exception in one item (including `VerificationCostExceededError`) does not abort the batch. Items that hit the spend cap return `status="SKIPPED_DUE_TO_COST"`. All other per-item exceptions return `status="ERROR"`. The shared `SharedCostTracker` applies across the entire batch.
67
+
68
+ **Parameter precedence**
69
+ `explicit call params > ctx.majority_vote_on_borderline > profile defaults`. This precedence is enforced in `VerificationContext.__post_init__` and cannot be overridden by downstream tiers.
70
+
71
+ **`verify()` / `averify()` vs. `averify_batch`**
72
+ `verify()` and `averify()` are fail-loud: all exceptions propagate to the caller except `ParseError`, which is returned as `status="PARSE_ERROR"`. `averify_batch` is fail-contained. This asymmetry is intentional — single-call users get explicit error signals; batch callers get partial results without a full abort.
73
+
74
+ ---
75
+
76
+ ## 4. Undefined Behaviour
77
+
78
+ The following scenarios are not guaranteed to work correctly and may change across versions without a deprecation notice.
79
+
80
+ **Models that ignore `response_format`**
81
+ Groundguard requests structured JSON output via `response_format`. If a model ignores the schema (returning free text instead), the retry loop attempts to parse the output using fence-stripping and Pydantic validation. This is a best-effort fallback — it is not guaranteed to succeed on all model/prompt combinations.
82
+
83
+ **Tier 2.5 on non-English text**
84
+ The numerical conflict detector (`tier25_preprocessing`) uses regex patterns calibrated on English-language text (digits, `%`, `$`, metric suffixes `M/B/K`). Behaviour on non-English numerals or currency formats is undefined.
85
+
86
+ **BM25 on single-sentence sources**
87
+ BM25Okapi can return negative scores on very small corpora due to IDF artefacts. On single-sentence sources, routing may fall through to Branch B (`ESCALATE_TO_LLM`) even when Branch C (`ESCALATE_ALL_LOW_SCORE`) would be more appropriate. Use `auto_chunk=False` on very short sources to avoid this edge case.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Pulkit Jain
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.