graphite-engine 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphite_engine-0.3.2/PKG-INFO +251 -0
- graphite_engine-0.3.2/README.md +213 -0
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/pyproject.toml +9 -10
- graphite_engine-0.3.2/src/graphite/__init__.py +53 -0
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite/adapters/alphaearth.py +4 -1
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite/adapters/weathernext.py +1 -0
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite/cache.py +1 -0
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite/claim.py +160 -25
- graphite_engine-0.3.2/src/graphite/claim_store.py +321 -0
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite/confidence.py +101 -67
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite/domain.py +23 -13
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite/enums.py +13 -6
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite/evidence.py +19 -11
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite/llm.py +1 -0
- graphite_engine-0.3.2/src/graphite/pipeline/__init__.py +18 -0
- graphite_engine-0.3.2/src/graphite/pipeline/analyzer.py +107 -0
- graphite_engine-0.3.2/src/graphite/pipeline/extractor.py +81 -0
- graphite_engine-0.3.2/src/graphite/pipeline/report.py +82 -0
- graphite_engine-0.3.2/src/graphite/pipeline/retriever.py +81 -0
- graphite_engine-0.3.2/src/graphite/pipeline/verifier.py +132 -0
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite/rules.py +3 -0
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite/schemas.py +63 -15
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite/text.py +128 -33
- graphite_engine-0.3.2/src/graphite_engine.egg-info/PKG-INFO +251 -0
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite_engine.egg-info/SOURCES.txt +7 -17
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite_engine.egg-info/requires.txt +2 -7
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/tests/test_claim.py +30 -11
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/tests/test_confidence.py +21 -14
- graphite_engine-0.3.0/PKG-INFO +0 -205
- graphite_engine-0.3.0/README.md +0 -164
- graphite_engine-0.3.0/src/graphite/__init__.py +0 -47
- graphite_engine-0.3.0/src/graphite/assembler.py +0 -299
- graphite_engine-0.3.0/src/graphite/claim_store.py +0 -133
- graphite_engine-0.3.0/src/graphite/features/__init__.py +0 -1
- graphite_engine-0.3.0/src/graphite/features/alphaearth_enricher.py +0 -125
- graphite_engine-0.3.0/src/graphite/features/embedding_similarity.py +0 -193
- graphite_engine-0.3.0/src/graphite/geo_evidence/__init__.py +0 -1
- graphite_engine-0.3.0/src/graphite/geo_evidence/geo_foundation.py +0 -86
- graphite_engine-0.3.0/src/graphite/graph.py +0 -182
- graphite_engine-0.3.0/src/graphite/io.py +0 -194
- graphite_engine-0.3.0/src/graphite/scenario.py +0 -157
- graphite_engine-0.3.0/src/graphite/scenarios/__init__.py +0 -1
- graphite_engine-0.3.0/src/graphite/scenarios/weathernext_forecast.py +0 -140
- graphite_engine-0.3.0/src/graphite/simulate.py +0 -245
- graphite_engine-0.3.0/src/graphite_engine.egg-info/PKG-INFO +0 -205
- graphite_engine-0.3.0/tests/test_graph_store.py +0 -57
- graphite_engine-0.3.0/tests/test_pipeline_core.py +0 -460
- graphite_engine-0.3.0/tests/test_propagation.py +0 -248
- graphite_engine-0.3.0/tests/test_scenario.py +0 -196
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/LICENSE +0 -0
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/setup.cfg +0 -0
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite/adapters/__init__.py +0 -0
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite/py.typed +0 -0
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite_engine.egg-info/dependency_links.txt +0 -0
- {graphite_engine-0.3.0 → graphite_engine-0.3.2}/src/graphite_engine.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: graphite-engine
|
|
3
|
+
Version: 0.3.2
|
|
4
|
+
Summary: Open-source claim verification engine for agent-generated assertions in high-stakes domains
|
|
5
|
+
Author: Min Jun Kim
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/minjun1/graphite-core
|
|
8
|
+
Project-URL: Documentation, https://github.com/minjun1/graphite-core#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/minjun1/graphite-core
|
|
10
|
+
Project-URL: Issues, https://github.com/minjun1/graphite-core/issues
|
|
11
|
+
Keywords: verification,claims,evidence,provenance,trust,graph
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: networkx>=3.0
|
|
25
|
+
Requires-Dist: pydantic>=2.0
|
|
26
|
+
Provides-Extra: llm
|
|
27
|
+
Requires-Dist: openai>=1.0; extra == "llm"
|
|
28
|
+
Requires-Dist: google-genai>=1.0; extra == "llm"
|
|
29
|
+
Provides-Extra: geo
|
|
30
|
+
Requires-Dist: rasterio>=1.3; extra == "geo"
|
|
31
|
+
Requires-Dist: numpy>=1.24; extra == "geo"
|
|
32
|
+
Provides-Extra: all
|
|
33
|
+
Requires-Dist: graphite-engine[geo,llm]; extra == "all"
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
36
|
+
Requires-Dist: python-dotenv>=1.0; extra == "dev"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
<div align="center">
|
|
40
|
+
<h1>⛏️ Graphite</h1>
|
|
41
|
+
<p><strong>Claim verification engine for AI agent outputs.</strong></p>
|
|
42
|
+
<p><em>LLMs judge. Graphs remember.</em></p>
|
|
43
|
+
<p>Graphite extracts claims from agent-generated text, retrieves evidence, verifies support and contradiction, flags unsupported reasoning leaps, and stores every verdict with a full provenance trail — building a verification memory that gets stronger with every review.</p>
|
|
44
|
+
<p>
|
|
45
|
+
<a href="LICENSE"><img src="https://img.shields.io/badge/license-Apache--2.0-blue.svg" alt="License"></a>
|
|
46
|
+
<a href="https://python.org"><img src="https://img.shields.io/badge/python-3.10%2B-brightgreen.svg" alt="Python"></a>
|
|
47
|
+
</p>
|
|
48
|
+
</div>
|
|
49
|
+
|
|
50
|
+
> ⚠️ **v0.3.x — Experimental**. Usable and tested, but API may change before 1.0. Pin your version.
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
### How it works
|
|
55
|
+
|
|
56
|
+
Graphite turns raw agent output into a structured verification report.
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from graphite.pipeline import verify_agent_output
|
|
60
|
+
|
|
61
|
+
report = verify_agent_output(
|
|
62
|
+
text=agent_memo_markdown,
|
|
63
|
+
corpus=sec_filings_corpus,
|
|
64
|
+
model="gemini-2.5-flash" # any OpenAI-compatible model works
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
print(f"Supported: {report.supported_count} | Conflicted: {report.conflicted_count}")
|
|
68
|
+
print(f"Requires Human Review: {len(report.risky_claim_ids)} claims")
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
This single API wraps a 5-step pipeline:
|
|
72
|
+
|
|
73
|
+
1. **Extract**: Parses the document into atomic claims using LLMs.
|
|
74
|
+
2. **Retrieve**: Finds candidate evidence spans across the corpus for each claim.
|
|
75
|
+
3. **Verify**: Judges claims against the retrieved spans (Supported, Conflicted, Insufficient).
|
|
76
|
+
4. **Analyze**: Flags argument-level reasoning leaps (`CONCLUSION_JUMP`).
|
|
77
|
+
5. **Report**: Aggregates the findings into a `VerificationReport` with structured rationale, review flags, and full provenance.
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
### Handling Logic Leaps & Human Review
|
|
82
|
+
|
|
83
|
+
Downstream UI and review workflows can be built directly on top of the structured output.
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from graphite.claim import ArgumentVerdictEnum
|
|
87
|
+
|
|
88
|
+
# Check for logic leaps (Argument-level verification)
|
|
89
|
+
for argument in report.argument_verdicts:
|
|
90
|
+
if argument.verdict == ArgumentVerdictEnum.CONCLUSION_JUMP:
|
|
91
|
+
print(f"⚠️ LOGIC LEAP: {argument.text}")
|
|
92
|
+
|
|
93
|
+
# Route high-risk factual claims to a Human-in-the-loop review queue
|
|
94
|
+
for claim_id in report.risky_claim_ids:
|
|
95
|
+
verdict = report.get_verdict(claim_id)
|
|
96
|
+
if verdict.needs_human_review:
|
|
97
|
+
print(f"🚨 REVIEW NEEDED: {verdict.claim_text}")
|
|
98
|
+
print(f" Reason: {verdict.rationale.missing_evidence_reason or verdict.rationale.contradiction_type}")
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
### Quickstart
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
pip install "graphite-engine[llm]"
|
|
107
|
+
export GEMINI_API_KEY="your-api-key-here"
|
|
108
|
+
python examples/quickstart_verification/run.py
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Or from source:
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
git clone https://github.com/minjun1/graphite-core.git
|
|
115
|
+
cd graphite-core
|
|
116
|
+
pip install -e ".[llm]"
|
|
117
|
+
export GEMINI_API_KEY="your-api-key-here"
|
|
118
|
+
python examples/quickstart_verification/run.py
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Graphite defaults to Gemini via the OpenAI-compatible endpoint, so any OpenAI-compatible provider also works — including local models via Ollama or vLLM, or hosted endpoints like Together and Groq. Set `OPENAI_API_KEY` and `OPENAI_BASE_URL` to point at any compatible endpoint.
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## Why a Graph?
|
|
126
|
+
|
|
127
|
+
Most verification tools run once and forget. Graphite anchors every judgment into a persistent graph — turning disposable LLM outputs into a living verification memory.
|
|
128
|
+
|
|
129
|
+
**Claims are first-class objects.** The same assertion can be identified, revisited, and re-evaluated across documents and time — not lost in prompt logs.
|
|
130
|
+
|
|
131
|
+
**Evidence accumulates instead of being overwritten.** When a second source confirms (or contradicts) a claim, Graphite appends the new evidence to the existing node instead of starting from scratch.
|
|
132
|
+
|
|
133
|
+
**Review history becomes lineage.** AI verdict → analyst override → re-evaluation with new data — every step is recorded as a relationship in the graph, not a flat log entry.
|
|
134
|
+
|
|
135
|
+
**Cross-document deduplication.** When the same claim appears in TSMC's 10-K and Nvidia's 10-K, Graphite recognizes it as one canonical claim backed by two independent sources.
|
|
136
|
+
|
|
137
|
+
**Reasoning structure, not just fact-checking.** Claims don't exist in isolation. Graphite can represent claim-to-conclusion relationships, enabling checks like `CONCLUSION_JUMP` when the logical link between premises and conclusion is unsupported.
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## Stateful Verification Memory
|
|
142
|
+
|
|
143
|
+
Unlike stateless evaluators that produce a score and discard context, Graphite's `ClaimStore` builds a persistent fact base that strengthens over time.
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
Run 1: Extract "TSMC supplies CoWoS to Nvidia" from TSMC 10-K
|
|
147
|
+
→ 1 evidence source recorded
|
|
148
|
+
|
|
149
|
+
Run 2: Same claim found in Nvidia 10-K
|
|
150
|
+
→ evidence accumulates → 2 independent sources
|
|
151
|
+
|
|
152
|
+
Run 3: Exact duplicate from same source
|
|
153
|
+
→ deduplicated, no change
|
|
154
|
+
|
|
155
|
+
Run 4: Related claim "Nvidia depends on TSMC" extracted
|
|
156
|
+
→ cross-claim linkage via shared entities
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
What this looks like as a graph:
|
|
160
|
+
|
|
161
|
+
```
|
|
162
|
+
Claim: "TSMC supplies CoWoS to Nvidia"
|
|
163
|
+
├── supported_by → TSMC 10-K (cited span)
|
|
164
|
+
├── supported_by → Nvidia 10-K (cited span)
|
|
165
|
+
├── reviewed_as → SUPPORTED (model verdict)
|
|
166
|
+
└── contributes_to → "Nvidia depends on TSMC"
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
Each claim is a deduplicated node. Evidence merges across extraction runs. Analyst overrides persist. The result is a verification memory in which repeated reviews compound rather than repeat.
|
|
170
|
+
|
|
171
|
+
*Most verification tools forget. Graphite remembers — and gets stronger with every review.*
|
|
172
|
+
|
|
173
|
+
*(See `examples/evidence_accumulation/` for a runnable demo — no API keys required.)*
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Graphite vs. Existing Tools
|
|
178
|
+
|
|
179
|
+
*Evaluators grade your prompts. Graphite audits your agent's claims — and remembers every verdict.*
|
|
180
|
+
|
|
181
|
+
These tools solve adjacent but different problems:
|
|
182
|
+
|
|
183
|
+
| Dimension | Ragas / TruLens / DeepEval | Graphite |
|
|
184
|
+
|-----------|---------------------------|----------|
|
|
185
|
+
| **Purpose** | Prompt/model evaluation (CI/CD) | Production output verification (runtime) |
|
|
186
|
+
| **State** | Stateless — each run is independent | Stateful — evidence accumulates across runs |
|
|
187
|
+
| **Output** | Scores (faithfulness, relevance) | Structured `VerificationReport` with provenance |
|
|
188
|
+
| **Logic Leaps** | Not addressed | `CONCLUSION_JUMP` / `OVERSTATED` detection |
|
|
189
|
+
| **Human Review** | Manual review of score dashboards | `needs_human_review` routing with analyst override |
|
|
190
|
+
| **Audit Trail** | Execution logs | Every verdict links to exact `cited_span` with full lineage |
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## Evaluation Snapshot
|
|
195
|
+
|
|
196
|
+
Representative verification cases from our golden test suite (current snapshot run on GPT-4o; `evals/verify_eval.py`):
|
|
197
|
+
|
|
198
|
+
| Test Case | Type | Expected | Graphite Output |
|
|
199
|
+
|-----------|------|----------|-----------------|
|
|
200
|
+
| Paraphrased contradiction | Semantic | CONFLICTED | CONFLICTED |
|
|
201
|
+
| Numeric mismatch (10× error) | Factual | CONFLICTED | CONFLICTED |
|
|
202
|
+
| Temporal mismatch (stale CEO) | Temporal | CONFLICTED | CONFLICTED |
|
|
203
|
+
| Unsupported revenue prediction | Reasoning Leap | CONCLUSION_JUMP | CONCLUSION_JUMP |
|
|
204
|
+
|
|
205
|
+
- Claim-level verdict: correct in 3/3 factual cases
|
|
206
|
+
- Argument-level verdict: correct in 1/1 reasoning case
|
|
207
|
+
- These cases are intended as regression checks for key failure modes, not as a broad accuracy benchmark.
|
|
208
|
+
|
|
209
|
+
> *This is a representative snapshot, not a comprehensive benchmark. See `evals/` for the full test suite and `examples/` for runnable demos. A larger-scale evaluation suite (100+ memos) is on the roadmap.*
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## Core Primitives
|
|
214
|
+
|
|
215
|
+
| Object | What it does |
|
|
216
|
+
|-----------|-------------|
|
|
217
|
+
| `VerificationReport` | Top-level summary of the entire review, ready for product UI integrations |
|
|
218
|
+
| `Verdict` | Claim-level judgment (`SUPPORTED`, `CONFLICTED`, `INSUFFICIENT`) with structured rationale |
|
|
219
|
+
| `ArgumentVerdict` | Argument-level judgment (`GROUNDED`, `CONCLUSION_JUMP`, `OVERSTATED`) |
|
|
220
|
+
| `ClaimStore` | Persistent verification memory — deduplicates claims, merges evidence, and preserves review history across runs |
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
## Reference Applications
|
|
225
|
+
|
|
226
|
+
Graphite is designed as the verification engine for high-stakes workflows across multiple domains:
|
|
227
|
+
|
|
228
|
+
- **Compliance & Legal Review**: Checking internal policy documents or marketing copy against regulatory guidelines.
|
|
229
|
+
- **Healthcare & Scientific Fact-checking**: Cross-referencing generated medical or scientific summaries against peer-reviewed journals.
|
|
230
|
+
- **Investment & Research QA**: Verifying AI-generated analyst memos against SEC filings or earnings call transcripts.
|
|
231
|
+
|
|
232
|
+
*(See `examples/quickstart_verification/` for end-to-end verification, `examples/evidence_accumulation/` for stateful memory, and `examples/lineage_override_demo/` for analyst override workflows.)*
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## Optional extras
|
|
237
|
+
|
|
238
|
+
**Core** (always included): `networkx` + `pydantic`
|
|
239
|
+
|
|
240
|
+
```bash
|
|
241
|
+
pip install -e ".[llm]" # LLM support (OpenAI-compatible providers)
|
|
242
|
+
pip install -e ".[all]" # Everything
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
> Set `GEMINI_API_KEY` to get started. To use other providers, set `OPENAI_API_KEY` and `OPENAI_BASE_URL`.
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## License
|
|
250
|
+
|
|
251
|
+
Apache-2.0 — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<h1>⛏️ Graphite</h1>
|
|
3
|
+
<p><strong>Claim verification engine for AI agent outputs.</strong></p>
|
|
4
|
+
<p><em>LLMs judge. Graphs remember.</em></p>
|
|
5
|
+
<p>Graphite extracts claims from agent-generated text, retrieves evidence, verifies support and contradiction, flags unsupported reasoning leaps, and stores every verdict with a full provenance trail — building a verification memory that gets stronger with every review.</p>
|
|
6
|
+
<p>
|
|
7
|
+
<a href="LICENSE"><img src="https://img.shields.io/badge/license-Apache--2.0-blue.svg" alt="License"></a>
|
|
8
|
+
<a href="https://python.org"><img src="https://img.shields.io/badge/python-3.10%2B-brightgreen.svg" alt="Python"></a>
|
|
9
|
+
</p>
|
|
10
|
+
</div>
|
|
11
|
+
|
|
12
|
+
> ⚠️ **v0.3.x — Experimental**. Usable and tested, but API may change before 1.0. Pin your version.
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
### How it works
|
|
17
|
+
|
|
18
|
+
Graphite turns raw agent output into a structured verification report.
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from graphite.pipeline import verify_agent_output
|
|
22
|
+
|
|
23
|
+
report = verify_agent_output(
|
|
24
|
+
text=agent_memo_markdown,
|
|
25
|
+
corpus=sec_filings_corpus,
|
|
26
|
+
model="gemini-2.5-flash" # any OpenAI-compatible model works
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
print(f"Supported: {report.supported_count} | Conflicted: {report.conflicted_count}")
|
|
30
|
+
print(f"Requires Human Review: {len(report.risky_claim_ids)} claims")
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
This single API wraps a 5-step pipeline:
|
|
34
|
+
|
|
35
|
+
1. **Extract**: Parses the document into atomic claims using LLMs.
|
|
36
|
+
2. **Retrieve**: Finds candidate evidence spans across the corpus for each claim.
|
|
37
|
+
3. **Verify**: Judges claims against the retrieved spans (Supported, Conflicted, Insufficient).
|
|
38
|
+
4. **Analyze**: Flags argument-level reasoning leaps (`CONCLUSION_JUMP`).
|
|
39
|
+
5. **Report**: Aggregates the findings into a `VerificationReport` with structured rationale, review flags, and full provenance.
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
### Handling Logic Leaps & Human Review
|
|
44
|
+
|
|
45
|
+
Downstream UI and review workflows can be built directly on top of the structured output.
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from graphite.claim import ArgumentVerdictEnum
|
|
49
|
+
|
|
50
|
+
# Check for logic leaps (Argument-level verification)
|
|
51
|
+
for argument in report.argument_verdicts:
|
|
52
|
+
if argument.verdict == ArgumentVerdictEnum.CONCLUSION_JUMP:
|
|
53
|
+
print(f"⚠️ LOGIC LEAP: {argument.text}")
|
|
54
|
+
|
|
55
|
+
# Route high-risk factual claims to a Human-in-the-loop review queue
|
|
56
|
+
for claim_id in report.risky_claim_ids:
|
|
57
|
+
verdict = report.get_verdict(claim_id)
|
|
58
|
+
if verdict.needs_human_review:
|
|
59
|
+
print(f"🚨 REVIEW NEEDED: {verdict.claim_text}")
|
|
60
|
+
print(f" Reason: {verdict.rationale.missing_evidence_reason or verdict.rationale.contradiction_type}")
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
### Quickstart
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install "graphite-engine[llm]"
|
|
69
|
+
export GEMINI_API_KEY="your-api-key-here"
|
|
70
|
+
python examples/quickstart_verification/run.py
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Or from source:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
git clone https://github.com/minjun1/graphite-core.git
|
|
77
|
+
cd graphite-core
|
|
78
|
+
pip install -e ".[llm]"
|
|
79
|
+
export GEMINI_API_KEY="your-api-key-here"
|
|
80
|
+
python examples/quickstart_verification/run.py
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Graphite defaults to Gemini via the OpenAI-compatible endpoint, so any OpenAI-compatible provider also works — including local models via Ollama or vLLM, or hosted endpoints like Together and Groq. Set `OPENAI_API_KEY` and `OPENAI_BASE_URL` to point at any compatible endpoint.
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Why a Graph?
|
|
88
|
+
|
|
89
|
+
Most verification tools run once and forget. Graphite anchors every judgment into a persistent graph — turning disposable LLM outputs into a living verification memory.
|
|
90
|
+
|
|
91
|
+
**Claims are first-class objects.** The same assertion can be identified, revisited, and re-evaluated across documents and time — not lost in prompt logs.
|
|
92
|
+
|
|
93
|
+
**Evidence accumulates instead of being overwritten.** When a second source confirms (or contradicts) a claim, Graphite appends the new evidence to the existing node instead of starting from scratch.
|
|
94
|
+
|
|
95
|
+
**Review history becomes lineage.** AI verdict → analyst override → re-evaluation with new data — every step is recorded as a relationship in the graph, not a flat log entry.
|
|
96
|
+
|
|
97
|
+
**Cross-document deduplication.** When the same claim appears in TSMC's 10-K and Nvidia's 10-K, Graphite recognizes it as one canonical claim backed by two independent sources.
|
|
98
|
+
|
|
99
|
+
**Reasoning structure, not just fact-checking.** Claims don't exist in isolation. Graphite can represent claim-to-conclusion relationships, enabling checks like `CONCLUSION_JUMP` when the logical link between premises and conclusion is unsupported.
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Stateful Verification Memory
|
|
104
|
+
|
|
105
|
+
Unlike stateless evaluators that produce a score and discard context, Graphite's `ClaimStore` builds a persistent fact base that strengthens over time.
|
|
106
|
+
|
|
107
|
+
```
|
|
108
|
+
Run 1: Extract "TSMC supplies CoWoS to Nvidia" from TSMC 10-K
|
|
109
|
+
→ 1 evidence source recorded
|
|
110
|
+
|
|
111
|
+
Run 2: Same claim found in Nvidia 10-K
|
|
112
|
+
→ evidence accumulates → 2 independent sources
|
|
113
|
+
|
|
114
|
+
Run 3: Exact duplicate from same source
|
|
115
|
+
→ deduplicated, no change
|
|
116
|
+
|
|
117
|
+
Run 4: Related claim "Nvidia depends on TSMC" extracted
|
|
118
|
+
→ cross-claim linkage via shared entities
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
What this looks like as a graph:
|
|
122
|
+
|
|
123
|
+
```
|
|
124
|
+
Claim: "TSMC supplies CoWoS to Nvidia"
|
|
125
|
+
├── supported_by → TSMC 10-K (cited span)
|
|
126
|
+
├── supported_by → Nvidia 10-K (cited span)
|
|
127
|
+
├── reviewed_as → SUPPORTED (model verdict)
|
|
128
|
+
└── contributes_to → "Nvidia depends on TSMC"
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Each claim is a deduplicated node. Evidence merges across extraction runs. Analyst overrides persist. The result is a verification memory in which repeated reviews compound rather than repeat.
|
|
132
|
+
|
|
133
|
+
*Most verification tools forget. Graphite remembers — and gets stronger with every review.*
|
|
134
|
+
|
|
135
|
+
*(See `examples/evidence_accumulation/` for a runnable demo — no API keys required.)*
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## Graphite vs. Existing Tools
|
|
140
|
+
|
|
141
|
+
*Evaluators grade your prompts. Graphite audits your agent's claims — and remembers every verdict.*
|
|
142
|
+
|
|
143
|
+
These tools solve adjacent but different problems:
|
|
144
|
+
|
|
145
|
+
| Dimension | Ragas / TruLens / DeepEval | Graphite |
|
|
146
|
+
|-----------|---------------------------|----------|
|
|
147
|
+
| **Purpose** | Prompt/model evaluation (CI/CD) | Production output verification (runtime) |
|
|
148
|
+
| **State** | Stateless — each run is independent | Stateful — evidence accumulates across runs |
|
|
149
|
+
| **Output** | Scores (faithfulness, relevance) | Structured `VerificationReport` with provenance |
|
|
150
|
+
| **Logic Leaps** | Not addressed | `CONCLUSION_JUMP` / `OVERSTATED` detection |
|
|
151
|
+
| **Human Review** | Manual review of score dashboards | `needs_human_review` routing with analyst override |
|
|
152
|
+
| **Audit Trail** | Execution logs | Every verdict links to exact `cited_span` with full lineage |
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Evaluation Snapshot
|
|
157
|
+
|
|
158
|
+
Representative verification cases from our golden test suite (current snapshot run on GPT-4o; `evals/verify_eval.py`):
|
|
159
|
+
|
|
160
|
+
| Test Case | Type | Expected | Graphite Output |
|
|
161
|
+
|-----------|------|----------|-----------------|
|
|
162
|
+
| Paraphrased contradiction | Semantic | CONFLICTED | CONFLICTED |
|
|
163
|
+
| Numeric mismatch (10× error) | Factual | CONFLICTED | CONFLICTED |
|
|
164
|
+
| Temporal mismatch (stale CEO) | Temporal | CONFLICTED | CONFLICTED |
|
|
165
|
+
| Unsupported revenue prediction | Reasoning Leap | CONCLUSION_JUMP | CONCLUSION_JUMP |
|
|
166
|
+
|
|
167
|
+
- Claim-level verdict: correct in 3/3 factual cases
|
|
168
|
+
- Argument-level verdict: correct in 1/1 reasoning case
|
|
169
|
+
- These cases are intended as regression checks for key failure modes, not as a broad accuracy benchmark.
|
|
170
|
+
|
|
171
|
+
> *This is a representative snapshot, not a comprehensive benchmark. See `evals/` for the full test suite and `examples/` for runnable demos. A larger-scale evaluation suite (100+ memos) is on the roadmap.*
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## Core Primitives
|
|
176
|
+
|
|
177
|
+
| Object | What it does |
|
|
178
|
+
|-----------|-------------|
|
|
179
|
+
| `VerificationReport` | Top-level summary of the entire review, ready for product UI integrations |
|
|
180
|
+
| `Verdict` | Claim-level judgment (`SUPPORTED`, `CONFLICTED`, `INSUFFICIENT`) with structured rationale |
|
|
181
|
+
| `ArgumentVerdict` | Argument-level judgment (`GROUNDED`, `CONCLUSION_JUMP`, `OVERSTATED`) |
|
|
182
|
+
| `ClaimStore` | Persistent verification memory — deduplicates claims, merges evidence, and preserves review history across runs |
|
|
183
|
+
|
|
184
|
+
---
|
|
185
|
+
|
|
186
|
+
## Reference Applications
|
|
187
|
+
|
|
188
|
+
Graphite is designed as the verification engine for high-stakes workflows across multiple domains:
|
|
189
|
+
|
|
190
|
+
- **Compliance & Legal Review**: Checking internal policy documents or marketing copy against regulatory guidelines.
|
|
191
|
+
- **Healthcare & Scientific Fact-checking**: Cross-referencing generated medical or scientific summaries against peer-reviewed journals.
|
|
192
|
+
- **Investment & Research QA**: Verifying AI-generated analyst memos against SEC filings or earnings call transcripts.
|
|
193
|
+
|
|
194
|
+
*(See `examples/quickstart_verification/` for end-to-end verification, `examples/evidence_accumulation/` for stateful memory, and `examples/lineage_override_demo/` for analyst override workflows.)*
|
|
195
|
+
|
|
196
|
+
---
|
|
197
|
+
|
|
198
|
+
## Optional extras
|
|
199
|
+
|
|
200
|
+
**Core** (always included): `networkx` + `pydantic`
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
pip install -e ".[llm]" # LLM support (OpenAI-compatible providers)
|
|
204
|
+
pip install -e ".[all]" # Everything
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
> Set `GEMINI_API_KEY` to get started. To use other providers, set `OPENAI_API_KEY` and `OPENAI_BASE_URL`.
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
## License
|
|
212
|
+
|
|
213
|
+
Apache-2.0 — see [LICENSE](LICENSE).
|
|
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "graphite-engine"
|
|
7
|
-
version = "0.3.0"
|
|
8
|
-
description = "Open-source claim verification engine for high-stakes
|
|
7
|
+
version = "0.3.2"
|
|
8
|
+
description = "Open-source claim verification engine for agent-generated assertions in high-stakes domains"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
11
11
|
license = {text = "Apache-2.0"}
|
|
@@ -28,24 +28,23 @@ dependencies = [
|
|
|
28
28
|
]
|
|
29
29
|
|
|
30
30
|
[project.optional-dependencies]
|
|
31
|
-
|
|
32
|
-
llm = ["google-genai>=1.0"]
|
|
33
|
-
pdf = ["pdfplumber>=0.9"]
|
|
31
|
+
llm = ["openai>=1.0", "google-genai>=1.0"]
|
|
34
32
|
geo = ["rasterio>=1.3", "numpy>=1.24"]
|
|
35
|
-
all = ["graphite-engine[
|
|
33
|
+
all = ["graphite-engine[llm,geo]"]
|
|
36
34
|
dev = [
|
|
37
35
|
"pytest>=8.0",
|
|
38
36
|
"python-dotenv>=1.0",
|
|
39
37
|
]
|
|
40
38
|
|
|
41
39
|
[project.urls]
|
|
42
|
-
Homepage = "https://github.com/
|
|
43
|
-
Documentation = "https://github.com/
|
|
44
|
-
Repository = "https://github.com/
|
|
45
|
-
Issues = "https://github.com/
|
|
40
|
+
Homepage = "https://github.com/minjun1/graphite-core"
|
|
41
|
+
Documentation = "https://github.com/minjun1/graphite-core#readme"
|
|
42
|
+
Repository = "https://github.com/minjun1/graphite-core"
|
|
43
|
+
Issues = "https://github.com/minjun1/graphite-core/issues"
|
|
46
44
|
|
|
47
45
|
[tool.setuptools.packages.find]
|
|
48
46
|
where = ["src"]
|
|
47
|
+
exclude = ["graphite._archive", "graphite._archive.*"]
|
|
49
48
|
|
|
50
49
|
[tool.setuptools.package-data]
|
|
51
50
|
graphite = ["py.typed"]
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Graphite — Open-source claim verification engine for agent-generated
|
|
3
|
+
assertions in high-stakes domains.
|
|
4
|
+
|
|
5
|
+
Core primitives:
|
|
6
|
+
- Claim: the atomic unit of trust — structured assertion with provenance
|
|
7
|
+
- ClaimStore: evidence-accumulating registry (claims dedupe, evidence merges)
|
|
8
|
+
- Provenance: first-class evidence source (document, quote, confidence)
|
|
9
|
+
- ConfidenceScorer: explainable confidence scoring with named factors
|
|
10
|
+
|
|
11
|
+
Pipeline:
|
|
12
|
+
Agent/Extractor → Claim[] → ClaimStore (accumulate) → Verify
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
# ── Core schemas ──
|
|
16
|
+
from .schemas import ExtractedEdge, NodeRef, Provenance, InferenceBasis, ExtractionError
|
|
17
|
+
from .enums import (
|
|
18
|
+
EdgeType,
|
|
19
|
+
NodeType,
|
|
20
|
+
SourceType,
|
|
21
|
+
ConfidenceLevel,
|
|
22
|
+
AssertionMode,
|
|
23
|
+
EvidenceType,
|
|
24
|
+
)
|
|
25
|
+
from .evidence import EvidencePacket, EvidenceData
|
|
26
|
+
|
|
27
|
+
# ── Trust engine primitives ──
|
|
28
|
+
from .claim import (
|
|
29
|
+
Claim,
|
|
30
|
+
ClaimType,
|
|
31
|
+
ClaimStatus,
|
|
32
|
+
ClaimGranularity,
|
|
33
|
+
ReviewState,
|
|
34
|
+
ClaimOrigin,
|
|
35
|
+
)
|
|
36
|
+
from .claim import ConfidenceFactor, ConfidenceResult
|
|
37
|
+
from .claim_store import ClaimStore
|
|
38
|
+
from .confidence import ConfidenceScorer
|
|
39
|
+
|
|
40
|
+
# ── Domain plugin contracts ──
|
|
41
|
+
from .domain import (
|
|
42
|
+
BaseFetcher,
|
|
43
|
+
BaseExtractor,
|
|
44
|
+
BasePipeline,
|
|
45
|
+
DocumentContext,
|
|
46
|
+
DomainSpec,
|
|
47
|
+
)
|
|
48
|
+
from .domain import register_domain, get_domain, list_domains
|
|
49
|
+
|
|
50
|
+
# ── Rules ──
|
|
51
|
+
from .rules import BaseRuleEngine, RuleResult, ScoreBreakdown
|
|
52
|
+
|
|
53
|
+
__version__ = "0.3.2"
|
|
@@ -12,6 +12,7 @@ AlphaEarth Foundations:
|
|
|
12
12
|
- Earth Engine dataset: GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL
|
|
13
13
|
- GCS bucket: gs://alphaearth_foundations (Requester Pays)
|
|
14
14
|
"""
|
|
15
|
+
|
|
15
16
|
import json
|
|
16
17
|
import os
|
|
17
18
|
from pathlib import Path
|
|
@@ -122,7 +123,9 @@ class AlphaEarthAdapter:
|
|
|
122
123
|
Returns:
|
|
123
124
|
numpy array of shape (64,)
|
|
124
125
|
"""
|
|
125
|
-
cache_key =
|
|
126
|
+
cache_key = (
|
|
127
|
+
node_id or f"bbox_{bbox[0]:.4f}_{bbox[1]:.4f}_{bbox[2]:.4f}_{bbox[3]:.4f}"
|
|
128
|
+
)
|
|
126
129
|
|
|
127
130
|
cached = self._read_cache(cache_key, year)
|
|
128
131
|
if cached is not None:
|