faithgate 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faithgate-0.1.0/.github/workflows/eval-gate.yml +68 -0
- faithgate-0.1.0/.github/workflows/licenses.yml +34 -0
- faithgate-0.1.0/.github/workflows/release.yml +27 -0
- faithgate-0.1.0/.github/workflows/tests.yml +19 -0
- faithgate-0.1.0/.gitignore +30 -0
- faithgate-0.1.0/DESIGN.md +94 -0
- faithgate-0.1.0/LICENSE +21 -0
- faithgate-0.1.0/PKG-INFO +251 -0
- faithgate-0.1.0/README.md +225 -0
- faithgate-0.1.0/ROADMAP.md +49 -0
- faithgate-0.1.0/assets/logo.svg +23 -0
- faithgate-0.1.0/examples/demo_gate.py +73 -0
- faithgate-0.1.0/examples/rag_app/README.md +23 -0
- faithgate-0.1.0/examples/rag_app/app.py +58 -0
- faithgate-0.1.0/examples/rag_app/docs/asteroid-belt.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/comets.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/earth.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/europa.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/hubble.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/iss.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/jupiter.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/mars-rovers.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/mars.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/mercury-mission.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/mercury.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/moon.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/neptune.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/pluto.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/saturn.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/sun.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/titan.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/uranus.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/venus.md +2 -0
- faithgate-0.1.0/examples/rag_app/docs/voyager.md +2 -0
- faithgate-0.1.0/examples/rag_app/retriever.py +34 -0
- faithgate-0.1.0/examples/rag_app/suite_baseline.jsonl +12 -0
- faithgate-0.1.0/examples/rag_app/suite_candidate.jsonl +12 -0
- faithgate-0.1.0/examples/rag_app/suite_regressed.jsonl +12 -0
- faithgate-0.1.0/faithgate/__init__.py +7 -0
- faithgate-0.1.0/faithgate/calibrate/__init__.py +0 -0
- faithgate-0.1.0/faithgate/calibrate/calibrate.py +101 -0
- faithgate-0.1.0/faithgate/calibrate/goldens/faithfulness.jsonl +40 -0
- faithgate-0.1.0/faithgate/gate/__init__.py +0 -0
- faithgate-0.1.0/faithgate/gate/cli.py +394 -0
- faithgate-0.1.0/faithgate/gate/diff.py +202 -0
- faithgate-0.1.0/faithgate/ingest/__init__.py +0 -0
- faithgate-0.1.0/faithgate/ingest/decorator.py +49 -0
- faithgate-0.1.0/faithgate/ingest/openinference.py +96 -0
- faithgate-0.1.0/faithgate/keys.py +23 -0
- faithgate-0.1.0/faithgate/panel/__init__.py +0 -0
- faithgate-0.1.0/faithgate/panel/server.py +203 -0
- faithgate-0.1.0/faithgate/score/__init__.py +0 -0
- faithgate-0.1.0/faithgate/score/judges.py +58 -0
- faithgate-0.1.0/faithgate/score/scorer.py +166 -0
- faithgate-0.1.0/faithgate/score/worker.py +91 -0
- faithgate-0.1.0/faithgate/store/__init__.py +0 -0
- faithgate-0.1.0/faithgate/store/db.py +59 -0
- faithgate-0.1.0/faithgate/store/schema.sql +108 -0
- faithgate-0.1.0/pyproject.toml +44 -0
- faithgate-0.1.0/tests/test_calibrate.py +53 -0
- faithgate-0.1.0/tests/test_diff.py +119 -0
- faithgate-0.1.0/tests/test_gate_cli.py +182 -0
- faithgate-0.1.0/tests/test_ingest.py +73 -0
- faithgate-0.1.0/tests/test_ragas_contract.py +61 -0
- faithgate-0.1.0/tests/test_run.py +67 -0
- faithgate-0.1.0/tests/test_scorer.py +41 -0
- faithgate-0.1.0/tests/test_store.py +70 -0
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
name: eval-gate
|
|
2
|
+
|
|
3
|
+
# Two jobs, both offline/deterministic (heuristic judge, no API key):
|
|
4
|
+
# gate — baseline vs candidate must stay green (the normal PR gate)
|
|
5
|
+
# proves-detection — baseline vs a suite with PLANTED hallucinations must go RED;
|
|
6
|
+
# the step INVERTS the exit code, so this job failing means the gate
|
|
7
|
+
# LOST its ability to catch regressions. A green badge here is proof.
|
|
8
|
+
# To score with the real Claude judge instead: pip install ".[claude]", swap --judge heuristic for --judge claude,
|
|
9
|
+
# and set the ANTHROPIC_API_KEY repository secret.
|
|
10
|
+
|
|
11
|
+
on:
|
|
12
|
+
push:
|
|
13
|
+
pull_request:
|
|
14
|
+
|
|
15
|
+
jobs:
|
|
16
|
+
gate:
|
|
17
|
+
runs-on: ubuntu-latest
|
|
18
|
+
env:
|
|
19
|
+
PYTHONPATH: .
|
|
20
|
+
steps:
|
|
21
|
+
- uses: actions/checkout@v4
|
|
22
|
+
- uses: actions/setup-python@v5
|
|
23
|
+
with:
|
|
24
|
+
python-version: "3.11"
|
|
25
|
+
|
|
26
|
+
- name: Score the baseline version
|
|
27
|
+
run: python -m faithgate.gate.cli --db ci.db run --suite examples/rag_app/suite_baseline.jsonl --label baseline --judge heuristic
|
|
28
|
+
|
|
29
|
+
- name: Score the candidate version
|
|
30
|
+
run: python -m faithgate.gate.cli --db ci.db run --suite examples/rag_app/suite_candidate.jsonl --label candidate --judge heuristic
|
|
31
|
+
|
|
32
|
+
- name: Gate — fail if faithfulness regressed
|
|
33
|
+
run: |
|
|
34
|
+
set +e
|
|
35
|
+
python -m faithgate.gate.cli --db ci.db gate --base baseline --head candidate \
|
|
36
|
+
--max-regression 0.05 --min-score 0.5 | tee gate.txt
|
|
37
|
+
status=${PIPESTATUS[0]}
|
|
38
|
+
{ echo '### faithgate — baseline vs candidate'; echo '```'; cat gate.txt; echo '```'; } >> "$GITHUB_STEP_SUMMARY"
|
|
39
|
+
exit "$status"
|
|
40
|
+
|
|
41
|
+
proves-detection:
|
|
42
|
+
runs-on: ubuntu-latest
|
|
43
|
+
env:
|
|
44
|
+
PYTHONPATH: .
|
|
45
|
+
steps:
|
|
46
|
+
- uses: actions/checkout@v4
|
|
47
|
+
- uses: actions/setup-python@v5
|
|
48
|
+
with:
|
|
49
|
+
python-version: "3.11"
|
|
50
|
+
|
|
51
|
+
- name: Score the baseline version
|
|
52
|
+
run: python -m faithgate.gate.cli --db proof.db run --suite examples/rag_app/suite_baseline.jsonl --label baseline --judge heuristic
|
|
53
|
+
|
|
54
|
+
- name: Score a version with PLANTED hallucinations
|
|
55
|
+
run: python -m faithgate.gate.cli --db proof.db run --suite examples/rag_app/suite_regressed.jsonl --label regressed --judge heuristic
|
|
56
|
+
|
|
57
|
+
- name: Assert the gate catches the planted regression (inverted exit)
|
|
58
|
+
run: |
|
|
59
|
+
set +e
|
|
60
|
+
python -m faithgate.gate.cli --db proof.db gate --base baseline --head regressed \
|
|
61
|
+
--max-regression 0.05 --min-score 0.5 | tee proof.txt
|
|
62
|
+
status=${PIPESTATUS[0]}
|
|
63
|
+
{ echo '### faithgate — planted-regression detection proof'; echo '```'; cat proof.txt; echo '```'; } >> "$GITHUB_STEP_SUMMARY"
|
|
64
|
+
if [ "$status" -eq 0 ]; then
|
|
65
|
+
echo '::error::the gate FAILED to catch a known planted regression'
|
|
66
|
+
exit 1
|
|
67
|
+
fi
|
|
68
|
+
echo "gate correctly went red (exit $status) — detection proven"
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
name: licenses
|
|
2
|
+
|
|
3
|
+
# Guards two promises at once:
|
|
4
|
+
# 1. MIT cleanliness — fails if any dependency ships under a non-permissive license
|
|
5
|
+
# (e.g. Elastic License 2.0 like Phoenix Evals, or SSPL).
|
|
6
|
+
# 2. The RAGAS contract — installs the [claude] extra and runs the contract tests, so an
|
|
7
|
+
# incompatible ragas release breaks THIS job loudly instead of leaving runs silently green.
|
|
8
|
+
|
|
9
|
+
on:
|
|
10
|
+
push:
|
|
11
|
+
pull_request:
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
scan:
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
- uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.11"
|
|
21
|
+
- name: Install package with the claude extra + license scanner
|
|
22
|
+
run: |
|
|
23
|
+
python -m pip install --upgrade pip
|
|
24
|
+
pip install ".[claude]"
|
|
25
|
+
pip install pip-licenses
|
|
26
|
+
- name: Entry point smoke
|
|
27
|
+
run: faithgate --help
|
|
28
|
+
- name: RAGAS contract tests (run only where ragas is installed)
|
|
29
|
+
run: python -m unittest tests.test_ragas_contract -v
|
|
30
|
+
- name: Fail on non-permissive licenses
|
|
31
|
+
run: |
|
|
32
|
+
pip-licenses --format=csv --with-system \
|
|
33
|
+
| grep -Ei 'elastic|sspl|server side public|business source|bsl' \
|
|
34
|
+
&& { echo "Non-permissive license found"; exit 1; } || echo "All dependency licenses are permissive."
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
name: release
|
|
2
|
+
|
|
3
|
+
# Publishes to PyPI on a version tag via TRUSTED PUBLISHING (OIDC) — no long-lived API token
|
|
4
|
+
# stored anywhere. One-time setup on pypi.org: add this repo + workflow as a trusted publisher
|
|
5
|
+
# for the 'faithgate' project (environment: pypi).
|
|
6
|
+
|
|
7
|
+
on:
|
|
8
|
+
push:
|
|
9
|
+
tags: ["v*"]
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
pypi:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
environment: pypi
|
|
15
|
+
permissions:
|
|
16
|
+
id-token: write
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
- uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: "3.11"
|
|
22
|
+
- name: Build sdist + wheel
|
|
23
|
+
run: |
|
|
24
|
+
python -m pip install --upgrade pip build
|
|
25
|
+
python -m build
|
|
26
|
+
- name: Publish to PyPI (trusted publishing)
|
|
27
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
name: tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
pull_request:
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
test:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
strategy:
|
|
11
|
+
matrix:
|
|
12
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
- uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: ${{ matrix.python-version }}
|
|
18
|
+
- name: Run unit tests (stdlib only — no install needed)
|
|
19
|
+
run: python -m unittest discover -s tests -v
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Secrets — the user's own API key lives here, NEVER commit it.
|
|
2
|
+
.env
|
|
3
|
+
.env.*
|
|
4
|
+
|
|
5
|
+
# Local-only working docs — internal notes, never part of the public repo.
|
|
6
|
+
PLAN.md
|
|
7
|
+
HANDOFF.md
|
|
8
|
+
AUDIT_*.md
|
|
9
|
+
|
|
10
|
+
# Local trace store
|
|
11
|
+
*.db
|
|
12
|
+
*.db-wal
|
|
13
|
+
*.db-shm
|
|
14
|
+
.faithgate/
|
|
15
|
+
|
|
16
|
+
# Python
|
|
17
|
+
__pycache__/
|
|
18
|
+
*.py[cod]
|
|
19
|
+
.venv/
|
|
20
|
+
venv/
|
|
21
|
+
dist/
|
|
22
|
+
build/
|
|
23
|
+
*.egg-info/
|
|
24
|
+
.pytest_cache/
|
|
25
|
+
.ruff_cache/
|
|
26
|
+
|
|
27
|
+
# OS / editors
|
|
28
|
+
.DS_Store
|
|
29
|
+
.idea/
|
|
30
|
+
.vscode/
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# FaithGate — design rationale
|
|
2
|
+
|
|
3
|
+
Why the tool is shaped the way it is: the locked decisions, the traps they avoid, and what was
|
|
4
|
+
deliberately cut. Companion to [README.md](README.md) (what it does) and [ROADMAP.md](ROADMAP.md)
|
|
5
|
+
(what comes next).
|
|
6
|
+
|
|
7
|
+
## The one principle
|
|
8
|
+
|
|
9
|
+
**Never present a number as more certain than it is.** Every design call below is this principle
|
|
10
|
+
applied somewhere: to the judge, to the gate verdict, to the docs, to the dependency list.
|
|
11
|
+
|
|
12
|
+
## Positioning
|
|
13
|
+
|
|
14
|
+
The "simple + local + eval-first" lane is well served (promptfoo, Arize Phoenix, DeepEval — see the
|
|
15
|
+
README's comparison). FaithGate deliberately does NOT compete there. It is a *complement* with one
|
|
16
|
+
job: **fail-closed blocking of faithfulness regressions**, version-to-version, with measured honesty
|
|
17
|
+
about the judge that produced every number.
|
|
18
|
+
|
|
19
|
+
## Locked decisions
|
|
20
|
+
|
|
21
|
+
| # | Decision | Why |
|
|
22
|
+
|---|---|---|
|
|
23
|
+
| D1 | No per-claim breakdown in v1 | RAGAS publicly returns only a scalar; scraping its internals for per-claim verdicts would be a fragile reimplementation that breaks on minor releases. The panel shows the score + a one-line reason and stays honest. (RAGAS's newer collections API exposes `.reason` — the v3 path.) |
|
|
24
|
+
| D2 | "Local" mode = `local-verification`, precisely labeled | RAGAS's HHEM variant still requires an LLM for claim *extraction* — HHEM only replaces the entailment check. A "fully offline trusted judge" cannot honestly be delivered, so it isn't advertised. |
|
|
25
|
+
| D3 | Frontier judge (Claude) as the trusted default | Laptop-size local models score ~61% on faithfulness benchmarks — near coin-flip on hard cases. The offline heuristic exists so the pipeline runs keyless, but it is deliberately distrusted and its blindness is asserted by a unit test. |
|
|
26
|
+
| D4 | Capture = span ingest + `capture()` helper + suite files | No custom proxy (single point of failure in the user's request path) and no per-provider SDK (unmaintainable). The ingest adapter is one file, isolating the still-experimental GenAI semconv; a startup self-test fails loudly if extraction breaks. |
|
|
27
|
+
| D5 | SQLite + WAL, single file | Single-writer is a non-issue for a single-user tool. Analytical layers (DuckDB etc.) were cut when plain SQL proved sufficient — see "Changed during build". |
|
|
28
|
+
| D6 | Embed RAGAS; never re-implement metric math | The metric is battle-tested and citable. This project's value is the harness around it: the gate, the diffing, the calibration, the honesty surfaces. A CI contract-test job imports the exact RAGAS surface used, so an incompatible release breaks loudly. |
|
|
29
|
+
| D7 | Abstention is a distinct state | NaN / zero-extracted-statements / judge errors are never a `0.0` and never a regression — but if *everything* abstained, the gate fails closed instead of blessing an unjudged run. |
|
|
30
|
+
| D8 | Manifests + judge-change guard | Every run pins judge id/model/kind, RAGAS version, runner version, suite hash. The gate refuses to compare runs whose judges differ (exit 3): a judge swap must never masquerade as a model regression. Corrupted manifests fail closed — corruption is not absence. |
|
|
31
|
+
| D9 | Content-keyed diffing | Runs are matched by `SHA256(normalized question + contexts)` (or an explicit per-case `id`), never by row id — so the gate distinguishes "same case, score moved" from "different case". |
|
|
32
|
+
| D10 | Zero-dependency base install | Capture, the offline judge, the gate, and the stdlib web panel run with no third-party packages. RAGAS + the Claude client live in the `[claude]` extra; torch/HHEM in `[local]`. |
|
|
33
|
+
| D11 | Version identity at capture time | Every trace can carry a `prompt_version_id` (see `faithgate.keys.version_key`) — the join key for the replay/drift layers on the roadmap. System-under-test identity is kept separate from judge identity: merging them would destroy the "did my app change or did my judge change?" distinction. |
|
|
34
|
+
|
|
35
|
+
## Gate semantics (fail-closed)
|
|
36
|
+
|
|
37
|
+
The verdict is a function of *(matched, regressed, abstained, new, missing, duplicates)* — not just
|
|
38
|
+
"any regressions?":
|
|
39
|
+
|
|
40
|
+
- **Zero matched cases → FAIL** ("nothing compared"). A renamed suite, an unscored run, or 100%
|
|
41
|
+
abstention can never turn CI green.
|
|
42
|
+
- **The score floor guards every scored case** — new cases and abstained-baseline cases included.
|
|
43
|
+
- **Duplicates:** the head keeps the *lowest* duplicate score (strict on the new version); the
|
|
44
|
+
baseline keeps the *highest* (a stray low baseline entry must not quietly lower the bar).
|
|
45
|
+
- **Policy knobs** close the quiet channels: `--fail-on-missing` (deleted cases), `--max-abstained`
|
|
46
|
+
(targeted abstention). Exit codes: 0 ok · 1 fail · 2 usage/input error · 3 judge changed.
|
|
47
|
+
- Known, documented limitation: without an explicit `id`, rewording a question mints a new case —
|
|
48
|
+
inherent to content-keyed matching; the `id` field is the escape hatch.
|
|
49
|
+
|
|
50
|
+
## Data model (score-centric, forward-wired)
|
|
51
|
+
|
|
52
|
+
Seven tables: `run`, `trace`, `span`, `judge_run`, `eval_score`, `dataset`, `dataset_item`.
|
|
53
|
+
`eval_score` is the hub linking a captured response to its number, grounding context, and judge
|
|
54
|
+
provenance. Three forward-looking choices cost nothing now and unblock the roadmap without
|
|
55
|
+
migrations:
|
|
56
|
+
|
|
57
|
+
- `eval_score.metric` is a column (second metric = INSERT, not ALTER).
|
|
58
|
+
- A captured response and a curated test case share row shape; `dataset_item.origin_trace_id` ↔
|
|
59
|
+
`trace.source_dataset_item_id` pre-wire "promote a bad production answer into a regression test"
|
|
60
|
+
as two INSERTs (roadmap v2).
|
|
61
|
+
- Contexts live on the trace (not only inside raw span attributes), so replay and promotion never
|
|
62
|
+
re-parse experimental span formats. Raw spans are retained verbatim as churn insurance.
|
|
63
|
+
|
|
64
|
+
## The calibration harness
|
|
65
|
+
|
|
66
|
+
The judge is an AI too, so it takes an exam: a 40-example hand-labeled golden set, stratified across
|
|
67
|
+
failure types (faithful paraphrases, partial support, date/entity swaps, negations, unsupported
|
|
68
|
+
additions). `faithgate calibrate` reports agreement-with-humans per judge, with n shown, and its
|
|
69
|
+
error surface is loud: per-sample timeouts, live progress, first-error printed, exit 2 when nothing
|
|
70
|
+
was actually judged. Measured results live in the README and are re-measurable by anyone with a key.
|
|
71
|
+
|
|
72
|
+
## Changed during build (deliberate descopes)
|
|
73
|
+
|
|
74
|
+
Recorded because pretending the original plan was built would violate the one principle:
|
|
75
|
+
|
|
76
|
+
- **Panel:** stdlib `http.server` instead of a web framework — this is what makes the base install
|
|
77
|
+
dependency-free.
|
|
78
|
+
- **DuckDB analytics layer:** cut; plain SQL over SQLite is sufficient at this scale.
|
|
79
|
+
- **Async scoring worker:** deferred; `run` scores synchronously after capture, and ingested spans are
|
|
80
|
+
scored via `faithgate score`.
|
|
81
|
+
- **OTLP wire format:** deferred; `/v1/spans` accepts OpenInference-shaped JSON and is documented as
|
|
82
|
+
such, not as OTLP compatibility.
|
|
83
|
+
- **Per-call hardening added after real-world testing:** RAGAS injects `temperature=0.01` per call
|
|
84
|
+
unless bypassed (Sonnet-5-class models reject non-default sampling params) and retries every
|
|
85
|
+
exception 10× with up to 60s waits — turning any persistent error into a multi-minute silent
|
|
86
|
+
hang. FaithGate bypasses the injection, caps retries, bounds every judge call with a timeout, and
|
|
87
|
+
surfaces the first real error instead.
|
|
88
|
+
|
|
89
|
+
## What proves the claims
|
|
90
|
+
|
|
91
|
+
Every capability sentence in the README maps to a test or CI job. The flagship: the `eval-gate`
|
|
92
|
+
workflow's **proves-detection** job scores a suite with planted hallucinations and *asserts the
|
|
93
|
+
gate fails* by inverting the exit code — if the gate ever loses its ability to catch a known
|
|
94
|
+
regression, CI itself goes red.
|
faithgate-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 albertofettucini
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
faithgate-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: faithgate
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fails your CI when your LLM app starts making things up — local-first faithfulness regression gate.
|
|
5
|
+
Project-URL: Homepage, https://github.com/albertofettucini/faithgate
|
|
6
|
+
Author: albertofettucini
|
|
7
|
+
License: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Keywords: ci,evaluation,faithfulness,llm,llm-as-judge,local-first,observability,rag,ragas,regression
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Provides-Extra: claude
|
|
12
|
+
Requires-Dist: langchain-anthropic>=1.0; extra == 'claude'
|
|
13
|
+
Requires-Dist: langchain-community<0.4,>=0.3; extra == 'claude'
|
|
14
|
+
Requires-Dist: ragas<0.5,>=0.4; extra == 'claude'
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
17
|
+
Provides-Extra: local
|
|
18
|
+
Requires-Dist: langchain-anthropic>=1.0; extra == 'local'
|
|
19
|
+
Requires-Dist: langchain-community<0.4,>=0.3; extra == 'local'
|
|
20
|
+
Requires-Dist: langchain-ollama>=0.2; extra == 'local'
|
|
21
|
+
Requires-Dist: ragas<0.5,>=0.4; extra == 'local'
|
|
22
|
+
Requires-Dist: sentence-transformers>=3.0; extra == 'local'
|
|
23
|
+
Requires-Dist: torch>=2.2; extra == 'local'
|
|
24
|
+
Requires-Dist: transformers<5,>=4.40; extra == 'local'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
<p align="center">
|
|
28
|
+
<img src="assets/logo.svg" width="110" alt="FaithGate — a pixel shield, half green half red, split by the gate">
|
|
29
|
+
</p>
|
|
30
|
+
|
|
31
|
+
# FaithGate
|
|
32
|
+
|
|
33
|
+
> **You changed a prompt. Did any answer quietly start making things up?**
|
|
34
|
+
>
|
|
35
|
+
> FaithGate scores every answer against its sources, diffs versions, and **fails CI on
|
|
36
|
+
> regression** — with a judge whose trustworthiness is *measured* (85%, n=40), never assumed.
|
|
37
|
+
> Zero infra, fully local, **pytest for prompts.**
|
|
38
|
+
|
|
39
|
+

|
|
40
|
+

|
|
41
|
+

|
|
42
|
+

|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## What it is
|
|
47
|
+
|
|
48
|
+
It captures your app's question → answer → retrieved-context turns, scores each for **faithfulness**
|
|
49
|
+
(is the answer supported by the context it was given?), and **fails the build when a new version
|
|
50
|
+
scores worse than the last**. Everything runs locally — traces never leave your machine.
|
|
51
|
+
|
|
52
|
+
**Zero telemetry.** FaithGate itself sends nothing anywhere: no analytics, no phoning home, no
|
|
53
|
+
account. The only network traffic is the API call to the judge **you** configured (plus a one-time
|
|
54
|
+
HuggingFace model download if you opt into the `[local]` HHEM extra).
|
|
55
|
+
|
|
56
|
+
## See it catch a regression (60 seconds, offline, no API key)
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
git clone https://github.com/albertofettucini/faithgate && cd faithgate
|
|
60
|
+
|
|
61
|
+
PYTHONPATH=. python3 examples/demo_gate.py
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
```
|
|
65
|
+
faithgate regression gate: FAIL ❌
|
|
66
|
+
|
|
67
|
+
matched 12 · 3 regressed · 0 improved · 9 unchanged · 0 abstained · 0 new · 0 missing
|
|
68
|
+
|
|
69
|
+
Regressions:
|
|
70
|
+
❌ How many moons does Earth have? 1.00 → 0.29 (below floor)
|
|
71
|
+
❌ When was Pluto reclassified as a dwarf planet? 1.00 → 0.12 (below floor)
|
|
72
|
+
❌ At what altitude does the ISS orbit? 0.90 → 0.20 (below floor)
|
|
73
|
+
|
|
74
|
+
CI exit code would be: 1
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
That's a real end-to-end run: a demo RAG suite where three answers started hallucinating, scored by
|
|
78
|
+
the offline judge, caught by the gate. No scripted numbers.
|
|
79
|
+
|
|
80
|
+
Drive it yourself:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
PYTHONPATH=. python3 -m faithgate.gate.cli --db demo.db run \
|
|
84
|
+
--suite examples/rag_app/suite_baseline.jsonl --label baseline --judge heuristic
|
|
85
|
+
PYTHONPATH=. python3 -m faithgate.gate.cli --db demo.db run \
|
|
86
|
+
--suite examples/rag_app/suite_regressed.jsonl --label regressed --judge heuristic
|
|
87
|
+
PYTHONPATH=. python3 -m faithgate.gate.cli --db demo.db gate --base baseline --head regressed
|
|
88
|
+
PYTHONPATH=. python3 -m faithgate.gate.cli --db demo.db up # browse it → http://127.0.0.1:7654
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
The **base install is stdlib-only**: capture, the offline judge, the gate, and the web panel need
|
|
92
|
+
zero dependencies. The real judge lives in an extra (below).
|
|
93
|
+
|
|
94
|
+
## Why it exists (the honest version)
|
|
95
|
+
|
|
96
|
+
The "simple + local + eval-first" lane is already well served:
|
|
97
|
+
|
|
98
|
+
| Tool | Reality |
|
|
99
|
+
|---|---|
|
|
100
|
+
| [promptfoo](https://github.com/promptfoo/promptfoo) | MIT, one-command, local, eval-first. Acquired by OpenAI (2026). |
|
|
101
|
+
| [Arize Phoenix](https://github.com/Arize-ai/phoenix) | `pip install` + SQLite, local, ships judge templates. Core is Elastic-2.0 (source-available). |
|
|
102
|
+
| [DeepEval](https://github.com/confident-ai/deepeval) | Apache-2.0, `pip install` + Ollama, 50+ metrics. |
|
|
103
|
+
|
|
104
|
+
**Why not just promptfoo?** You probably should use promptfoo — FaithGate is not a replacement
|
|
105
|
+
for your eval stack, it's a **complement** that does exactly one thing: fail-closed blocking of
|
|
106
|
+
*faithfulness* regressions, version-to-version, with honesty guarantees about the judge that
|
|
107
|
+
produced every number. Keep your existing suites and tools; FaithGate runs beside them as the
|
|
108
|
+
narrow tripwire for "did my app quietly start making things up." And the roadmap targets the one
|
|
109
|
+
thing hand-written suites can't do: promptfoo's golden sets are authored by you — FaithGate's v2
|
|
110
|
+
mines them **from your production traffic** (the schema already makes promotion two INSERTs).
|
|
111
|
+
|
|
112
|
+
FaithGate does **not** try to out-feature anyone. It targets the narrow seam:
|
|
113
|
+
|
|
114
|
+
1. **A regression *gate*, not a dashboard** — run the suite on every change, diff the scores, fail
|
|
115
|
+
the build. Fail-closed: zero matched cases or total abstention can never turn CI green.
|
|
116
|
+
2. **Radical honesty about the judge** — a frontier judge (Claude) by default; a *precisely labeled*
|
|
117
|
+
local mode; a measured judge-vs-human agreement number; abstention instead of fake zeros; and a
|
|
118
|
+
judge swap between runs is **flagged and blocked**, never mistaken for a model regression.
|
|
119
|
+
3. **Truly zero-infra** — the base tool is Python stdlib only. No Postgres, no ClickHouse, no
|
|
120
|
+
Docker, no Node, no web framework. The one metric library (RAGAS) is an opt-in extra.
|
|
121
|
+
|
|
122
|
+
## Real scoring with Claude
|
|
123
|
+
|
|
124
|
+
The trusted default judge is Claude, using **your own** API key (never hardcoded, `.env` is
|
|
125
|
+
gitignored):
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
# needs Python 3.10+ — on stock macOS (3.9) the easiest path is uv:
|
|
129
|
+
# brew install uv && uv venv --python 3.12 .venv && source .venv/bin/activate
|
|
130
|
+
uv pip install -e ".[claude]" # or: python3 -m pip install --upgrade pip && pip install -e ".[claude]"
|
|
131
|
+
export ANTHROPIC_API_KEY=sk-ant-... # your own key, read from the environment only
|
|
132
|
+
faithgate run --suite my_suite.jsonl --label v2 --judge claude
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
A suite is JSONL, one turn per line (`id` is optional — it pins the case's identity so a reworded
|
|
136
|
+
question keeps its baseline instead of being treated as a new case):
|
|
137
|
+
|
|
138
|
+
```json
|
|
139
|
+
{"id": "capital-q", "question": "...", "answer": "...", "contexts": ["retrieved chunk 1", "..."]}
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Honest judging
|
|
143
|
+
|
|
144
|
+
FaithGate never presents a score as more certain than it is.
|
|
145
|
+
|
|
146
|
+
- **Frontier judge by default.** Small local models are weak judges — an 8B model scores ~61% on
|
|
147
|
+
faithfulness benchmarks. So Claude is the trusted default; everything else is a labeled trade-off.
|
|
148
|
+
- **`local-verification` is labeled precisely.** `--judge claude-local` runs the entailment check
|
|
149
|
+
on-device (Vectara HHEM, via `faithgate[local]`) but still uses Claude for claim extraction. It
|
|
150
|
+
is **not** "fully offline", and the tool says so. Its cost is measured, not guessed: on-device
|
|
151
|
+
verification trades ~8 points of balanced agreement vs the full Claude judge (see the table).
|
|
152
|
+
- **The offline judge is deliberately distrusted.** `--judge heuristic` is a token-overlap proxy so
|
|
153
|
+
the whole pipeline runs keyless. Its measured weakness — it cannot see contradictions built from
|
|
154
|
+
the context's own words — is **asserted by a unit test** (`test_heuristic_misses_contradiction`),
|
|
155
|
+
not hidden in a footnote.
|
|
156
|
+
- **Abstention, not fake zeros.** When the judge can't score, the result is `abstained` — excluded
|
|
157
|
+
from the gate math and reported separately. If *everything* abstained or errored, the gate and the
|
|
158
|
+
CLI **fail loudly** instead of laundering a broken judge into a green run.
|
|
159
|
+
- **Judge changes are flagged.** Every run records a manifest (judge id/model/kind, RAGAS version,
|
|
160
|
+
runner version, suite hash). `gate` refuses to compare runs whose judges differ (exit 3) unless
|
|
161
|
+
you pass `--allow-judge-change`.
|
|
162
|
+
- **Measured agreement, sample size shown.** `faithgate calibrate` runs the judge over a 40-example
|
|
163
|
+
hand-labeled golden set (faithful paraphrases, partial support, date/entity swaps, negations,
|
|
164
|
+
unsupported additions):
|
|
165
|
+
|
|
166
|
+
| judge | agreement (balanced) | faithful kept | unfaithful caught |
|
|
167
|
+
|---|---|---|---|
|
|
168
|
+
| `claude` (default, claude-sonnet-5) | **85%** *(n=40 — directional)* | 20/20 | 14/20 |
|
|
169
|
+
| `claude-local` (HHEM verify on-device) | 77% *(n=40 — directional)* | 19/20 | 12/20 |
|
|
170
|
+
| `heuristic` (offline proxy) | 68% *(n=40 — directional)* | 18/20 | 9/20 |
|
|
171
|
+
|
|
172
|
+
Even the trusted judge is not an oracle — 85% agreement is consistent with published
|
|
173
|
+
frontier-judge benchmarks, and that's exactly why the number is measured and shown instead
|
|
174
|
+
of assumed. Re-measure against your own key anytime: `faithgate calibrate --judge claude`.
|
|
175
|
+
|
|
176
|
+
## In CI
|
|
177
|
+
|
|
178
|
+
`.github/workflows/eval-gate.yml` runs **two** offline, deterministic jobs on every push/PR:
|
|
179
|
+
|
|
180
|
+
- **gate** — baseline vs candidate must stay green (the normal PR gate).
|
|
181
|
+
- **proves-detection** — baseline vs a suite with *planted hallucinations* must go **red**; the job
|
|
182
|
+
inverts the exit code. If the gate ever loses its ability to catch a known regression, **this job
|
|
183
|
+
fails the build.** A green badge here isn't decoration — it's continuously re-proven detection.
|
|
184
|
+
|
|
185
|
+
Both jobs post the score-diff table to the Actions run summary.
|
|
186
|
+
|
|
187
|
+
## How it works
|
|
188
|
+
|
|
189
|
+
```
|
|
190
|
+
your app ──spans──► POST /v1/spans ─┐   (or: faithgate.capture(...) / faithgate run --suite)
|
|
191
|
+
                                    ├─► SQLite (WAL) ──► scoring (RAGAS or offline proxy)
|
|
192
|
+
ingest adapter (one file,           │        │                    │
|
|
193
|
+
isolates experimental semconv)      │        ▼                    ▼
|
|
194
|
+
                                    │    web panel   version-to-version diff (by content key)
|
|
195
|
+
                                    └────────┴────────────────────┴──► gate: pass / fail / not-comparable
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
- **Capture:** ingest OpenInference-shaped JSON spans, call `faithgate.capture()` directly, or feed
|
|
199
|
+
a suite file. Every trace can carry a `prompt_version_id` (see `faithgate.keys.version_key`) — the
|
|
200
|
+
join key for the replay/drift layers on the [roadmap](ROADMAP.md).
|
|
201
|
+
- **Score:** [RAGAS](https://github.com/explodinggradients/ragas) computes faithfulness — embedded,
|
|
202
|
+
not re-implemented. A CI contract-test job imports the exact RAGAS surface we use, so an
|
|
203
|
+
incompatible release breaks loudly.
|
|
204
|
+
- **Store:** SQLite + WAL, one file, zero server.
|
|
205
|
+
- **Gate:** runs are matched by content (never row id); new, missing, duplicate, and abstained cases
|
|
206
|
+
are always visible in the report.
|
|
207
|
+
|
|
208
|
+
## Design notes (the non-obvious parts)
|
|
209
|
+
|
|
210
|
+
Full rationale — locked decisions, fail-closed semantics, schema pre-wiring, deliberate descopes —
|
|
211
|
+
lives in [DESIGN.md](DESIGN.md). Highlights:
|
|
212
|
+
|
|
213
|
+
- **We show the score, not a faked per-claim breakdown.** RAGAS only exposes a scalar publicly;
|
|
214
|
+
scraping its internals would be a fragile reimplementation. v1 shows the score + a one-line
|
|
215
|
+
reason and stays honest. (The per-claim path returns via RAGAS's newer API — see ROADMAP v3.)
|
|
216
|
+
- **"Local" can't mean "no LLM."** RAGAS's HHEM variant still needs an LLM to *extract* claims — so
|
|
217
|
+
faithgate ships `local-verification` (HHEM verifies, Claude extracts) and refuses to advertise a
|
|
218
|
+
keyless trusted mode it can't honestly deliver.
|
|
219
|
+
- **Fail-closed gate semantics.** The gate's verdict is a function of (matched, regressed, abstained,
|
|
220
|
+
new, missing) — not just "any regressions?". A renamed suite, a broken judge, or 100% abstention
|
|
221
|
+
fails the gate instead of passing vacuously.
|
|
222
|
+
- **Case identity is content-based by default.** Rewording a question mints a new case (only the
|
|
223
|
+
score floor guards it, not the delta). If your suite evolves, give cases a stable `id` — that
|
|
224
|
+
pins identity across rewordings.
|
|
225
|
+
|
|
226
|
+
## Commands
|
|
227
|
+
|
|
228
|
+
| Command | What it does |
|
|
229
|
+
|---|---|
|
|
230
|
+
| `faithgate run --suite S --label L [--judge J]` | score a suite of answers into a named run |
|
|
231
|
+
| `faithgate gate --base A --head B [--allow-judge-change]` | compare two runs; exit non-zero on regression, nothing compared, or a judge change |
|
|
232
|
+
| `faithgate runs` | list captured runs |
|
|
233
|
+
| `faithgate show --run R` | show a run's scored cases |
|
|
234
|
+
| `faithgate score [--judge J] [--run R] [--retry-errors]` | score pending traces; optionally re-score errored ones |
|
|
235
|
+
| `faithgate calibrate [--judge]` | judge agreement with the human-labeled golden set |
|
|
236
|
+
| `faithgate up` | start the local web panel |
|
|
237
|
+
|
|
238
|
+
Judges: `claude` (default, needs key + `[claude]` extra) · `claude-local` (HHEM verifies on-device via the `[local]` extra; still needs a key for Claude claim extraction) ·
|
|
239
|
+
`heuristic` (offline, zero deps).
|
|
240
|
+
Exit codes: `0` ok · `1` gate failed (regression / nothing compared) · `2` usage or input error ·
|
|
241
|
+
`3` judge changed between runs.
|
|
242
|
+
|
|
243
|
+
## Roadmap
|
|
244
|
+
|
|
245
|
+
See [ROADMAP.md](ROADMAP.md) — v2 auto-eval generation (bad production answers become regression
|
|
246
|
+
tests), v2.5 CI gates over the mined dataset, v3 prescriptive feedback + failure clustering, then
|
|
247
|
+
replay / drift / cost-quality reports. The v1 schema already carries the join keys they need.
|
|
248
|
+
|
|
249
|
+
## License
|
|
250
|
+
|
|
251
|
+
MIT — see [LICENSE](LICENSE).
|