evidentia-eval 0.10.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evidentia_eval-0.10.5/.gitignore +148 -0
- evidentia_eval-0.10.5/PKG-INFO +112 -0
- evidentia_eval-0.10.5/README.md +87 -0
- evidentia_eval-0.10.5/pyproject.toml +49 -0
- evidentia_eval-0.10.5/src/evidentia_eval/__init__.py +131 -0
- evidentia_eval-0.10.5/src/evidentia_eval/claim_extraction.py +179 -0
- evidentia_eval-0.10.5/src/evidentia_eval/faithfulness.py +475 -0
- evidentia_eval-0.10.5/src/evidentia_eval/faithfulness_semantic.py +230 -0
- evidentia_eval-0.10.5/src/evidentia_eval/harness.py +502 -0
- evidentia_eval-0.10.5/src/evidentia_eval/metrics.py +166 -0
- evidentia_eval-0.10.5/src/evidentia_eval/py.typed +0 -0
- evidentia_eval-0.10.5/src/evidentia_eval/seeds.py +82 -0
- evidentia_eval-0.10.5/src/evidentia_eval/signing.py +176 -0
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# Secrets & credential stores (belt-and-suspenders safety net; the
|
|
2
|
+
# user's working directory MUST keep secrets in C:\Users\<user>\.secrets\
|
|
3
|
+
# and never inside the repo, but these patterns block accidental adds).
|
|
4
|
+
# .env.example and .env.template are un-ignored intentionally — they document the expected
|
|
5
|
+
# variable names for new contributors.
|
|
6
|
+
.env
|
|
7
|
+
.env.*
|
|
8
|
+
!.env.example
|
|
9
|
+
!.env.template
|
|
10
|
+
*.pem
|
|
11
|
+
*.key
|
|
12
|
+
*.crt
|
|
13
|
+
*.p12
|
|
14
|
+
*.pfx
|
|
15
|
+
secrets/
|
|
16
|
+
credentials.json
|
|
17
|
+
|
|
18
|
+
# v0.4.0 — frontend build output lands in the Python package's static
|
|
19
|
+
# directory at wheel-assembly time via the hatchling build hook. The
|
|
20
|
+
# .gitkeep file in static/ is tracked; everything else is regenerated.
|
|
21
|
+
packages/evidentia-api/src/evidentia_api/static/assets/
|
|
22
|
+
packages/evidentia-api/src/evidentia_api/static/index.html
|
|
23
|
+
packages/evidentia-api/src/evidentia_api/static/*.js
|
|
24
|
+
packages/evidentia-api/src/evidentia_api/static/*.css
|
|
25
|
+
|
|
26
|
+
# Python
|
|
27
|
+
__pycache__/
|
|
28
|
+
*.py[cod]
|
|
29
|
+
*$py.class
|
|
30
|
+
*.so
|
|
31
|
+
.Python
|
|
32
|
+
build/
|
|
33
|
+
develop-eggs/
|
|
34
|
+
dist/
|
|
35
|
+
downloads/
|
|
36
|
+
eggs/
|
|
37
|
+
.eggs/
|
|
38
|
+
lib/
|
|
39
|
+
lib64/
|
|
40
|
+
parts/
|
|
41
|
+
sdist/
|
|
42
|
+
var/
|
|
43
|
+
wheels/
|
|
44
|
+
# NB: `lib/` and `lib64/` above would otherwise also match
|
|
45
|
+
# packages/evidentia-ui/src/lib/ (TypeScript utils). The negation below
|
|
46
|
+
# re-includes that one directory; no real Python-venv lib/ is missed because
|
|
47
|
+
# .venv/ and venv/ below cover that case.
|
|
48
|
+
!packages/evidentia-ui/src/lib/
|
|
49
|
+
*.egg-info/
|
|
50
|
+
.installed.cfg
|
|
51
|
+
*.egg
|
|
52
|
+
MANIFEST
|
|
53
|
+
|
|
54
|
+
# Virtual environments
|
|
55
|
+
.venv/
|
|
56
|
+
venv/
|
|
57
|
+
ENV/
|
|
58
|
+
env/
|
|
59
|
+
|
|
60
|
+
# uv
|
|
61
|
+
# NOTE: uv.lock is committed for reproducible builds.
|
|
62
|
+
# https://docs.astral.sh/uv/concepts/projects/sync/#locking-dependencies
|
|
63
|
+
|
|
64
|
+
# Testing
|
|
65
|
+
.pytest_cache/
|
|
66
|
+
.coverage
|
|
67
|
+
.coverage.*
|
|
68
|
+
htmlcov/
|
|
69
|
+
.tox/
|
|
70
|
+
.cache
|
|
71
|
+
coverage.xml
|
|
72
|
+
*.cover
|
|
73
|
+
.hypothesis/
|
|
74
|
+
|
|
75
|
+
# mypy
|
|
76
|
+
.mypy_cache/
|
|
77
|
+
.dmypy.json
|
|
78
|
+
dmypy.json
|
|
79
|
+
|
|
80
|
+
# Ruff
|
|
81
|
+
.ruff_cache/
|
|
82
|
+
|
|
83
|
+
# Supply-chain scan artifact — the CycloneDX SBOM is regenerated by
|
|
84
|
+
# scripts/run_osv_scan.py and by release.yml; it is never committed.
|
|
85
|
+
evidentia-sbom.cdx.json
|
|
86
|
+
# Step 7 post-tag verification artifacts: SBOMs downloaded from the
|
|
87
|
+
# GitHub Release attached assets for osv-scan re-verification. The
|
|
88
|
+
# canonical SBOM lives on the published release, not in the repo.
|
|
89
|
+
published-sbom*.cdx.json
|
|
90
|
+
|
|
91
|
+
# IDE
|
|
92
|
+
# .vscode/ is ignored by default but the canonical shared workspace
|
|
93
|
+
# config files are version-controlled (see docs/ide-setup.md). Per-developer
|
|
94
|
+
# overrides (.vscode/*.local.json, scratch files, etc.) stay ignored.
|
|
95
|
+
.vscode/*
|
|
96
|
+
!.vscode/settings.json
|
|
97
|
+
!.vscode/launch.json
|
|
98
|
+
!.vscode/tasks.json
|
|
99
|
+
!.vscode/extensions.json
|
|
100
|
+
.idea/
|
|
101
|
+
|
|
102
|
+
# .cursor/ is the per-developer private Cursor workspace directory
|
|
103
|
+
# (project rules under .cursor/rules/*.mdc, MCP server configs, and any
|
|
104
|
+
# other Cursor IDE state). The public-facing Cursor conventions live at
|
|
105
|
+
# the repo-root .cursorrules file (already version-controlled). The
|
|
106
|
+
# .cursor/ directory is for per-developer extensions that should not be
|
|
107
|
+
# committed.
|
|
108
|
+
.cursor/
|
|
109
|
+
|
|
110
|
+
# Local-only / per-developer scratch directory for working notes,
|
|
111
|
+
# drafts, and anything not ready to share. The convention follows the
|
|
112
|
+
# .vscode/ split: ignore the whole directory by default; un-ignore
|
|
113
|
+
# specific files only if they're meant to be shared across the team.
|
|
114
|
+
.local/
|
|
115
|
+
|
|
116
|
+
# Private competitive-strategy / market-research working docs. The repo
|
|
117
|
+
# is public (Polycentric-Labs/evidentia); strategy material naming
|
|
118
|
+
# specific competitor-feature adoption decisions or commercial
|
|
119
|
+
# positioning stays out of the public tree. Neutral landscape analysis
|
|
120
|
+
# lives in docs/ (e.g. docs/positioning-and-value.md) and IS tracked.
|
|
121
|
+
/private/
|
|
122
|
+
*.swp
|
|
123
|
+
*.swo
|
|
124
|
+
*~
|
|
125
|
+
.DS_Store
|
|
126
|
+
|
|
127
|
+
# Claude Code local state
|
|
128
|
+
.claude/
|
|
129
|
+
|
|
130
|
+
# Evidentia runtime — user project state (NOT bundled examples).
|
|
131
|
+
# `.controlbridge/` and `/controlbridge.yaml` remain ignored as a courtesy
|
|
132
|
+
# for legacy project workspaces (v0.1.0 – v0.5.0) so files generated by
|
|
133
|
+
# pre-rename code don't leak into git when those projects are migrated.
|
|
134
|
+
.evidentia/
|
|
135
|
+
.controlbridge/
|
|
136
|
+
/evidentia.yaml
|
|
137
|
+
/controlbridge.yaml
|
|
138
|
+
*.local.yaml
|
|
139
|
+
evidence/
|
|
140
|
+
reports/
|
|
141
|
+
risks/
|
|
142
|
+
|
|
143
|
+
# Generated reports from examples (keep source files, ignore generated ones)
|
|
144
|
+
examples/**/report.json
|
|
145
|
+
examples/**/report.csv
|
|
146
|
+
examples/**/report.md
|
|
147
|
+
examples/**/report.oscal.json
|
|
148
|
+
examples/**/risks.json
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: evidentia-eval
|
|
3
|
+
Version: 0.10.5
|
|
4
|
+
Summary: DFAH (Decision-Faithfulness Assessment Harness) determinism + faithfulness eval harness for Evidentia — dev-time AI-output quality gates
|
|
5
|
+
Project-URL: Homepage, https://github.com/polycentric-labs/evidentia
|
|
6
|
+
Project-URL: Repository, https://github.com/polycentric-labs/evidentia
|
|
7
|
+
Project-URL: Issues, https://github.com/polycentric-labs/evidentia/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/polycentric-labs/evidentia/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Allen Byrd <allen@allenfbyrd.com>
|
|
10
|
+
License-Expression: Apache-2.0
|
|
11
|
+
Keywords: ai-quality,compliance,determinism,dfah,faithfulness,grc,llm-eval
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Information Technology
|
|
14
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Typing :: Typed
|
|
19
|
+
Requires-Python: >=3.12
|
|
20
|
+
Requires-Dist: evidentia-core<0.11.0,>=0.10.5
|
|
21
|
+
Provides-Extra: faithfulness-semantic
|
|
22
|
+
Requires-Dist: numpy>=1.26; extra == 'faithfulness-semantic'
|
|
23
|
+
Requires-Dist: sentence-transformers>=3.0; extra == 'faithfulness-semantic'
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# evidentia-eval
|
|
27
|
+
|
|
28
|
+
Dev-time AI-output quality eval harness for Evidentia.
|
|
29
|
+
|
|
30
|
+
Hosts the **DFAH (Decision-Faithfulness Assessment Harness)** —
|
|
31
|
+
the auditor-defensible numerical proof layer that validates
|
|
32
|
+
LLM-driven artifact production is deterministic, replay-
|
|
33
|
+
equivalent, and faithful to its source policy clauses.
|
|
34
|
+
|
|
35
|
+
## Why this package exists (v0.10.5 P9 extraction)
|
|
36
|
+
|
|
37
|
+
The DFAH harness was originally bundled into `evidentia-ai` (the
|
|
38
|
+
risk-statement generator + control explainer package). That
|
|
39
|
+
conflated two very different deployment surfaces:
|
|
40
|
+
|
|
41
|
+
- **`evidentia-ai`** — PRODUCTION runtime. Needed in air-gap
|
|
42
|
+
installs to actually generate risk statements.
|
|
43
|
+
- **`evidentia-eval`** — DEVELOPMENT-time evaluation. NOT needed
|
|
44
|
+
in air-gap installs; only fires when a CI pipeline runs a
|
|
45
|
+
determinism / faithfulness gate before tagging a release.
|
|
46
|
+
|
|
47
|
+
Extracting the eval harness lets air-gap installs of
|
|
48
|
+
`evidentia-ai` skip the optional sentence-transformers stack
|
|
49
|
+
entirely (it now lives behind `evidentia-eval[faithfulness-semantic]`
|
|
50
|
+
instead of `evidentia-ai[eval-faithfulness]`).
|
|
51
|
+
|
|
52
|
+
## Quick start
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# Stdlib Jaccard baseline (no extra needed; <10 MB install)
|
|
56
|
+
pip install evidentia-eval
|
|
57
|
+
|
|
58
|
+
# Optional semantic-similarity faithfulness (~250 MB extra
|
|
59
|
+
# for sentence-transformers + numpy + model cache on first use)
|
|
60
|
+
pip install 'evidentia-eval[faithfulness-semantic]'
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
CLI verbs:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
# Smoke test against a deterministic stub generator (no LLM
|
|
67
|
+
# tokens burned)
|
|
68
|
+
evidentia eval stub-smoke
|
|
69
|
+
|
|
70
|
+
# Real-LLM determinism gate against the risk-statement generator
|
|
71
|
+
evidentia eval risk-determinism --gap-report gaps.json \
|
|
72
|
+
--system-context ctx.yaml \
|
|
73
|
+
--fail-on-determinism-rate-below 0.95
|
|
74
|
+
|
|
75
|
+
# Verify a previously-signed eval bundle
|
|
76
|
+
evidentia eval verify path/to/eval-output.json
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
The CLI verbs live in `evidentia.cli.eval` (the meta-package);
|
|
80
|
+
this package contributes the underlying library.
|
|
81
|
+
|
|
82
|
+
## Public API
|
|
83
|
+
|
|
84
|
+
| Symbol | Purpose |
|
|
85
|
+
|---|---|
|
|
86
|
+
| `DFAHarness` | Owns the run loop + audit emit |
|
|
87
|
+
| `EvalResult` | Top-level harness output (JSON-serializable, Sigstore-signable) |
|
|
88
|
+
| `EvalSample` | One prompt's inputs (immutable; audit-trail-stable) |
|
|
89
|
+
| `DeterminismResult` | Per-prompt determinism outcome |
|
|
90
|
+
| `ReplayResult` | Per-prompt replay-equivalence outcome |
|
|
91
|
+
| `FaithfulnessResult` | Per-claim faithfulness outcome |
|
|
92
|
+
| `PromptFaithfulnessResult` | Aggregated per-prompt faithfulness |
|
|
93
|
+
| `faithfulness_score` | Stdlib Jaccard token-overlap baseline |
|
|
94
|
+
| `faithfulness_score_semantic` | Sentence-transformers path (optional extra) |
|
|
95
|
+
| `determinism_score` | Computes the modal-output pass rate |
|
|
96
|
+
| `replay_equivalent` | Binary replay-equivalence check |
|
|
97
|
+
| `extract_claims` | Atomic-claim extraction from generated artifacts |
|
|
98
|
+
| `normalize_for_determinism` | Canonical whitespace + punctuation normalization |
|
|
99
|
+
| `hash_output` | SHA-256 hex of normalized output |
|
|
100
|
+
| `sign_eval_result` | Sigstore-sign an `EvalResult` JSON |
|
|
101
|
+
| `verify_eval_result` | Verify a previously-signed eval bundle |
|
|
102
|
+
|
|
103
|
+
## Backward-compat shim
|
|
104
|
+
|
|
105
|
+
For external scripts that still import `from evidentia_ai.eval
|
|
106
|
+
import ...`, `evidentia-ai` ships a deprecation shim that
|
|
107
|
+
re-exports from `evidentia_eval`. The shim warns once at import
|
|
108
|
+
time and is scheduled for removal in **v0.12.0**.
|
|
109
|
+
|
|
110
|
+
## License
|
|
111
|
+
|
|
112
|
+
Apache-2.0. See the workspace root LICENSE file.
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# evidentia-eval
|
|
2
|
+
|
|
3
|
+
Dev-time AI-output quality eval harness for Evidentia.
|
|
4
|
+
|
|
5
|
+
Hosts the **DFAH (Decision-Faithfulness Assessment Harness)** —
|
|
6
|
+
the auditor-defensible numerical proof layer that validates
|
|
7
|
+
LLM-driven artifact production is deterministic, replay-
|
|
8
|
+
equivalent, and faithful to its source policy clauses.
|
|
9
|
+
|
|
10
|
+
## Why this package exists (v0.10.5 P9 extraction)
|
|
11
|
+
|
|
12
|
+
The DFAH harness was originally bundled into `evidentia-ai` (the
|
|
13
|
+
risk-statement generator + control explainer package). That
|
|
14
|
+
conflated two very different deployment surfaces:
|
|
15
|
+
|
|
16
|
+
- **`evidentia-ai`** — PRODUCTION runtime. Needed in air-gap
|
|
17
|
+
installs to actually generate risk statements.
|
|
18
|
+
- **`evidentia-eval`** — DEVELOPMENT-time evaluation. NOT needed
|
|
19
|
+
in air-gap installs; only fires when a CI pipeline runs a
|
|
20
|
+
determinism / faithfulness gate before tagging a release.
|
|
21
|
+
|
|
22
|
+
Extracting the eval harness lets air-gap installs of
|
|
23
|
+
`evidentia-ai` skip the optional sentence-transformers stack
|
|
24
|
+
entirely (it now lives behind `evidentia-eval[faithfulness-semantic]`
|
|
25
|
+
instead of `evidentia-ai[eval-faithfulness]`).
|
|
26
|
+
|
|
27
|
+
## Quick start
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
# Stdlib Jaccard baseline (no extra needed; <10 MB install)
|
|
31
|
+
pip install evidentia-eval
|
|
32
|
+
|
|
33
|
+
# Optional semantic-similarity faithfulness (~250 MB extra
|
|
34
|
+
# for sentence-transformers + numpy + model cache on first use)
|
|
35
|
+
pip install 'evidentia-eval[faithfulness-semantic]'
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
CLI verbs:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# Smoke test against a deterministic stub generator (no LLM
|
|
42
|
+
# tokens burned)
|
|
43
|
+
evidentia eval stub-smoke
|
|
44
|
+
|
|
45
|
+
# Real-LLM determinism gate against the risk-statement generator
|
|
46
|
+
evidentia eval risk-determinism --gap-report gaps.json \
|
|
47
|
+
--system-context ctx.yaml \
|
|
48
|
+
--fail-on-determinism-rate-below 0.95
|
|
49
|
+
|
|
50
|
+
# Verify a previously-signed eval bundle
|
|
51
|
+
evidentia eval verify path/to/eval-output.json
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
The CLI verbs live in `evidentia.cli.eval` (the meta-package);
|
|
55
|
+
this package contributes the underlying library.
|
|
56
|
+
|
|
57
|
+
## Public API
|
|
58
|
+
|
|
59
|
+
| Symbol | Purpose |
|
|
60
|
+
|---|---|
|
|
61
|
+
| `DFAHarness` | Owns the run loop + audit emit |
|
|
62
|
+
| `EvalResult` | Top-level harness output (JSON-serializable, Sigstore-signable) |
|
|
63
|
+
| `EvalSample` | One prompt's inputs (immutable; audit-trail-stable) |
|
|
64
|
+
| `DeterminismResult` | Per-prompt determinism outcome |
|
|
65
|
+
| `ReplayResult` | Per-prompt replay-equivalence outcome |
|
|
66
|
+
| `FaithfulnessResult` | Per-claim faithfulness outcome |
|
|
67
|
+
| `PromptFaithfulnessResult` | Aggregated per-prompt faithfulness |
|
|
68
|
+
| `faithfulness_score` | Stdlib Jaccard token-overlap baseline |
|
|
69
|
+
| `faithfulness_score_semantic` | Sentence-transformers path (optional extra) |
|
|
70
|
+
| `determinism_score` | Computes the modal-output pass rate |
|
|
71
|
+
| `replay_equivalent` | Binary replay-equivalence check |
|
|
72
|
+
| `extract_claims` | Atomic-claim extraction from generated artifacts |
|
|
73
|
+
| `normalize_for_determinism` | Canonical whitespace + punctuation normalization |
|
|
74
|
+
| `hash_output` | SHA-256 hex of normalized output |
|
|
75
|
+
| `sign_eval_result` | Sigstore-sign an `EvalResult` JSON |
|
|
76
|
+
| `verify_eval_result` | Verify a previously-signed eval bundle |
|
|
77
|
+
|
|
78
|
+
## Backward-compat shim
|
|
79
|
+
|
|
80
|
+
For external scripts that still import `from evidentia_ai.eval
|
|
81
|
+
import ...`, `evidentia-ai` ships a deprecation shim that
|
|
82
|
+
re-exports from `evidentia_eval`. The shim warns once at import
|
|
83
|
+
time and is scheduled for removal in **v0.12.0**.
|
|
84
|
+
|
|
85
|
+
## License
|
|
86
|
+
|
|
87
|
+
Apache-2.0. See the workspace root LICENSE file.
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "evidentia-eval"
|
|
3
|
+
version = "0.10.5"
|
|
4
|
+
description = "DFAH (Decision-Faithfulness Assessment Harness) determinism + faithfulness eval harness for Evidentia — dev-time AI-output quality gates"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [{name = "Allen Byrd", email = "allen@allenfbyrd.com"}]
|
|
7
|
+
license = "Apache-2.0"
|
|
8
|
+
requires-python = ">=3.12"
|
|
9
|
+
keywords = ["grc", "compliance", "llm-eval", "dfah", "determinism", "faithfulness", "ai-quality"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 3 - Alpha",
|
|
12
|
+
"Intended Audience :: Information Technology",
|
|
13
|
+
"License :: OSI Approved :: Apache Software License",
|
|
14
|
+
"Operating System :: OS Independent",
|
|
15
|
+
"Programming Language :: Python :: 3.12",
|
|
16
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
17
|
+
"Typing :: Typed",
|
|
18
|
+
]
|
|
19
|
+
dependencies = [
|
|
20
|
+
"evidentia-core>=0.10.5,<0.11.0",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[project.optional-dependencies]
|
|
24
|
+
# v0.10.5 P9 extraction: the v0.8.3 P1.1 sentence-transformers
|
|
25
|
+
# faithfulness path lives on evidentia-eval now. Same opt-in shape:
|
|
26
|
+
# ~90 MB model download (default all-MiniLM-L6-v2) on first use;
|
|
27
|
+
# cached at ~/.cache/huggingface/. Operators relying on the stdlib
|
|
28
|
+
# Jaccard baseline are unaffected (no extra needed). Heavyweight;
|
|
29
|
+
# intentionally extra-gated.
|
|
30
|
+
faithfulness-semantic = [
|
|
31
|
+
"sentence-transformers>=3.0",
|
|
32
|
+
"numpy>=1.26",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/polycentric-labs/evidentia"
|
|
37
|
+
Repository = "https://github.com/polycentric-labs/evidentia"
|
|
38
|
+
Issues = "https://github.com/polycentric-labs/evidentia/issues"
|
|
39
|
+
Changelog = "https://github.com/polycentric-labs/evidentia/blob/main/CHANGELOG.md"
|
|
40
|
+
|
|
41
|
+
[build-system]
|
|
42
|
+
requires = ["hatchling"]
|
|
43
|
+
build-backend = "hatchling.build"
|
|
44
|
+
|
|
45
|
+
[tool.hatch.build.targets.wheel]
|
|
46
|
+
packages = ["src/evidentia_eval"]
|
|
47
|
+
|
|
48
|
+
[tool.uv.sources]
|
|
49
|
+
evidentia-core = { workspace = true }
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""evidentia-eval — DFAH determinism + faithfulness harness for Evidentia.
|
|
2
|
+
|
|
3
|
+
Decision-Faithfulness Assessment Harness per arXiv 2601.15322.
|
|
4
|
+
Validates that risk-statement generation (or any AI-driven
|
|
5
|
+
artifact production) is auditor-defensibly stable: same input +
|
|
6
|
+
same model + same temperature produces the same output, and a
|
|
7
|
+
re-run with pinned ``(input, model, temperature, prompt_hash,
|
|
8
|
+
run_id)`` is byte-equivalent to the original.
|
|
9
|
+
|
|
10
|
+
Three metrics ship:
|
|
11
|
+
|
|
12
|
+
- **Decision determinism** — same prompt produces the same
|
|
13
|
+
normalized output across N samples. The pass rate is the
|
|
14
|
+
fraction of samples that match the modal output (modulo
|
|
15
|
+
whitespace + punctuation normalization). Reported as a 0..1
|
|
16
|
+
score; CI-gateable via
|
|
17
|
+
``evidentia eval risk-determinism --fail-on-determinism-rate-below 0.95``.
|
|
18
|
+
- **Replay equivalence** — re-running with a pinned context
|
|
19
|
+
(``GenerationContext`` instance) produces an output whose
|
|
20
|
+
SHA-256 hash matches the original. Either the run is replay-
|
|
21
|
+
equivalent or it isn't — there is no graceful degradation.
|
|
22
|
+
- **Faithfulness** — do the atomic claims in a generated artifact
|
|
23
|
+
trace back to source policy clauses? Stdlib Jaccard baseline
|
|
24
|
+
(always available) + optional sentence-transformers semantic
|
|
25
|
+
path (``[faithfulness-semantic]`` extra).
|
|
26
|
+
|
|
27
|
+
Public API:
|
|
28
|
+
|
|
29
|
+
- :class:`DFAHarness` — owns the run loop + audit emit.
|
|
30
|
+
- :class:`DeterminismResult` — Pydantic model summarizing one
|
|
31
|
+
prompt's determinism outcome (modal output + pass rate +
|
|
32
|
+
per-sample hashes).
|
|
33
|
+
- :class:`ReplayResult` — Pydantic model summarizing replay-
|
|
34
|
+
equivalence for a single ``GenerationContext`` re-run.
|
|
35
|
+
- :class:`EvalResult` — top-level harness output covering all
|
|
36
|
+
prompts in one ``run_id``.
|
|
37
|
+
- :class:`FaithfulnessResult` — per-claim faithfulness outcome.
|
|
38
|
+
- :class:`PromptFaithfulnessResult` — aggregated per-prompt
|
|
39
|
+
faithfulness outcome.
|
|
40
|
+
- :func:`faithfulness_score` — stdlib Jaccard token-overlap
|
|
41
|
+
baseline.
|
|
42
|
+
- :func:`faithfulness_score_semantic` — sentence-transformers
|
|
43
|
+
semantic-similarity path (opt-in extra).
|
|
44
|
+
- :func:`extract_claims` — atomic-claim extraction from generated
|
|
45
|
+
artifacts.
|
|
46
|
+
- :func:`normalize_for_determinism` — canonical normalization
|
|
47
|
+
(whitespace + punctuation) used by the determinism check.
|
|
48
|
+
- :func:`hash_output` — SHA-256 hex of normalized output.
|
|
49
|
+
- :func:`sign_eval_result` / :func:`verify_eval_result` —
|
|
50
|
+
Sigstore-sign + verify the eval output.
|
|
51
|
+
|
|
52
|
+
The harness is generator-agnostic: it accepts any callable
|
|
53
|
+
``(prompt: str, context: GenerationContext) -> str`` so the
|
|
54
|
+
same machinery validates risk statements, control
|
|
55
|
+
explanations, future PRT-traced outputs, and any third-party
|
|
56
|
+
plugin's AI-generated artifacts. Unit tests use a deterministic
|
|
57
|
+
fake generator; live operator runs wire in
|
|
58
|
+
``evidentia_ai.risk_statements.RiskStatementGenerator.generate``.
|
|
59
|
+
|
|
60
|
+
v0.10.5 P9 extraction: this package was carved out of
|
|
61
|
+
``evidentia_ai.eval.*`` to keep air-gap installs of the
|
|
62
|
+
risk-statement runtime from pulling sentence-transformers /
|
|
63
|
+
numpy / instructor heavy-dep stacks. The dev-time eval harness
|
|
64
|
+
now installs separately (or via ``pip install
|
|
65
|
+
evidentia-eval[faithfulness-semantic]`` for the optional
|
|
66
|
+
semantic path).
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
from __future__ import annotations
|
|
70
|
+
|
|
71
|
+
from importlib.metadata import PackageNotFoundError
|
|
72
|
+
from importlib.metadata import version as _pkg_version
|
|
73
|
+
|
|
74
|
+
from evidentia_eval.claim_extraction import (
|
|
75
|
+
CLAIM_EXTRACTION_PROMPT,
|
|
76
|
+
extract_claims,
|
|
77
|
+
)
|
|
78
|
+
from evidentia_eval.faithfulness import (
|
|
79
|
+
DEFAULT_FAITHFULNESS_THRESHOLD,
|
|
80
|
+
FaithfulnessResult,
|
|
81
|
+
PromptFaithfulnessResult,
|
|
82
|
+
faithfulness_score,
|
|
83
|
+
)
|
|
84
|
+
from evidentia_eval.faithfulness_semantic import (
|
|
85
|
+
DEFAULT_SEMANTIC_MODEL,
|
|
86
|
+
DEFAULT_SEMANTIC_THRESHOLD,
|
|
87
|
+
SemanticFaithfulnessNotAvailableError,
|
|
88
|
+
faithfulness_score_semantic,
|
|
89
|
+
)
|
|
90
|
+
from evidentia_eval.harness import DFAHarness, EvalResult, EvalSample
|
|
91
|
+
from evidentia_eval.metrics import (
|
|
92
|
+
DeterminismResult,
|
|
93
|
+
ReplayResult,
|
|
94
|
+
determinism_score,
|
|
95
|
+
replay_equivalent,
|
|
96
|
+
)
|
|
97
|
+
from evidentia_eval.seeds import hash_output, normalize_for_determinism
|
|
98
|
+
from evidentia_eval.signing import (
|
|
99
|
+
sign_eval_result,
|
|
100
|
+
verify_eval_result,
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
try:
|
|
104
|
+
__version__ = _pkg_version("evidentia-eval")
|
|
105
|
+
except PackageNotFoundError: # pragma: no cover
|
|
106
|
+
__version__ = "0.0.0+unknown"
|
|
107
|
+
|
|
108
|
+
__all__ = [
|
|
109
|
+
"CLAIM_EXTRACTION_PROMPT",
|
|
110
|
+
"DEFAULT_FAITHFULNESS_THRESHOLD",
|
|
111
|
+
"DEFAULT_SEMANTIC_MODEL",
|
|
112
|
+
"DEFAULT_SEMANTIC_THRESHOLD",
|
|
113
|
+
"DFAHarness",
|
|
114
|
+
"DeterminismResult",
|
|
115
|
+
"EvalResult",
|
|
116
|
+
"EvalSample",
|
|
117
|
+
"FaithfulnessResult",
|
|
118
|
+
"PromptFaithfulnessResult",
|
|
119
|
+
"ReplayResult",
|
|
120
|
+
"SemanticFaithfulnessNotAvailableError",
|
|
121
|
+
"__version__",
|
|
122
|
+
"determinism_score",
|
|
123
|
+
"extract_claims",
|
|
124
|
+
"faithfulness_score",
|
|
125
|
+
"faithfulness_score_semantic",
|
|
126
|
+
"hash_output",
|
|
127
|
+
"normalize_for_determinism",
|
|
128
|
+
"replay_equivalent",
|
|
129
|
+
"sign_eval_result",
|
|
130
|
+
"verify_eval_result",
|
|
131
|
+
]
|