evidentia-eval 0.10.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,148 @@
1
+ # Secrets & credential stores (belt-and-suspenders safety net; the
2
+ # user's working directory MUST keep secrets in C:\Users\<user>\.secrets\
3
+ # and never inside the repo, but these patterns block accidental adds).
4
+ # .env.example IS tracked intentionally — it documents the expected
5
+ # variable names for new contributors (.env.template is un-ignored too).
6
+ .env
7
+ .env.*
8
+ !.env.example
9
+ !.env.template
10
+ *.pem
11
+ *.key
12
+ *.crt
13
+ *.p12
14
+ *.pfx
15
+ secrets/
16
+ credentials.json
17
+
18
+ # v0.4.0 — frontend build output lands in the Python package's static
19
+ # directory at wheel-assembly time via the hatchling build hook. The
20
+ # .gitkeep file in static/ is tracked; everything else is regenerated.
21
+ packages/evidentia-api/src/evidentia_api/static/assets/
22
+ packages/evidentia-api/src/evidentia_api/static/index.html
23
+ packages/evidentia-api/src/evidentia_api/static/*.js
24
+ packages/evidentia-api/src/evidentia_api/static/*.css
25
+
26
+ # Python
27
+ __pycache__/
28
+ *.py[cod]
29
+ *$py.class
30
+ *.so
31
+ .Python
32
+ build/
33
+ develop-eggs/
34
+ dist/
35
+ downloads/
36
+ eggs/
37
+ .eggs/
38
+ lib/
39
+ lib64/
40
+ parts/
41
+ sdist/
42
+ var/
43
+ wheels/
44
+ # NB: the unanchored `lib/` pattern above would otherwise also match
45
+ # packages/evidentia-ui/src/lib/ (TypeScript utils), so that directory
46
+ # is explicitly re-included by the negation below. Python-venv lib/
47
+ # directories stay ignored either way (.venv/ and venv/ are ignored below).
48
+ !packages/evidentia-ui/src/lib/
49
+ *.egg-info/
50
+ .installed.cfg
51
+ *.egg
52
+ MANIFEST
53
+
54
+ # Virtual environments
55
+ .venv/
56
+ venv/
57
+ ENV/
58
+ env/
59
+
60
+ # uv
61
+ # NOTE: uv.lock is committed for reproducible builds.
62
+ # https://docs.astral.sh/uv/concepts/projects/sync/#locking-dependencies
63
+
64
+ # Testing
65
+ .pytest_cache/
66
+ .coverage
67
+ .coverage.*
68
+ htmlcov/
69
+ .tox/
70
+ .cache
71
+ coverage.xml
72
+ *.cover
73
+ .hypothesis/
74
+
75
+ # mypy
76
+ .mypy_cache/
77
+ .dmypy.json
78
+ dmypy.json
79
+
80
+ # Ruff
81
+ .ruff_cache/
82
+
83
+ # Supply-chain scan artifact — the CycloneDX SBOM is regenerated by
84
+ # scripts/run_osv_scan.py and by release.yml; it is never committed.
85
+ evidentia-sbom.cdx.json
86
+ # Step 7 post-tag verification artifacts: SBOMs downloaded from the
87
+ # GitHub Release attached assets for osv-scan re-verification. The
88
+ # canonical SBOM lives on the published release, not in the repo.
89
+ published-sbom*.cdx.json
90
+
91
+ # IDE
92
+ # .vscode/ is ignored by default but the canonical shared workspace
93
+ # config files are version-controlled (see docs/ide-setup.md). Per-developer
94
+ # overrides (.vscode/*.local.json, scratch files, etc.) stay ignored.
95
+ .vscode/*
96
+ !.vscode/settings.json
97
+ !.vscode/launch.json
98
+ !.vscode/tasks.json
99
+ !.vscode/extensions.json
100
+ .idea/
101
+
102
+ # .cursor/ is the per-developer private Cursor workspace directory
103
+ # (project rules under .cursor/rules/*.mdc, MCP server configs, and any
104
+ # other Cursor IDE state). The public-facing Cursor conventions live at
105
+ # the repo-root .cursorrules file (already version-controlled). The
106
+ # .cursor/ directory is for per-developer extensions that should not be
107
+ # committed.
108
+ .cursor/
109
+
110
+ # Local-only / per-developer scratch directory for working notes,
111
+ # drafts, and anything not ready to share. The convention follows the
112
+ # .vscode/ split: ignore the whole directory by default; un-ignore
113
+ # specific files only if they're meant to be shared across the team.
114
+ .local/
115
+
116
+ # Private competitive-strategy / market-research working docs. The repo
117
+ # is public (Polycentric-Labs/evidentia); strategy material naming
118
+ # specific competitor-feature adoption decisions or commercial
119
+ # positioning stays out of the public tree. Neutral landscape analysis
120
+ # lives in docs/ (e.g. docs/positioning-and-value.md) and IS tracked.
121
+ /private/
122
+ *.swp
123
+ *.swo
124
+ *~
125
+ .DS_Store
126
+
127
+ # Claude Code local state
128
+ .claude/
129
+
130
+ # Evidentia runtime — user project state (NOT bundled examples).
131
+ # `.controlbridge/` and `/controlbridge.yaml` remain ignored as a courtesy
132
+ # for legacy project workspaces (v0.1.0 – v0.5.0) so files generated by
133
+ # pre-rename code don't leak into git when those projects are migrated.
134
+ .evidentia/
135
+ .controlbridge/
136
+ /evidentia.yaml
137
+ /controlbridge.yaml
138
+ *.local.yaml
139
+ evidence/
140
+ reports/
141
+ risks/
142
+
143
+ # Generated reports from examples (keep source files, ignore generated ones)
144
+ examples/**/report.json
145
+ examples/**/report.csv
146
+ examples/**/report.md
147
+ examples/**/report.oscal.json
148
+ examples/**/risks.json
@@ -0,0 +1,112 @@
1
+ Metadata-Version: 2.4
2
+ Name: evidentia-eval
3
+ Version: 0.10.5
4
+ Summary: DFAH (Decision-Faithfulness Assessment Harness) determinism + faithfulness eval harness for Evidentia — dev-time AI-output quality gates
5
+ Project-URL: Homepage, https://github.com/polycentric-labs/evidentia
6
+ Project-URL: Repository, https://github.com/polycentric-labs/evidentia
7
+ Project-URL: Issues, https://github.com/polycentric-labs/evidentia/issues
8
+ Project-URL: Changelog, https://github.com/polycentric-labs/evidentia/blob/main/CHANGELOG.md
9
+ Author-email: Allen Byrd <allen@allenfbyrd.com>
10
+ License-Expression: Apache-2.0
11
+ Keywords: ai-quality,compliance,determinism,dfah,faithfulness,grc,llm-eval
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Information Technology
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Typing :: Typed
19
+ Requires-Python: >=3.12
20
+ Requires-Dist: evidentia-core<0.11.0,>=0.10.5
21
+ Provides-Extra: faithfulness-semantic
22
+ Requires-Dist: numpy>=1.26; extra == 'faithfulness-semantic'
23
+ Requires-Dist: sentence-transformers>=3.0; extra == 'faithfulness-semantic'
24
+ Description-Content-Type: text/markdown
25
+
26
+ # evidentia-eval
27
+
28
+ Dev-time AI-output quality eval harness for Evidentia.
29
+
30
+ Hosts the **DFAH (Decision-Faithfulness Assessment Harness)** —
31
+ the auditor-defensible numerical proof layer that validates that
32
+ LLM-driven artifact production is deterministic,
33
+ replay-equivalent, and faithful to its source policy clauses.
34
+
35
+ ## Why this package exists (v0.10.5 P9 extraction)
36
+
37
+ The DFAH harness was originally bundled into `evidentia-ai` (the
38
+ risk-statement generator + control explainer package). That
39
+ conflated two very different deployment surfaces:
40
+
41
+ - **`evidentia-ai`** — PRODUCTION runtime. Needed in air-gap
42
+ installs to actually generate risk statements.
43
+ - **`evidentia-eval`** — DEVELOPMENT-time evaluation. NOT needed
44
+ in air-gap installs; only fires when a CI pipeline runs a
45
+ determinism / faithfulness gate before tagging a release.
46
+
47
+ Extracting the eval harness lets air-gap installs of
48
+ `evidentia-ai` skip the optional sentence-transformers stack
49
+ entirely (it now lives behind `evidentia-eval[faithfulness-semantic]`
50
+ instead of `evidentia-ai[eval-faithfulness]`).
51
+
52
+ ## Quick start
53
+
54
+ ```bash
55
+ # Stdlib Jaccard baseline (no extra needed; <10 MB install)
56
+ pip install evidentia-eval
57
+
58
+ # Optional semantic-similarity faithfulness (~250 MB extra
59
+ # for sentence-transformers + numpy + model cache on first use)
60
+ pip install 'evidentia-eval[faithfulness-semantic]'
61
+ ```
62
+
63
+ CLI verbs:
64
+
65
+ ```bash
66
+ # Smoke test against a deterministic stub generator (no LLM
67
+ # tokens burned)
68
+ evidentia eval stub-smoke
69
+
70
+ # Real-LLM determinism gate against the risk-statement generator
71
+ evidentia eval risk-determinism --gap-report gaps.json \
72
+ --system-context ctx.yaml \
73
+ --fail-on-determinism-rate-below 0.95
74
+
75
+ # Verify a previously-signed eval bundle
76
+ evidentia eval verify path/to/eval-output.json
77
+ ```
78
+
79
+ The CLI verbs live in `evidentia.cli.eval` (the meta-package);
80
+ this package contributes the underlying library.
81
+
82
+ ## Public API
83
+
84
+ | Symbol | Purpose |
85
+ |---|---|
86
+ | `DFAHarness` | Owns the run loop + audit emit |
87
+ | `EvalResult` | Top-level harness output (JSON-serializable, Sigstore-signable) |
88
+ | `EvalSample` | One prompt's inputs (immutable; audit-trail-stable) |
89
+ | `DeterminismResult` | Per-prompt determinism outcome |
90
+ | `ReplayResult` | Per-prompt replay-equivalence outcome |
91
+ | `FaithfulnessResult` | Per-claim faithfulness outcome |
92
+ | `PromptFaithfulnessResult` | Aggregated per-prompt faithfulness |
93
+ | `faithfulness_score` | Stdlib Jaccard token-overlap baseline |
94
+ | `faithfulness_score_semantic` | Sentence-transformers path (optional extra) |
95
+ | `determinism_score` | Computes the modal-output pass rate |
96
+ | `replay_equivalent` | Binary replay-equivalence check |
97
+ | `extract_claims` | Atomic-claim extraction from generated artifacts |
98
+ | `normalize_for_determinism` | Canonical whitespace + punctuation normalization |
99
+ | `hash_output` | SHA-256 hex of normalized output |
100
+ | `sign_eval_result` | Sigstore-sign an `EvalResult` JSON |
101
+ | `verify_eval_result` | Verify a previously-signed eval bundle |
102
+
103
+ ## Backward-compat shim
104
+
105
+ For external scripts that still import `from evidentia_ai.eval
106
+ import ...`, `evidentia-ai` ships a deprecation shim that
107
+ re-exports from `evidentia_eval`. The shim warns once at import
108
+ time and is scheduled for removal in **v0.12.0**.
109
+
110
+ ## License
111
+
112
+ Apache-2.0. See the workspace root LICENSE file.
@@ -0,0 +1,87 @@
1
+ # evidentia-eval
2
+
3
+ Dev-time AI-output quality eval harness for Evidentia.
4
+
5
+ Hosts the **DFAH (Decision-Faithfulness Assessment Harness)** —
6
+ the auditor-defensible numerical proof layer that validates that
7
+ LLM-driven artifact production is deterministic,
8
+ replay-equivalent, and faithful to its source policy clauses.
9
+
10
+ ## Why this package exists (v0.10.5 P9 extraction)
11
+
12
+ The DFAH harness was originally bundled into `evidentia-ai` (the
13
+ risk-statement generator + control explainer package). That
14
+ conflated two very different deployment surfaces:
15
+
16
+ - **`evidentia-ai`** — PRODUCTION runtime. Needed in air-gap
17
+ installs to actually generate risk statements.
18
+ - **`evidentia-eval`** — DEVELOPMENT-time evaluation. NOT needed
19
+ in air-gap installs; only fires when a CI pipeline runs a
20
+ determinism / faithfulness gate before tagging a release.
21
+
22
+ Extracting the eval harness lets air-gap installs of
23
+ `evidentia-ai` skip the optional sentence-transformers stack
24
+ entirely (it now lives behind `evidentia-eval[faithfulness-semantic]`
25
+ instead of `evidentia-ai[eval-faithfulness]`).
26
+
27
+ ## Quick start
28
+
29
+ ```bash
30
+ # Stdlib Jaccard baseline (no extra needed; <10 MB install)
31
+ pip install evidentia-eval
32
+
33
+ # Optional semantic-similarity faithfulness (~250 MB extra
34
+ # for sentence-transformers + numpy + model cache on first use)
35
+ pip install 'evidentia-eval[faithfulness-semantic]'
36
+ ```
37
+
38
+ CLI verbs:
39
+
40
+ ```bash
41
+ # Smoke test against a deterministic stub generator (no LLM
42
+ # tokens burned)
43
+ evidentia eval stub-smoke
44
+
45
+ # Real-LLM determinism gate against the risk-statement generator
46
+ evidentia eval risk-determinism --gap-report gaps.json \
47
+ --system-context ctx.yaml \
48
+ --fail-on-determinism-rate-below 0.95
49
+
50
+ # Verify a previously-signed eval bundle
51
+ evidentia eval verify path/to/eval-output.json
52
+ ```
53
+
54
+ The CLI verbs live in `evidentia.cli.eval` (the meta-package);
55
+ this package contributes the underlying library.
56
+
57
+ ## Public API
58
+
59
+ | Symbol | Purpose |
60
+ |---|---|
61
+ | `DFAHarness` | Owns the run loop + audit emit |
62
+ | `EvalResult` | Top-level harness output (JSON-serializable, Sigstore-signable) |
63
+ | `EvalSample` | One prompt's inputs (immutable; audit-trail-stable) |
64
+ | `DeterminismResult` | Per-prompt determinism outcome |
65
+ | `ReplayResult` | Per-prompt replay-equivalence outcome |
66
+ | `FaithfulnessResult` | Per-claim faithfulness outcome |
67
+ | `PromptFaithfulnessResult` | Aggregated per-prompt faithfulness |
68
+ | `faithfulness_score` | Stdlib Jaccard token-overlap baseline |
69
+ | `faithfulness_score_semantic` | Sentence-transformers path (optional extra) |
70
+ | `determinism_score` | Computes the modal-output pass rate |
71
+ | `replay_equivalent` | Binary replay-equivalence check |
72
+ | `extract_claims` | Atomic-claim extraction from generated artifacts |
73
+ | `normalize_for_determinism` | Canonical whitespace + punctuation normalization |
74
+ | `hash_output` | SHA-256 hex of normalized output |
75
+ | `sign_eval_result` | Sigstore-sign an `EvalResult` JSON |
76
+ | `verify_eval_result` | Verify a previously-signed eval bundle |
77
+
78
+ ## Backward-compat shim
79
+
80
+ For external scripts that still import `from evidentia_ai.eval
81
+ import ...`, `evidentia-ai` ships a deprecation shim that
82
+ re-exports from `evidentia_eval`. The shim warns once at import
83
+ time and is scheduled for removal in **v0.12.0**.
84
+
85
+ ## License
86
+
87
+ Apache-2.0. See the workspace root LICENSE file.
@@ -0,0 +1,49 @@
1
+ [project]
2
+ name = "evidentia-eval"
3
+ version = "0.10.5"
4
+ description = "DFAH (Decision-Faithfulness Assessment Harness) determinism + faithfulness eval harness for Evidentia — dev-time AI-output quality gates"
5
+ readme = "README.md"
6
+ authors = [{name = "Allen Byrd", email = "allen@allenfbyrd.com"}]
7
+ license = "Apache-2.0"
8
+ requires-python = ">=3.12"
9
+ keywords = ["grc", "compliance", "llm-eval", "dfah", "determinism", "faithfulness", "ai-quality"]
10
+ classifiers = [
11
+ "Development Status :: 3 - Alpha",
12
+ "Intended Audience :: Information Technology",
13
+ "License :: OSI Approved :: Apache Software License",
14
+ "Operating System :: OS Independent",
15
+ "Programming Language :: Python :: 3.12",
16
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
17
+ "Typing :: Typed",
18
+ ]
19
+ dependencies = [
20
+ "evidentia-core>=0.10.5,<0.11.0",
21
+ ]
22
+
23
+ [project.optional-dependencies]
24
+ # v0.10.5 P9 extraction: the v0.8.3 P1.1 sentence-transformers
25
+ # faithfulness path lives on evidentia-eval now. Same opt-in shape:
26
+ # ~90 MB model download (default all-MiniLM-L6-v2) on first use;
27
+ # cached at ~/.cache/huggingface/. Operators relying on the stdlib
28
+ # Jaccard baseline are unaffected (no extra needed). Heavyweight;
29
+ # intentionally extra-gated.
30
+ faithfulness-semantic = [
31
+ "sentence-transformers>=3.0",
32
+ "numpy>=1.26",
33
+ ]
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/polycentric-labs/evidentia"
37
+ Repository = "https://github.com/polycentric-labs/evidentia"
38
+ Issues = "https://github.com/polycentric-labs/evidentia/issues"
39
+ Changelog = "https://github.com/polycentric-labs/evidentia/blob/main/CHANGELOG.md"
40
+
41
+ [build-system]
42
+ requires = ["hatchling"]
43
+ build-backend = "hatchling.build"
44
+
45
+ [tool.hatch.build.targets.wheel]
46
+ packages = ["src/evidentia_eval"]
47
+
48
+ [tool.uv.sources]
49
+ evidentia-core = { workspace = true }
@@ -0,0 +1,131 @@
1
+ """evidentia-eval — DFAH determinism + faithfulness harness for Evidentia.
2
+
3
+ Decision-Faithfulness Assessment Harness per arXiv 2601.15322.
4
+ Validates that risk-statement generation (or any AI-driven
5
+ artifact production) is auditor-defensibly stable: same input +
6
+ same model + same temperature produces the same output, and a
7
+ re-run with pinned ``(input, model, temperature, prompt_hash,
8
+ run_id)`` is byte-equivalent to the original.
9
+
10
+ Three metrics ship:
11
+
12
+ - **Decision determinism** — same prompt produces the same
13
+ normalized output across N samples. The pass rate is the
14
+ fraction of samples that match the modal output (modulo
15
+ whitespace + punctuation normalization). Reported as a 0..1
16
+ score; CI-gateable via
17
+ ``evidentia eval risk-determinism --fail-on-determinism-rate-below 0.95``.
18
+ - **Replay equivalence** — re-running with a pinned context
19
+ (``GenerationContext`` instance) produces an output whose
20
+ SHA-256 hash matches the original. Either the run is
21
+ replay-equivalent or it isn't — there is no graceful degradation.
22
+ - **Faithfulness** — do the atomic claims in a generated artifact
23
+ trace back to source policy clauses? Stdlib Jaccard baseline
24
+ (always available) + optional sentence-transformers semantic
25
+ path (``[faithfulness-semantic]`` extra).
26
+
27
+ Public API:
28
+
29
+ - :class:`DFAHarness` — owns the run loop + audit emit.
30
+ - :class:`DeterminismResult` — Pydantic model summarizing one
31
+ prompt's determinism outcome (modal output + pass rate +
32
+ per-sample hashes).
33
+ - :class:`ReplayResult` — Pydantic model summarizing
34
+ replay-equivalence for a single ``GenerationContext`` re-run.
35
+ - :class:`EvalResult` — top-level harness output covering all
36
+ prompts in one ``run_id``.
37
+ - :class:`FaithfulnessResult` — per-claim faithfulness outcome.
38
+ - :class:`PromptFaithfulnessResult` — aggregated per-prompt
39
+ faithfulness outcome.
40
+ - :func:`faithfulness_score` — stdlib Jaccard token-overlap
41
+ baseline.
42
+ - :func:`faithfulness_score_semantic` — sentence-transformers
43
+ semantic-similarity path (opt-in extra).
44
+ - :func:`extract_claims` — atomic-claim extraction from generated
45
+ artifacts.
46
+ - :func:`normalize_for_determinism` — canonical normalization
47
+ (whitespace + punctuation) used by the determinism check.
48
+ - :func:`hash_output` — SHA-256 hex of normalized output.
49
+ - :func:`sign_eval_result` / :func:`verify_eval_result` —
50
+ Sigstore-sign + verify the eval output.
51
+
52
+ The harness is generator-agnostic: it accepts any callable
53
+ ``(prompt: str, context: GenerationContext) -> str`` so the
54
+ same machinery validates risk statements, control
55
+ explanations, future PRT-traced outputs, and any third-party
56
+ plugin's AI-generated artifacts. Unit tests use a deterministic
57
+ fake generator; live operator runs wire in
58
+ ``evidentia_ai.risk_statements.RiskStatementGenerator.generate``.
59
+
60
+ v0.10.5 P9 extraction: this package was carved out of
61
+ ``evidentia_ai.eval.*`` to keep air-gap installs of the
62
+ risk-statement runtime from pulling sentence-transformers /
63
+ numpy / instructor heavy-dep stacks. The dev-time eval harness
64
+ now installs separately (or via ``pip install
65
+ evidentia-eval[faithfulness-semantic]`` for the optional
66
+ semantic path).
67
+ """
68
+
69
+ from __future__ import annotations
70
+
71
+ from importlib.metadata import PackageNotFoundError
72
+ from importlib.metadata import version as _pkg_version
73
+
74
+ from evidentia_eval.claim_extraction import (
75
+ CLAIM_EXTRACTION_PROMPT,
76
+ extract_claims,
77
+ )
78
+ from evidentia_eval.faithfulness import (
79
+ DEFAULT_FAITHFULNESS_THRESHOLD,
80
+ FaithfulnessResult,
81
+ PromptFaithfulnessResult,
82
+ faithfulness_score,
83
+ )
84
+ from evidentia_eval.faithfulness_semantic import (
85
+ DEFAULT_SEMANTIC_MODEL,
86
+ DEFAULT_SEMANTIC_THRESHOLD,
87
+ SemanticFaithfulnessNotAvailableError,
88
+ faithfulness_score_semantic,
89
+ )
90
+ from evidentia_eval.harness import DFAHarness, EvalResult, EvalSample
91
+ from evidentia_eval.metrics import (
92
+ DeterminismResult,
93
+ ReplayResult,
94
+ determinism_score,
95
+ replay_equivalent,
96
+ )
97
+ from evidentia_eval.seeds import hash_output, normalize_for_determinism
98
+ from evidentia_eval.signing import (
99
+ sign_eval_result,
100
+ verify_eval_result,
101
+ )
102
+
103
+ try:
104
+ __version__ = _pkg_version("evidentia-eval")
105
+ except PackageNotFoundError: # pragma: no cover
106
+ __version__ = "0.0.0+unknown"
107
+
108
+ __all__ = [
109
+ "CLAIM_EXTRACTION_PROMPT",
110
+ "DEFAULT_FAITHFULNESS_THRESHOLD",
111
+ "DEFAULT_SEMANTIC_MODEL",
112
+ "DEFAULT_SEMANTIC_THRESHOLD",
113
+ "DFAHarness",
114
+ "DeterminismResult",
115
+ "EvalResult",
116
+ "EvalSample",
117
+ "FaithfulnessResult",
118
+ "PromptFaithfulnessResult",
119
+ "ReplayResult",
120
+ "SemanticFaithfulnessNotAvailableError",
121
+ "__version__",
122
+ "determinism_score",
123
+ "extract_claims",
124
+ "faithfulness_score",
125
+ "faithfulness_score_semantic",
126
+ "hash_output",
127
+ "normalize_for_determinism",
128
+ "replay_equivalent",
129
+ "sign_eval_result",
130
+ "verify_eval_result",
131
+ ]