dream-eval 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dream_eval/__init__.py +18 -0
- dream_eval/backends.py +172 -0
- dream_eval/backends_pg.py +131 -0
- dream_eval/cli.py +158 -0
- dream_eval/gates.py +94 -0
- dream_eval/mcp/__init__.py +5 -0
- dream_eval/mcp/server.py +207 -0
- dream_eval/nli.py +96 -0
- dream_eval/py.typed +0 -0
- dream_eval/scoring.py +225 -0
- dream_eval/types.py +153 -0
- dream_eval-0.2.0.dist-info/METADATA +132 -0
- dream_eval-0.2.0.dist-info/RECORD +15 -0
- dream_eval-0.2.0.dist-info/WHEEL +4 -0
- dream_eval-0.2.0.dist-info/entry_points.txt +3 -0
dream_eval/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""dream-eval — Agent-agnostic faithfulness evaluation framework."""
|
|
2
|
+
|
|
3
|
+
from dream_eval.gates import check_hash_determinism, check_secret_leak
|
|
4
|
+
from dream_eval.scoring import compute_faithfulness, compute_precision, compute_recall
|
|
5
|
+
from dream_eval.types import EvalResult, FaithfulnessReport, GateResult
|
|
6
|
+
|
|
7
|
+
# Public API of the package: the names re-exported by the imports above and
# exposed by `from dream_eval import *`.
__all__ = [
    "EvalResult",
    "FaithfulnessReport",
    "GateResult",
    "check_hash_determinism",
    "check_secret_leak",
    "compute_faithfulness",
    "compute_precision",
    "compute_recall",
]
|
|
17
|
+
|
|
18
|
+
# Must match the distribution metadata: this wheel is published as dream-eval 0.2.0.
__version__ = "0.2.0"
|
dream_eval/backends.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Memory backend adapters — abstract interface for different storage systems."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from dream_eval.types import EvalReport, EvalResult, Labels
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class MemoryBackend(ABC):
    """Abstract interface for memory backends.

    dream-eval is backend-agnostic: implement this interface to plug in
    Postgres, LanceDB, knowledge graphs, or any other storage system.

    All four methods are abstract, so a concrete subclass has to override
    every one of them before it can be instantiated.
    """

    @abstractmethod
    def load_eval_report(self, run_id: str) -> EvalReport | None:
        """Load an evaluator report by run ID.

        The backends in this package return ``None`` when nothing is stored
        for ``run_id``.
        """

    @abstractmethod
    def load_labels(self, corpus_path: str | None = None) -> Labels:
        """Load golden corpus labels.

        ``corpus_path`` is the corpus directory; when it is ``None`` the
        backend chooses its own default.
        """

    @abstractmethod
    def save_eval_result(self, result: EvalResult) -> None:
        """Persist an eval result (gates + scoring)."""

    @abstractmethod
    def list_runs(self, limit: int = 50) -> list[dict[str, Any]]:
        """List recent eval runs with summary info.

        ``limit`` caps the number of returned entries.
        """
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class JsonFileBackend(MemoryBackend):
    """Simple file-based backend for local eval runs.

    Reads/writes to ``<results_dir>/<run_id>/`` directories
    (``eval/results/<run_id>/`` by default).
    """

    def __init__(self, results_dir: str = "eval/results") -> None:
        """Remember the directory that holds one sub-directory per run."""
        self.results_dir = Path(results_dir)

    def load_eval_report(self, run_id: str) -> EvalReport | None:
        """Return the report stored in ``<run_id>/eval-report.json``.

        Returns ``None`` when that file does not exist.
        """
        import json

        path = self.results_dir / run_id / "eval-report.json"
        if not path.exists():
            return None
        data = json.loads(path.read_text(encoding="utf-8"))
        return EvalReport.model_validate(data)

    def load_labels(self, corpus_path: str | None = None) -> Labels:
        """Load ``labels.json`` from the corpus directory.

        Without ``corpus_path`` the corpus is expected in a ``golden-corpus``
        directory next to ``results_dir``. A missing labels file yields an
        empty ``Labels()`` instead of an error.
        """
        import json

        base = Path(corpus_path) if corpus_path else self.results_dir.parent / "golden-corpus"
        labels_file = base / "labels.json"
        if not labels_file.exists():
            return Labels()
        data = json.loads(labels_file.read_text(encoding="utf-8"))
        return Labels.model_validate(data)

    def save_eval_result(self, result: EvalResult) -> None:
        """Write ``metrics.json`` and ``summary.md`` into the run's directory.

        The directory is created on demand; existing files are overwritten.
        """
        import json

        out_dir = self.results_dir / result.run_id
        out_dir.mkdir(parents=True, exist_ok=True)

        metrics_path = out_dir / "metrics.json"
        metrics_path.write_text(
            # default=str stringifies values json cannot encode natively.
            json.dumps(result.to_metrics_dict(), indent=2, default=str),
            encoding="utf-8",
        )

        summary = _render_summary(result)
        summary_path = out_dir / "summary.md"
        summary_path.write_text(summary, encoding="utf-8")

    def list_runs(self, limit: int = 50) -> list[dict[str, Any]]:
        """Return up to ``limit`` run summaries, highest directory name first.

        Only sub-directories containing a ``metrics.json`` are reported.
        A non-positive ``limit`` returns an empty list.
        """
        import json

        runs: list[dict[str, Any]] = []
        # Guard limit <= 0 up front: the in-loop check only fires after an
        # entry has already been appended, which used to leak one run.
        if limit <= 0 or not self.results_dir.exists():
            return runs

        for d in sorted(self.results_dir.iterdir(), reverse=True):
            if not d.is_dir():
                continue
            metrics_file = d / "metrics.json"
            if not metrics_file.exists():
                continue
            data = json.loads(metrics_file.read_text(encoding="utf-8"))
            runs.append({
                "run_id": d.name,
                "faithfulness": data.get("faithfulness_score"),
                "secret_leak": data.get("secret_leak_test"),
            })
            if len(runs) >= limit:
                break
        return runs
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class PostgresBackend(MemoryBackend):
    """Postgres-backed eval results storage.

    Stores eval results in the agent_memory table alongside other memory records.
    Requires psycopg[binary] >= 3.2.

    Thin adapter: every operation is delegated to
    ``dream_eval.backends_pg.PostgresEvalBackend``, which is imported lazily so
    the Postgres driver is only needed when this backend is constructed.
    The backend can be used as a context manager (or closed explicitly with
    ``close()``) so the underlying connection pool is released.
    """

    def __init__(self, dsn: str | None = None) -> None:
        """Create the delegate; ``dsn`` is passed through unchanged."""
        from dream_eval.backends_pg import PostgresEvalBackend

        self._impl = PostgresEvalBackend(dsn)

    def close(self) -> None:
        """Close the delegate and the connection pool it owns."""
        self._impl.close()

    def __enter__(self) -> PostgresBackend:
        return self

    def __exit__(self, *exc: Any) -> None:
        self.close()

    def load_eval_report(self, run_id: str) -> EvalReport | None:
        """Delegate to ``PostgresEvalBackend.load_eval_report``."""
        return self._impl.load_eval_report(run_id)

    def load_labels(self, corpus_path: str | None = None) -> Labels:
        """Delegate to ``PostgresEvalBackend.load_labels``."""
        return self._impl.load_labels(corpus_path)

    def save_eval_result(self, result: EvalResult) -> None:
        """Delegate to ``PostgresEvalBackend.save_eval_result``."""
        self._impl.save_eval_result(result)

    def list_runs(self, limit: int = 50) -> list[dict[str, Any]]:
        """Delegate to ``PostgresEvalBackend.list_runs``."""
        return self._impl.list_runs(limit)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _render_summary(result: EvalResult) -> str:
|
|
129
|
+
"""Render eval result as markdown summary."""
|
|
130
|
+
gate_lines = []
|
|
131
|
+
for g in result.gates:
|
|
132
|
+
icon = {"pass": "OK", "fail": "FAIL", "warn": "WARN", "skip": "SKIP"}.get(
|
|
133
|
+
g.status.value, "?"
|
|
134
|
+
)
|
|
135
|
+
gate_lines.append(f"| {g.name} | {icon} | {g.message} |")
|
|
136
|
+
|
|
137
|
+
f = result.faithfulness
|
|
138
|
+
gates_table = "\n".join(gate_lines) if gate_lines else "| (none) | - | - |"
|
|
139
|
+
|
|
140
|
+
violations = ""
|
|
141
|
+
if f.recurrence_violations:
|
|
142
|
+
violations = "## Recurrence Violations\n" + "".join(
|
|
143
|
+
f"- {v}\n" for v in f.recurrence_violations
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
return f"""# Eval Summary — {result.run_id}
|
|
147
|
+
|
|
148
|
+
**Mode:** {result.mode.value}
|
|
149
|
+
**Date:** {result.date.isoformat()}
|
|
150
|
+
**Hard fail:** {"YES" if result.hard_fail else "no"}
|
|
151
|
+
|
|
152
|
+
## Deterministic Gates
|
|
153
|
+
|
|
154
|
+
| Gate | Status | Message |
|
|
155
|
+
|------|--------|---------|
|
|
156
|
+
{gates_table}
|
|
157
|
+
|
|
158
|
+
## Faithfulness
|
|
159
|
+
|
|
160
|
+
| Metric | Value |
|
|
161
|
+
|--------|-------|
|
|
162
|
+
| Faithfulness | {f.faithfulness_score:.3f} |
|
|
163
|
+
| Precision | {f.precision:.3f} |
|
|
164
|
+
| Recall | {f.recall:.3f} |
|
|
165
|
+
| Recurrence calibration | {f.recurrence_calibration:.3f} |
|
|
166
|
+
| Items proposed | {f.items_proposed} |
|
|
167
|
+
| Fully supported | {f.items_fully_supported} |
|
|
168
|
+
| Partially supported | {f.items_partially_supported} |
|
|
169
|
+
| Unsupported | {f.items_unsupported} |
|
|
170
|
+
|
|
171
|
+
{violations}
|
|
172
|
+
"""
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Postgres backend for dream-eval — stores eval results alongside agent memory."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from dream_eval.types import EvalReport, EvalResult, Labels
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class PostgresEvalBackend:
    """Stores eval results in the agent_memory table.

    Uses the existing agent_memory schema — eval results are stored as
    memory_type='decision' with metadata['dream_eval']=True.

    Requires psycopg[binary] >= 3.2 and a running Postgres instance.
    Usable as a context manager; ``close()`` releases the connection pool.
    """

    def __init__(self, dsn: str | None = None) -> None:
        """Open a connection pool for ``dsn``.

        When ``dsn`` is ``None`` it is taken from ``AGENT_MEMORY_DATABASE_URL``
        and then ``DATABASE_URL``.

        Raises:
            ValueError: if no DSN is given and neither variable is set.
        """
        import os

        if dsn is None:
            dsn = os.environ.get("AGENT_MEMORY_DATABASE_URL") or os.environ.get(
                "DATABASE_URL"
            )
        if not dsn:
            raise ValueError(
                "No database DSN configured. Set AGENT_MEMORY_DATABASE_URL or DATABASE_URL."
            )
        # Driver imports stay local so importing this module does not require psycopg.
        from psycopg.rows import dict_row
        from psycopg_pool import ConnectionPool

        self._pool = ConnectionPool(
            dsn,
            min_size=1,
            max_size=5,
            # Open explicitly: relying on the implicit open-in-constructor
            # default is deprecated in psycopg_pool.
            open=True,
            # dict_row lets the queries below read columns by name.
            kwargs={"row_factory": dict_row},
        )

    def close(self) -> None:
        """Close the connection pool."""
        self._pool.close()

    def __enter__(self) -> PostgresEvalBackend:
        return self

    def __exit__(self, *exc: Any) -> None:
        self.close()

    def save_eval_result(self, result: EvalResult) -> None:
        """Persist eval result as a memory record.

        The metrics dict goes into ``content`` and the run id into
        ``session_id``; ``metadata`` carries the ``dream_eval`` marker the
        read queries filter on.
        """
        # default=str matches the file backend and the CLI, so values json
        # cannot encode natively are stringified instead of raising TypeError.
        content = json.dumps(result.to_metrics_dict(), default=str)
        metadata = json.dumps({"dream_eval": True, "mode": result.mode.value})
        with self._pool.connection() as conn:
            conn.execute(
                """
                INSERT INTO agent_memory (
                    agent_id, session_id, session_type, memory_type,
                    content, source, metadata
                ) VALUES (%s, %s, %s, %s, %s::jsonb, %s, %s::jsonb)
                """,
                (
                    "dream-eval",
                    result.run_id,
                    "dream_eval",
                    "decision",
                    content,
                    "sdk",
                    metadata,
                ),
            )
            conn.commit()

    def load_eval_report(self, run_id: str) -> EvalReport | None:
        """Load eval report from agent_memory by session_id.

        Returns ``None`` when no dream-eval record exists for ``run_id``.
        The returned report is rebuilt from the newest stored record and
        always has an empty ``items`` list; only ``sessions_evaluated``,
        ``token_cost`` and ``latency`` are read back (defaulting to 0).
        """
        with self._pool.connection() as conn:
            # `?` is the jsonb key-exists operator here, not a placeholder.
            row = conn.execute(
                """
                SELECT content FROM agent_memory
                WHERE session_id = %s AND metadata ? 'dream_eval'
                ORDER BY created_at DESC LIMIT 1
                """,
                (run_id,),
            ).fetchone()
        if not row:
            return None
        content = row["content"]
        # content may come back as JSON text rather than an already-decoded dict.
        if isinstance(content, str):
            content = json.loads(content)
        return EvalReport(
            items=[],
            sessions_evaluated=content.get("sessions_evaluated", 0),
            token_cost=content.get("token_cost", 0),
            latency=content.get("latency", 0),
        )

    def load_labels(self, corpus_path: str | None = None) -> Labels:
        """Load labels from a JSON file. Falls back to empty labels.

        Labels are read from ``<corpus_path>/labels.json`` on disk, not from
        the database; no path or a missing file yields ``Labels()``.
        """
        from pathlib import Path

        if corpus_path is None:
            return Labels()

        labels_file = Path(corpus_path) / "labels.json"
        if not labels_file.exists():
            return Labels()

        data = json.loads(labels_file.read_text(encoding="utf-8"))
        return Labels.model_validate(data)

    def list_runs(self, limit: int = 50) -> list[dict[str, Any]]:
        """List recent eval runs from agent_memory.

        Returns at most ``limit`` entries, newest first. A negative ``limit``
        is treated as 0 because SQL ``LIMIT`` rejects negative values.
        """
        with self._pool.connection() as conn:
            rows = conn.execute(
                """
                SELECT session_id AS run_id, content, created_at
                FROM agent_memory
                WHERE metadata ? 'dream_eval'
                ORDER BY created_at DESC
                LIMIT %s
                """,
                (max(limit, 0),),
            ).fetchall()

        runs: list[dict[str, Any]] = []
        for row in rows:
            content = row["content"]
            if isinstance(content, str):
                content = json.loads(content)
            runs.append({
                "run_id": row["run_id"],
                "faithfulness": content.get("faithfulness_score"),
                "secret_leak": content.get("secret_leak_test"),
                "date": row["created_at"].isoformat() if row["created_at"] else None,
            })
        return runs
|
dream_eval/cli.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""CLI for dream-eval — run evaluations, check gates, score faithfulness."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import json
|
|
8
|
+
import sys
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
|
|
11
|
+
from dream_eval.types import EvalMode, EvalResult
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def main() -> None:
    """Parse ``sys.argv`` and run the selected sub-command.

    Sub-commands: ``run``, ``gates``, ``score``, ``list`` and ``show``.
    A sub-command is mandatory; argparse exits with a usage error otherwise.
    """
    parser = argparse.ArgumentParser(
        description="dream-eval — agent memory faithfulness evaluation"
    )
    subcommands = parser.add_subparsers(dest="command", required=True)

    # run: full pipeline
    run_cmd = subcommands.add_parser("run", help="Run a full eval pipeline")
    run_cmd.add_argument("--corpus", default=None, help="Path to golden corpus directory")
    run_cmd.add_argument("--mode", choices=["golden", "live"], default="golden")
    run_cmd.add_argument("--output-dir", default=None, help="Output directory for results")

    # gates: deterministic checks only
    gates_cmd = subcommands.add_parser("gates", help="Run only deterministic gates")
    gates_cmd.add_argument("--text", default=None, help="Text to check for secret leaks")
    gates_cmd.add_argument("--file", default=None, help="File to check hash determinism")
    gates_cmd.add_argument("--hash", default=None, help="Expected hash (sha256:hex)")

    # score: faithfulness of an existing report
    score_cmd = subcommands.add_parser("score", help="Score an eval report against labels")
    score_cmd.add_argument("--report", required=True, help="Path to eval-report.json")
    score_cmd.add_argument("--labels", default=None, help="Path to labels.json")

    # list: recent runs
    list_cmd = subcommands.add_parser("list", help="List recent eval runs")
    list_cmd.add_argument("--limit", type=int, default=10)
    list_cmd.add_argument("--backend", choices=["json", "postgres"], default="json")
    list_cmd.add_argument("--dsn", default=None, help="Postgres DSN (for postgres backend)")

    # show: one run
    show_cmd = subcommands.add_parser("show", help="Show a specific eval run result")
    show_cmd.add_argument("run_id")
    show_cmd.add_argument("--backend", choices=["json", "postgres"], default="json")

    args = parser.parse_args()

    handlers = {
        "gates": _run_gates,
        "run": _run_eval,
        "score": _run_score,
        "list": _run_list,
        "show": _run_show,
    }
    handler = handlers.get(args.command)
    if handler is not None:
        handler(args)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _run_gates(args: argparse.Namespace) -> None:
    """Run the deterministic gates selected on the command line.

    ``--text`` triggers the secret-leak gate and ``--file`` the hash gate
    (compared against ``--hash`` when given). Each result is printed as one
    JSON line. Exits with status 1 when no check was requested or when any
    gate reports ``fail``.
    """
    from dream_eval.gates import check_hash_determinism, check_secret_leak

    results = []

    if args.text:
        # NOTE(review): no forbidden patterns are passed here, so
        # check_secret_leak takes its "no patterns configured" path.
        results.append(check_secret_leak(args.text))

    if args.file:
        # Read inside a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(args.file, encoding="utf-8") as handle:
            file_content = handle.read()
        results.append(check_hash_determinism(file_content, args.hash))

    if not results:
        print("No checks specified. Use --text or --file.", file=sys.stderr)
        sys.exit(1)

    for r in results:
        print(json.dumps(r.model_dump(mode="json"), default=str))

    if any(r.status.value == "fail" for r in results):
        sys.exit(1)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _run_eval(args: argparse.Namespace) -> None:
    """Create a new eval run and write its ``metrics.json``.

    The run directory is ``<output-dir>/<run_id>`` (``eval/results`` when
    ``--output-dir`` is not given). A one-line JSON summary with the run id,
    output path and hard-fail flag is printed to stdout.

    NOTE(review): ``args.corpus`` is not read here, and no gates or scoring
    are invoked before the metrics are written.
    """
    from datetime import timezone
    from pathlib import Path

    # Take one UTC timestamp: the run id carries a literal "Z" suffix, so it
    # has to be built from UTC, and run_id/date must describe the same instant.
    now = datetime.now(timezone.utc)
    result = EvalResult(
        run_id=now.strftime("%Y-%m-%dT%H-%M-%SZ"),
        date=now,
        mode=EvalMode(args.mode),
    )

    output_dir = args.output_dir or "eval/results"
    out_path = Path(output_dir) / result.run_id
    out_path.mkdir(parents=True, exist_ok=True)

    metrics_path = out_path / "metrics.json"
    metrics_path.write_text(
        json.dumps(result.to_metrics_dict(), indent=2, default=str),
        encoding="utf-8",
    )

    print(json.dumps({
        "run_id": result.run_id,
        "output": str(out_path),
        "hard_fail": result.hard_fail,
    }, default=str))
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _run_score(args: argparse.Namespace) -> None:
    """Score an eval report against golden labels and print the result as JSON.

    ``--report`` is the path to ``eval-report.json``. Without ``--labels``
    the labels file is looked up relative to the report: first in a
    ``golden-corpus`` directory beside the run directory (the historical
    default), then in ``golden-corpus`` beside the results directory, which
    is where ``JsonFileBackend.load_labels`` looks. Exits with status 1 when
    no labels file can be found.
    """
    from pathlib import Path

    from dream_eval.scoring import compute_faithfulness
    from dream_eval.types import EvalReport, Labels

    report_path = Path(args.report)
    report_data = json.loads(report_path.read_text(encoding="utf-8"))
    report = EvalReport.model_validate(report_data)

    if args.labels:
        labels_path = Path(args.labels)
    else:
        run_dir = report_path.parent
        legacy_path = run_dir.parent / "golden-corpus" / "labels.json"
        backend_path = run_dir.parent.parent / "golden-corpus" / "labels.json"
        # Keep the historical location whenever it exists; only fall back to
        # the backend layout when that is the one actually present.
        if legacy_path.exists() or not backend_path.exists():
            labels_path = legacy_path
        else:
            labels_path = backend_path

    if not labels_path.is_file():
        print(f"Labels file {labels_path} not found", file=sys.stderr)
        sys.exit(1)

    labels_data = json.loads(labels_path.read_text(encoding="utf-8"))
    labels = Labels.model_validate(labels_data)

    faithfulness = compute_faithfulness(report.items, labels.items)
    print(json.dumps(faithfulness.model_dump(mode="json"), default=str))
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _run_list(args: argparse.Namespace) -> None:
    """Print the most recent eval runs, one JSON object per line.

    ``--backend`` picks the storage (``json`` or ``postgres``); ``--dsn`` is
    only passed on to the Postgres backend. ``--limit`` caps the row count.
    """
    from dream_eval.backends import JsonFileBackend, PostgresBackend

    use_postgres = args.backend == "postgres"
    backend = PostgresBackend(dsn=args.dsn) if use_postgres else JsonFileBackend()

    for entry in backend.list_runs(limit=args.limit):
        print(json.dumps(entry, default=str))
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _run_show(args: argparse.Namespace) -> None:
    """Print the stored report of one run as JSON.

    The backend is chosen by ``--backend`` and built with its defaults.
    When the backend has no report for ``run_id`` an error goes to stderr
    and the process exits with status 1.
    """
    from dream_eval.backends import JsonFileBackend, PostgresBackend

    backend_cls = PostgresBackend if args.backend == "postgres" else JsonFileBackend
    backend = backend_cls()

    report = backend.load_eval_report(args.run_id)
    if not report:
        print(f"Run {args.run_id} not found", file=sys.stderr)
        sys.exit(1)

    print(json.dumps(report.model_dump(mode="json"), default=str))
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
dream_eval/gates.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Deterministic gates — hard stops that must pass before scoring."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
from dream_eval.types import GateResult, GateStatus
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def check_secret_leak(
    output_text: str,
    forbidden_patterns: list[str] | None = None,
) -> GateResult:
    """Scan evaluator output for forbidden (secret) patterns.

    Each pattern is tried as a case-insensitive regular expression; a
    pattern that is not a valid regex is matched as a case-insensitive
    substring instead.

    Returns a ``secret_leak`` gate result:
      * SKIP when no patterns are configured,
      * FAIL (hard stop) when at least one pattern matches, with the matching
        patterns listed under ``details["matched_patterns"]``,
      * PASS otherwise.
    """
    gate_name = "secret_leak"

    if not forbidden_patterns:
        return GateResult(
            name=gate_name,
            status=GateStatus.SKIP,
            message="No forbidden patterns configured",
        )

    def _is_leaked(pattern: str) -> bool:
        try:
            return re.search(pattern, output_text, re.IGNORECASE) is not None
        except re.error:
            # Not a valid regex: treat it as literal text.
            return pattern.lower() in output_text.lower()

    leaked: list[str] = [p for p in forbidden_patterns if _is_leaked(p)]

    if not leaked:
        return GateResult(
            name=gate_name,
            status=GateStatus.PASS,
            message="No forbidden patterns found in output",
        )

    return GateResult(
        name=gate_name,
        status=GateStatus.FAIL,
        message=f"Secret leak detected: {len(leaked)} forbidden pattern(s) matched",
        details={"matched_patterns": leaked},
    )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _canonical_hash(content: str) -> str:
|
|
51
|
+
"""Canonical SHA-256: strip BOM, normalize CRLF→LF, hex digest with prefix."""
|
|
52
|
+
text = content
|
|
53
|
+
if text.startswith("\ufeff"):
|
|
54
|
+
text = text[1:]
|
|
55
|
+
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
|
56
|
+
digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
|
|
57
|
+
return f"sha256:{digest}"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def check_hash_determinism(
    content: str,
    expected_hash: str | None = None,
) -> GateResult:
    """Hash *content* canonically and optionally compare with an expected hash.

    The hash comes from ``_canonical_hash`` (BOM stripped, line endings
    normalized to LF, SHA-256 with a ``sha256:`` prefix).

    Returns a ``hash_determinism`` gate result:
      * PASS with the computed hash when ``expected_hash`` is ``None``,
      * PASS when the computed hash equals ``expected_hash`` exactly,
      * FAIL otherwise, with both values under ``details``.
    """
    gate_name = "hash_determinism"
    actual_hash = _canonical_hash(content)

    if expected_hash is not None and actual_hash != expected_hash:
        return GateResult(
            name=gate_name,
            status=GateStatus.FAIL,
            message=f"Hash mismatch: expected {expected_hash}, got {actual_hash}",
            details={"expected": expected_hash, "actual": actual_hash},
        )

    if expected_hash is None:
        message = f"Hash computed: {actual_hash}"
    else:
        message = "Hash matches expected value"

    return GateResult(
        name=gate_name,
        status=GateStatus.PASS,
        message=message,
        details={"hash": actual_hash},
    )
|