dream-eval 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dream_eval/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ """dream-eval — Agent-agnostic faithfulness evaluation framework."""
2
+
3
+ from dream_eval.gates import check_hash_determinism, check_secret_leak
4
+ from dream_eval.scoring import compute_faithfulness, compute_precision, compute_recall
5
+ from dream_eval.types import EvalResult, FaithfulnessReport, GateResult
6
+
7
+ __all__ = [
8
+ "EvalResult",
9
+ "FaithfulnessReport",
10
+ "GateResult",
11
+ "check_hash_determinism",
12
+ "check_secret_leak",
13
+ "compute_faithfulness",
14
+ "compute_precision",
15
+ "compute_recall",
16
+ ]
17
+
18
# Must match the published distribution version (this wheel is dream-eval 0.2.0).
__version__ = "0.2.0"
dream_eval/backends.py ADDED
@@ -0,0 +1,172 @@
1
+ """Memory backend adapters — abstract interface for different storage systems."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from dream_eval.types import EvalReport, EvalResult, Labels
10
+
11
+
12
class MemoryBackend(ABC):
    """Storage contract that every dream-eval backend implements.

    dream-eval is backend-agnostic: subclass this interface to plug in
    Postgres, LanceDB, knowledge graphs, or any other storage system.
    All four operations are abstract, so a subclass must provide each one
    before it can be instantiated.
    """

    @abstractmethod
    def load_eval_report(self, run_id: str) -> EvalReport | None:
        """Fetch the evaluator report recorded under ``run_id``."""

    @abstractmethod
    def load_labels(self, corpus_path: str | None = None) -> Labels:
        """Fetch the golden-corpus labels, optionally from ``corpus_path``."""

    @abstractmethod
    def save_eval_result(self, result: EvalResult) -> None:
        """Store an eval result (gates + scoring)."""

    @abstractmethod
    def list_runs(self, limit: int = 50) -> list[dict[str, Any]]:
        """Return summary info for recent eval runs, at most ``limit`` of them."""
34
+
35
+
36
class JsonFileBackend(MemoryBackend):
    """Simple file-based backend for local eval runs.

    Reads/writes to ``<results_dir>/<run_id>/`` directories
    (``eval/results/<run_id>/`` by default).
    """

    def __init__(self, results_dir: str = "eval/results") -> None:
        self.results_dir = Path(results_dir)

    def load_eval_report(self, run_id: str) -> EvalReport | None:
        """Load ``<results_dir>/<run_id>/eval-report.json``.

        Returns ``None`` when the file does not exist.
        """
        import json

        path = self.results_dir / run_id / "eval-report.json"
        if not path.exists():
            return None
        data = json.loads(path.read_text(encoding="utf-8"))
        return EvalReport.model_validate(data)

    def load_labels(self, corpus_path: str | None = None) -> Labels:
        """Load ``labels.json`` from the golden corpus directory.

        The directory is ``corpus_path`` when given, otherwise the
        ``golden-corpus`` sibling of the results directory. A missing file
        yields an empty ``Labels()``.
        """
        import json

        base = Path(corpus_path) if corpus_path else self.results_dir.parent / "golden-corpus"
        labels_file = base / "labels.json"
        if not labels_file.exists():
            return Labels()
        data = json.loads(labels_file.read_text(encoding="utf-8"))
        return Labels.model_validate(data)

    def save_eval_result(self, result: EvalResult) -> None:
        """Write ``metrics.json`` and ``summary.md`` into the run directory."""
        import json

        out_dir = self.results_dir / result.run_id
        out_dir.mkdir(parents=True, exist_ok=True)

        metrics_path = out_dir / "metrics.json"
        metrics_path.write_text(
            json.dumps(result.to_metrics_dict(), indent=2, default=str),
            encoding="utf-8",
        )

        summary_path = out_dir / "summary.md"
        summary_path.write_text(_render_summary(result), encoding="utf-8")

    def list_runs(self, limit: int = 50) -> list[dict[str, Any]]:
        """List up to ``limit`` runs, in reverse-sorted directory-name order.

        Only run directories that contain a readable ``metrics.json`` holding
        a JSON object are reported. A non-positive ``limit`` yields an empty
        list.
        """
        import json

        runs: list[dict[str, Any]] = []
        # Check the limit up front: testing it only after an append would let
        # limit=0 return one run.
        if limit <= 0 or not self.results_dir.is_dir():
            return runs

        for run_dir in sorted(self.results_dir.iterdir(), reverse=True):
            metrics_file = run_dir / "metrics.json"
            if not metrics_file.is_file():
                continue
            try:
                data = json.loads(metrics_file.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # Deliberate best-effort: one corrupt or unreadable run must
                # not hide every other run from the listing.
                continue
            if not isinstance(data, dict):
                continue
            runs.append({
                "run_id": run_dir.name,
                "faithfulness": data.get("faithfulness_score"),
                "secret_leak": data.get("secret_leak_test"),
            })
            if len(runs) >= limit:
                break
        return runs
101
+
102
+
103
class PostgresBackend(MemoryBackend):
    """Postgres-backed eval results storage.

    Stores eval results in the agent_memory table alongside other memory records.
    Requires psycopg[binary] >= 3.2.

    Every operation is delegated to ``PostgresEvalBackend``. That class holds
    a connection pool, so call ``close()`` or use the instance as a context
    manager to release it.
    """

    def __init__(self, dsn: str | None = None) -> None:
        # Imported lazily so the Postgres driver is only needed when this
        # backend is actually constructed.
        from dream_eval.backends_pg import PostgresEvalBackend

        self._impl = PostgresEvalBackend(dsn)

    def close(self) -> None:
        """Close the underlying backend and its connection pool."""
        self._impl.close()

    def __enter__(self) -> PostgresBackend:
        return self

    def __exit__(self, *exc: Any) -> None:
        self.close()

    def load_eval_report(self, run_id: str) -> EvalReport | None:
        """Delegate to the Postgres implementation."""
        return self._impl.load_eval_report(run_id)

    def load_labels(self, corpus_path: str | None = None) -> Labels:
        """Delegate to the Postgres implementation."""
        return self._impl.load_labels(corpus_path)

    def save_eval_result(self, result: EvalResult) -> None:
        """Delegate to the Postgres implementation."""
        self._impl.save_eval_result(result)

    def list_runs(self, limit: int = 50) -> list[dict[str, Any]]:
        """Delegate to the Postgres implementation."""
        return self._impl.list_runs(limit)
126
+
127
+
128
def _render_summary(result: EvalResult) -> str:
    """Render eval result as markdown summary.

    The document has a header (run id, mode, date, hard-fail flag), a table
    of deterministic gates, a table of faithfulness metrics and, when
    present, a bullet list of recurrence violations.

    Gate names and messages are escaped before being placed in the table:
    a raw ``|`` or line break inside a cell would otherwise split the row.
    """

    def _cell(value: object) -> str:
        # Collapse line breaks to spaces and escape the column separator.
        return " ".join(str(value).splitlines()).replace("|", "\\|")

    icons = {"pass": "OK", "fail": "FAIL", "warn": "WARN", "skip": "SKIP"}
    gate_lines = [
        f"| {_cell(g.name)} | {icons.get(g.status.value, '?')} | {_cell(g.message)} |"
        for g in result.gates
    ]
    gates_table = "\n".join(gate_lines) if gate_lines else "| (none) | - | - |"

    f = result.faithfulness
    violations = ""
    if f.recurrence_violations:
        violations = "## Recurrence Violations\n" + "".join(
            f"- {v}\n" for v in f.recurrence_violations
        )

    hard_fail = "YES" if result.hard_fail else "no"
    lines = [
        f"# Eval Summary — {result.run_id}",
        "",
        f"**Mode:** {result.mode.value}",
        f"**Date:** {result.date.isoformat()}",
        f"**Hard fail:** {hard_fail}",
        "",
        "## Deterministic Gates",
        "",
        "| Gate | Status | Message |",
        "|------|--------|---------|",
        gates_table,
        "",
        "## Faithfulness",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| Faithfulness | {f.faithfulness_score:.3f} |",
        f"| Precision | {f.precision:.3f} |",
        f"| Recall | {f.recall:.3f} |",
        f"| Recurrence calibration | {f.recurrence_calibration:.3f} |",
        f"| Items proposed | {f.items_proposed} |",
        f"| Fully supported | {f.items_fully_supported} |",
        f"| Partially supported | {f.items_partially_supported} |",
        f"| Unsupported | {f.items_unsupported} |",
        "",
        violations,
        "",  # keeps the trailing newline after the violations section
    ]
    return "\n".join(lines)
dream_eval/backends_pg.py ADDED
@@ -0,0 +1,131 @@
1
+ """Postgres backend for dream-eval — stores eval results alongside agent memory."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any
7
+
8
+ from dream_eval.types import EvalReport, EvalResult, Labels
9
+
10
+
11
class PostgresEvalBackend:
    """Stores eval results in the agent_memory table.

    Uses the existing agent_memory schema — eval results are stored as
    memory_type='decision' with metadata['dream_eval']=True.

    Requires psycopg[binary] >= 3.2 and a running Postgres instance.
    The instance owns a connection pool; call ``close()`` or use it as a
    context manager to release it.
    """

    def __init__(self, dsn: str | None = None) -> None:
        """Open a connection pool for ``dsn``.

        When ``dsn`` is ``None`` the AGENT_MEMORY_DATABASE_URL and then the
        DATABASE_URL environment variables are consulted.

        Raises:
            ValueError: if no DSN could be determined.
        """
        import os

        if dsn is None:
            dsn = os.environ.get("AGENT_MEMORY_DATABASE_URL") or os.environ.get(
                "DATABASE_URL"
            )
        if not dsn:
            raise ValueError(
                "No database DSN configured. Set AGENT_MEMORY_DATABASE_URL or DATABASE_URL."
            )
        from psycopg.rows import dict_row
        from psycopg_pool import ConnectionPool

        self._pool = ConnectionPool(
            dsn, min_size=1, max_size=5, kwargs={"row_factory": dict_row}
        )

    def close(self) -> None:
        """Close the connection pool."""
        self._pool.close()

    def __enter__(self) -> PostgresEvalBackend:
        return self

    def __exit__(self, *exc: Any) -> None:
        self.close()

    @staticmethod
    def _content_dict(content: Any) -> dict[str, Any]:
        """Return the stored ``content`` column as a dict.

        The column may arrive as JSON text or already decoded. Anything that
        does not decode to a JSON object is treated as empty, so callers can
        use ``.get`` safely.
        """
        if isinstance(content, str):
            content = json.loads(content)
        return content if isinstance(content, dict) else {}

    def save_eval_result(self, result: EvalResult) -> None:
        """Persist eval result as a memory record."""
        # default=str mirrors the JSON file backend, so metric values that are
        # not JSON-native are stringified rather than raising TypeError.
        content = json.dumps(result.to_metrics_dict(), default=str)
        metadata = json.dumps({"dream_eval": True, "mode": result.mode.value})
        with self._pool.connection() as conn:
            conn.execute(
                """
                INSERT INTO agent_memory (
                    agent_id, session_id, session_type, memory_type,
                    content, source, metadata
                ) VALUES (%s, %s, %s, %s, %s::jsonb, %s, %s::jsonb)
                """,
                (
                    "dream-eval",
                    result.run_id,
                    "dream_eval",
                    "decision",
                    content,
                    "sdk",
                    metadata,
                ),
            )
            conn.commit()

    def load_eval_report(self, run_id: str) -> EvalReport | None:
        """Load eval report from agent_memory by session_id.

        Only the newest matching row is used. The returned report carries the
        summary counters from the stored metrics; ``items`` is always empty.
        Returns ``None`` when no row matches.
        """
        with self._pool.connection() as conn:
            row = conn.execute(
                """
                SELECT content FROM agent_memory
                WHERE session_id = %s AND metadata ? 'dream_eval'
                ORDER BY created_at DESC LIMIT 1
                """,
                (run_id,),
            ).fetchone()
        if not row:
            return None
        content = self._content_dict(row["content"])
        return EvalReport(
            items=[],
            sessions_evaluated=content.get("sessions_evaluated", 0),
            token_cost=content.get("token_cost", 0),
            latency=content.get("latency", 0),
        )

    def load_labels(self, corpus_path: str | None = None) -> Labels:
        """Load labels from a JSON file. Falls back to empty labels."""
        from pathlib import Path

        if corpus_path is None:
            return Labels()

        labels_file = Path(corpus_path) / "labels.json"
        if not labels_file.exists():
            return Labels()

        data = json.loads(labels_file.read_text(encoding="utf-8"))
        return Labels.model_validate(data)

    def list_runs(self, limit: int = 50) -> list[dict[str, Any]]:
        """List recent eval runs from agent_memory, newest first.

        A negative ``limit`` is treated as zero, because Postgres rejects a
        negative LIMIT.
        """
        with self._pool.connection() as conn:
            rows = conn.execute(
                """
                SELECT session_id AS run_id, content, created_at
                FROM agent_memory
                WHERE metadata ? 'dream_eval'
                ORDER BY created_at DESC
                LIMIT %s
                """,
                (max(limit, 0),),
            ).fetchall()

        runs: list[dict[str, Any]] = []
        for row in rows:
            content = self._content_dict(row["content"])
            created_at = row["created_at"]
            runs.append({
                "run_id": row["run_id"],
                "faithfulness": content.get("faithfulness_score"),
                "secret_leak": content.get("secret_leak_test"),
                "date": created_at.isoformat() if created_at else None,
            })
        return runs
dream_eval/cli.py ADDED
@@ -0,0 +1,158 @@
1
+ #!/usr/bin/env python3
2
+ """CLI for dream-eval — run evaluations, check gates, score faithfulness."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import sys
9
+ from datetime import datetime
10
+
11
+ from dream_eval.types import EvalMode, EvalResult
12
+
13
+
14
def main(argv: list[str] | None = None) -> None:
    """Parse the command line and dispatch to the chosen sub-command.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing callers and the console
            entry point are unaffected; passing a list allows programmatic
            invocation.
    """
    parser = argparse.ArgumentParser(
        description="dream-eval — agent memory faithfulness evaluation"
    )
    sub = parser.add_subparsers(dest="command", required=True)

    p_run = sub.add_parser("run", help="Run a full eval pipeline")
    p_run.add_argument("--corpus", default=None, help="Path to golden corpus directory")
    p_run.add_argument("--mode", choices=["golden", "live"], default="golden")
    p_run.add_argument("--output-dir", default=None, help="Output directory for results")

    p_gates = sub.add_parser("gates", help="Run only deterministic gates")
    p_gates.add_argument("--text", default=None, help="Text to check for secret leaks")
    p_gates.add_argument("--file", default=None, help="File to check hash determinism")
    p_gates.add_argument("--hash", default=None, help="Expected hash (sha256:hex)")

    p_score = sub.add_parser("score", help="Score an eval report against labels")
    p_score.add_argument("--report", required=True, help="Path to eval-report.json")
    p_score.add_argument("--labels", default=None, help="Path to labels.json")

    p_list = sub.add_parser("list", help="List recent eval runs")
    p_list.add_argument("--limit", type=int, default=10)
    p_list.add_argument("--backend", choices=["json", "postgres"], default="json")
    p_list.add_argument("--dsn", default=None, help="Postgres DSN (for postgres backend)")

    p_show = sub.add_parser("show", help="Show a specific eval run result")
    p_show.add_argument("run_id")
    p_show.add_argument("--backend", choices=["json", "postgres"], default="json")
    # Same option as "list", so both postgres-capable commands accept a DSN.
    p_show.add_argument("--dsn", default=None, help="Postgres DSN (for postgres backend)")

    args = parser.parse_args(argv)

    # The sub-parser is required, so args.command is always one of these keys.
    handlers = {
        "gates": _run_gates,
        "run": _run_eval,
        "score": _run_score,
        "list": _run_list,
        "show": _run_show,
    }
    handlers[args.command](args)
55
+
56
+
57
def _run_gates(args: argparse.Namespace) -> None:
    """Run the deterministic gates selected on the command line.

    ``--text`` triggers the secret-leak check and ``--file`` the
    hash-determinism check (compared against ``--hash`` when given). Each
    gate result is printed as one JSON line. The process exits with status 1
    when no check was requested or when any gate reports ``fail``.
    """
    from dream_eval.gates import check_hash_determinism, check_secret_leak

    results = []

    if args.text:
        results.append(check_secret_leak(args.text))

    if args.file:
        # Read inside a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(args.file, encoding="utf-8") as handle:
            file_content = handle.read()
        results.append(check_hash_determinism(file_content, args.hash))

    if not results:
        print("No checks specified. Use --text or --file.", file=sys.stderr)
        sys.exit(1)

    for r in results:
        print(json.dumps(r.model_dump(mode="json"), default=str))

    if any(r.status.value == "fail" for r in results):
        sys.exit(1)
81
+
82
+
83
def _run_eval(args: argparse.Namespace) -> None:
    """Create a new eval run and write its ``metrics.json``.

    The run directory is ``<output-dir>/<run_id>`` (``eval/results`` when no
    output directory is given). A JSON line with the run id, output path and
    hard-fail flag is printed on completion.
    """
    from datetime import timezone
    from pathlib import Path

    # Take one UTC timestamp for both fields: the run id ends in a literal
    # "Z" (the UTC designator), and two separate now() calls could produce an
    # id and a date that disagree.
    started = datetime.now(timezone.utc)
    result = EvalResult(
        run_id=started.strftime("%Y-%m-%dT%H-%M-%SZ"),
        date=started,
        mode=EvalMode(args.mode),
    )

    output_dir = args.output_dir or "eval/results"
    out_path = Path(output_dir) / result.run_id
    out_path.mkdir(parents=True, exist_ok=True)

    metrics_path = out_path / "metrics.json"
    metrics_path.write_text(
        json.dumps(result.to_metrics_dict(), indent=2, default=str),
        encoding="utf-8",
    )

    print(json.dumps({
        "run_id": result.run_id,
        "output": str(out_path),
        "hard_fail": result.hard_fail,
    }, default=str))
107
+
108
+
109
def _run_score(args: argparse.Namespace) -> None:
    """Score an eval report against golden labels and print the result as JSON.

    With ``--labels`` the given file is used and must exist. Without it, two
    default locations are tried relative to the report file:

    1. ``<report dir>/../golden-corpus/labels.json`` (the original CLI default)
    2. ``<report dir>/../../golden-corpus/labels.json`` (where the JSON file
       backend looks: next to the results directory)

    If neither exists, empty labels are used and a warning goes to stderr,
    matching the file backend's fallback instead of raising.
    """
    from pathlib import Path

    from dream_eval.scoring import compute_faithfulness
    from dream_eval.types import EvalReport, Labels

    report_path = Path(args.report)
    report_data = json.loads(report_path.read_text(encoding="utf-8"))
    report = EvalReport.model_validate(report_data)

    if args.labels:
        labels_file = Path(args.labels)
    else:
        run_dir = report_path.parent
        candidates = (
            run_dir.parent / "golden-corpus" / "labels.json",
            run_dir.parent.parent / "golden-corpus" / "labels.json",
        )
        labels_file = next((c for c in candidates if c.is_file()), None)

    if labels_file is None:
        print(
            "No labels.json found in default locations; scoring against empty labels.",
            file=sys.stderr,
        )
        labels = Labels()
    else:
        labels_data = json.loads(labels_file.read_text(encoding="utf-8"))
        labels = Labels.model_validate(labels_data)

    faithfulness = compute_faithfulness(report.items, labels.items)
    print(json.dumps(faithfulness.model_dump(mode="json"), default=str))
126
+
127
+
128
def _run_list(args: argparse.Namespace) -> None:
    """Print one JSON line per recent eval run from the selected backend."""
    from dream_eval.backends import JsonFileBackend, PostgresBackend

    wants_postgres = args.backend == "postgres"
    backend = PostgresBackend(dsn=args.dsn) if wants_postgres else JsonFileBackend()

    for entry in backend.list_runs(limit=args.limit):
        print(json.dumps(entry, default=str))
139
+
140
+
141
def _run_show(args: argparse.Namespace) -> None:
    """Print the stored eval report for ``args.run_id`` as JSON.

    Exits with status 1 and a message on stderr when the backend has no
    report for that run id.
    """
    from dream_eval.backends import JsonFileBackend, PostgresBackend

    if args.backend == "postgres":
        # Forward a DSN when the parsed arguments carry one; fall back to the
        # backend's own environment lookup otherwise.
        backend = PostgresBackend(dsn=getattr(args, "dsn", None))
    else:
        backend = JsonFileBackend()

    report = backend.load_eval_report(args.run_id)
    # Compare against None explicitly: "not found" is signalled by None, and a
    # truthiness test could misreport a loaded report that evaluates falsy.
    if report is None:
        print(f"Run {args.run_id} not found", file=sys.stderr)
        sys.exit(1)

    print(json.dumps(report.model_dump(mode="json"), default=str))
155
+
156
+
157
+ if __name__ == "__main__":
158
+ main()
dream_eval/gates.py ADDED
@@ -0,0 +1,94 @@
1
+ """Deterministic gates — hard stops that must pass before scoring."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import re
7
+
8
+ from dream_eval.types import GateResult, GateStatus
9
+
10
+
11
def check_secret_leak(
    output_text: str,
    forbidden_patterns: list[str] | None = None,
) -> GateResult:
    """Scan evaluator output for forbidden (secret) patterns.

    Each pattern is tried as a case-insensitive regular expression; a pattern
    that is not a valid regex is matched as a case-insensitive substring
    instead. The gate is skipped when no patterns are supplied, fails when at
    least one pattern matches (the matching patterns are listed in
    ``details``), and passes otherwise.
    """
    gate_name = "secret_leak"

    if not forbidden_patterns:
        return GateResult(
            name=gate_name,
            status=GateStatus.SKIP,
            message="No forbidden patterns configured",
        )

    def _is_leaked(candidate: str) -> bool:
        try:
            return re.search(candidate, output_text, re.IGNORECASE) is not None
        except re.error:
            # Not a compilable regex: treat it as a literal string.
            return candidate.lower() in output_text.lower()

    leaked = [candidate for candidate in forbidden_patterns if _is_leaked(candidate)]

    if not leaked:
        return GateResult(
            name=gate_name,
            status=GateStatus.PASS,
            message="No forbidden patterns found in output",
        )

    return GateResult(
        name=gate_name,
        status=GateStatus.FAIL,
        message=f"Secret leak detected: {len(leaked)} forbidden pattern(s) matched",
        details={"matched_patterns": leaked},
    )
48
+
49
+
50
def _canonical_hash(content: str) -> str:
    """Return ``"sha256:<hex>"`` for ``content`` after canonicalization.

    One leading BOM character is dropped and both CRLF and lone CR line
    endings become LF before the UTF-8 bytes are hashed.
    """
    bom = "\ufeff"
    normalized = content[len(bom):] if content.startswith(bom) else content
    # Order matters: CRLF must be collapsed before lone CRs are handled.
    for line_ending in ("\r\n", "\r"):
        normalized = normalized.replace(line_ending, "\n")
    hasher = hashlib.sha256()
    hasher.update(normalized.encode("utf-8"))
    return "sha256:" + hasher.hexdigest()
58
+
59
+
60
def check_hash_determinism(
    content: str,
    expected_hash: str | None = None,
) -> GateResult:
    """Compute the canonical hash of ``content`` and optionally compare it.

    The hash comes from ``_canonical_hash``. Without ``expected_hash`` the
    gate passes and simply reports the computed hash. With it, the gate
    passes on an exact string match and fails otherwise, reporting both
    values in ``details``.
    """
    gate_name = "hash_determinism"
    computed = _canonical_hash(content)

    if expected_hash is not None and computed != expected_hash:
        return GateResult(
            name=gate_name,
            status=GateStatus.FAIL,
            message=f"Hash mismatch: expected {expected_hash}, got {computed}",
            details={"expected": expected_hash, "actual": computed},
        )

    # Both remaining outcomes pass; only the message differs.
    if expected_hash is None:
        message = f"Hash computed: {computed}"
    else:
        message = "Hash matches expected value"

    return GateResult(
        name=gate_name,
        status=GateStatus.PASS,
        message=message,
        details={"hash": computed},
    )
dream_eval/mcp/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """MCP server for dream-eval — exposes eval tools to any MCP-compatible agent."""
2
+
3
+ from dream_eval.mcp.server import create_server
4
+
5
+ __all__ = ["create_server"]