dream-eval 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dream_eval-0.2.0/.claude-plugin/SKILL.md +88 -0
- dream_eval-0.2.0/.claude-plugin/plugin.json +32 -0
- dream_eval-0.2.0/.codex-plugin/SKILL.md +50 -0
- dream_eval-0.2.0/.codex-plugin/plugin.json +32 -0
- dream_eval-0.2.0/.cursor-plugin/SKILL.md +57 -0
- dream_eval-0.2.0/.cursor-plugin/plugin.json +32 -0
- dream_eval-0.2.0/.gitignore +9 -0
- dream_eval-0.2.0/CHANGELOG.md +34 -0
- dream_eval-0.2.0/PKG-INFO +132 -0
- dream_eval-0.2.0/README.md +99 -0
- dream_eval-0.2.0/examples/quick_start.py +87 -0
- dream_eval-0.2.0/py.typed +0 -0
- dream_eval-0.2.0/pyproject.toml +67 -0
- dream_eval-0.2.0/src/dream_eval/__init__.py +18 -0
- dream_eval-0.2.0/src/dream_eval/backends.py +172 -0
- dream_eval-0.2.0/src/dream_eval/backends_pg.py +131 -0
- dream_eval-0.2.0/src/dream_eval/cli.py +158 -0
- dream_eval-0.2.0/src/dream_eval/gates.py +94 -0
- dream_eval-0.2.0/src/dream_eval/mcp/__init__.py +5 -0
- dream_eval-0.2.0/src/dream_eval/mcp/server.py +207 -0
- dream_eval-0.2.0/src/dream_eval/nli.py +96 -0
- dream_eval-0.2.0/src/dream_eval/py.typed +0 -0
- dream_eval-0.2.0/src/dream_eval/scoring.py +225 -0
- dream_eval-0.2.0/src/dream_eval/types.py +153 -0
- dream_eval-0.2.0/tests/__init__.py +0 -0
- dream_eval-0.2.0/tests/test_gates.py +58 -0
- dream_eval-0.2.0/tests/test_scoring.py +195 -0
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: dream-eval
|
|
3
|
+
description: Agent-agnostic faithfulness evaluation for agent memory — scoring, deterministic gates, and metrics. Works with any memory backend (Postgres, LanceDB, knowledge graphs).
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# dream-eval
|
|
7
|
+
|
|
8
|
+
Faithfulness evaluation framework for agent memory quality scoring.
|
|
9
|
+
|
|
10
|
+
## Usage
|
|
11
|
+
|
|
12
|
+
### Python API
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
from dream_eval.scoring import compute_faithfulness
|
|
16
|
+
from dream_eval.types import ProposedItem, LabeledItem
|
|
17
|
+
|
|
18
|
+
proposed = [ProposedItem(id="pref-1", category="pref", content={"key": "ci-merge-gate"})]
|
|
19
|
+
labels = [LabeledItem(id="pref-1", category="pref", content={"key": "ci-merge-gate"})]
|
|
20
|
+
|
|
21
|
+
report = compute_faithfulness(proposed, labels)
|
|
22
|
+
print(report.faithfulness_score) # 1.0
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
### CLI
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
# Score an eval report
|
|
29
|
+
dream-eval score --report eval/results/run-1/eval-report.json
|
|
30
|
+
|
|
31
|
+
# Check for secret leaks
|
|
32
|
+
dream-eval gates --text "output to check"
|
|
33
|
+
|
|
34
|
+
# Check hash determinism
|
|
35
|
+
dream-eval gates --file input.txt
|
|
36
|
+
|
|
37
|
+
# List recent runs
|
|
38
|
+
dream-eval list --limit 10
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Deterministic Gates
|
|
42
|
+
|
|
43
|
+
These are **hard stops** — if either fails, the eval must stop:
|
|
44
|
+
|
|
45
|
+
| Gate | What it checks |
|
|
46
|
+
|------|----------------|
|
|
47
|
+
| `secret_leak` | Forbidden patterns (API keys, DSNs) in evaluator output |
|
|
48
|
+
| `hash_determinism` | BOM/CRLF normalization produces consistent SHA-256 hashes |
|
|
49
|
+
|
|
50
|
+
## Installation
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install dream-eval
|
|
54
|
+
|
|
55
|
+
# With Postgres backend
|
|
56
|
+
pip install "dream-eval[postgres]"
|
|
57
|
+
|
|
58
|
+
# With MCP server
|
|
59
|
+
pip install "dream-eval[mcp]"
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Architecture
|
|
63
|
+
|
|
64
|
+
dream-eval is **backend-agnostic**. The `MemoryBackend` abstract class defines the interface:
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
class MemoryBackend(ABC):
|
|
68
|
+
def load_eval_report(self, run_id: str) -> EvalReport | None: ...
|
|
69
|
+
def load_labels(self, corpus_path: str | None = None) -> Labels: ...
|
|
70
|
+
def save_eval_result(self, result: EvalResult) -> None: ...
|
|
71
|
+
def list_runs(self, limit: int = 50) -> list[dict[str, Any]]: ...
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Built-in backends:
|
|
75
|
+
- `JsonFileBackend` — reads/writes to `eval/results/<run_id>/`
|
|
76
|
+
- `PostgresBackend` — stores in `agent_memory` table alongside other memory records
|
|
77
|
+
|
|
78
|
+
## MCP Server
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
# Run as MCP server
|
|
82
|
+
dream-eval-mcp
|
|
83
|
+
|
|
84
|
+
# Or via uvx
|
|
85
|
+
uvx --from "dream-eval[mcp]" dream-eval-mcp
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Exposes tools: `dream_score`, `dream_check_secrets`, `dream_check_hash`, `dream_metrics_schema`.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "dream-eval",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Agent-agnostic faithfulness evaluation for agent memory — scoring, deterministic gates, and metrics.",
|
|
5
|
+
"author": "OnlineChefGroep",
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"skills": [
|
|
8
|
+
{
|
|
9
|
+
"name": "dream-eval",
|
|
10
|
+
"description": "Faithfulness evaluation pipeline for agent memory quality scoring",
|
|
11
|
+
"trigger": "Apply the dream-eval skill to score agent memory faithfulness",
|
|
12
|
+
"entry_point": "SKILL.md"
|
|
13
|
+
}
|
|
14
|
+
],
|
|
15
|
+
"tools": [
|
|
16
|
+
{
|
|
17
|
+
"name": "dream_score",
|
|
18
|
+
"description": "Score evaluator output against golden corpus labels",
|
|
19
|
+
"command": "uvx dream-eval score"
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"name": "dream_check_secrets",
|
|
23
|
+
"description": "Check text for leaked secrets",
|
|
24
|
+
"command": "uvx dream-eval gates --text"
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"name": "dream_check_hash",
|
|
28
|
+
"description": "Verify content produces a deterministic hash",
|
|
29
|
+
"command": "uvx dream-eval gates --file"
|
|
30
|
+
}
|
|
31
|
+
]
|
|
32
|
+
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: dream-eval
|
|
3
|
+
description: Agent-agnostic faithfulness evaluation for agent memory — scoring, deterministic gates, and metrics. Works with any memory backend.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# dream-eval (Codex)
|
|
7
|
+
|
|
8
|
+
Faithfulness evaluation framework for agent memory quality scoring.
|
|
9
|
+
|
|
10
|
+
## Trigger
|
|
11
|
+
|
|
12
|
+
```
|
|
13
|
+
Apply the dream-eval skill to score agent memory faithfulness.
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Usage
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
# Score an eval report
|
|
20
|
+
uvx dream-eval score --report eval/results/run-1/eval-report.json
|
|
21
|
+
|
|
22
|
+
# Check for secret leaks
|
|
23
|
+
uvx dream-eval gates --text "output to check"
|
|
24
|
+
|
|
25
|
+
# Check hash determinism
|
|
26
|
+
uvx dream-eval gates --file input.txt
|
|
27
|
+
|
|
28
|
+
# List recent runs
|
|
29
|
+
uvx dream-eval list --limit 10
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Python API
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from dream_eval.scoring import compute_faithfulness
|
|
36
|
+
from dream_eval.types import ProposedItem, LabeledItem
|
|
37
|
+
|
|
38
|
+
proposed = [ProposedItem(id="pref-1", category="pref", content={"key": "ci-merge-gate"})]
|
|
39
|
+
labels = [LabeledItem(id="pref-1", category="pref", content={"key": "ci-merge-gate"})]
|
|
40
|
+
|
|
41
|
+
report = compute_faithfulness(proposed, labels)
|
|
42
|
+
print(report.faithfulness_score) # 1.0
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Deterministic Gates
|
|
46
|
+
|
|
47
|
+
| Gate | What it checks |
|
|
48
|
+
|------|----------------|
|
|
49
|
+
| `secret_leak` | Forbidden patterns (API keys, DSNs) in evaluator output |
|
|
50
|
+
| `hash_determinism` | BOM/CRLF normalization produces consistent SHA-256 hashes |
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "dream-eval",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Agent-agnostic faithfulness evaluation for agent memory — scoring, deterministic gates, and metrics.",
|
|
5
|
+
"author": "OnlineChefGroep",
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"skills": [
|
|
8
|
+
{
|
|
9
|
+
"name": "dream-eval",
|
|
10
|
+
"description": "Faithfulness evaluation pipeline for agent memory quality scoring",
|
|
11
|
+
"trigger": "Apply the dream-eval skill to score agent memory faithfulness",
|
|
12
|
+
"entry_point": "SKILL.md"
|
|
13
|
+
}
|
|
14
|
+
],
|
|
15
|
+
"tools": [
|
|
16
|
+
{
|
|
17
|
+
"name": "dream_score",
|
|
18
|
+
"description": "Score evaluator output against golden corpus labels",
|
|
19
|
+
"command": "uvx dream-eval score"
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"name": "dream_check_secrets",
|
|
23
|
+
"description": "Check text for leaked secrets",
|
|
24
|
+
"command": "uvx dream-eval gates --text"
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"name": "dream_check_hash",
|
|
28
|
+
"description": "Verify content produces a deterministic hash",
|
|
29
|
+
"command": "uvx dream-eval gates --file"
|
|
30
|
+
}
|
|
31
|
+
]
|
|
32
|
+
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: dream-eval
|
|
3
|
+
description: Agent-agnostic faithfulness evaluation for agent memory — scoring, deterministic gates, and metrics. Works with any memory backend.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# dream-eval (Cursor)
|
|
8
|
+
|
|
9
|
+
Faithfulness evaluation framework for agent memory quality scoring.
|
|
10
|
+
|
|
11
|
+
## Trigger
|
|
12
|
+
|
|
13
|
+
`/dream-eval` or "Apply the dream-eval skill to score agent memory faithfulness."
|
|
14
|
+
|
|
15
|
+
## Usage
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
# Score an eval report
|
|
19
|
+
uvx dream-eval score --report eval/results/run-1/eval-report.json
|
|
20
|
+
|
|
21
|
+
# Check for secret leaks
|
|
22
|
+
uvx dream-eval gates --text "output to check"
|
|
23
|
+
|
|
24
|
+
# Check hash determinism
|
|
25
|
+
uvx dream-eval gates --file input.txt
|
|
26
|
+
|
|
27
|
+
# List recent runs
|
|
28
|
+
uvx dream-eval list --limit 10
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Python API
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
from dream_eval.scoring import compute_faithfulness
|
|
35
|
+
from dream_eval.types import ProposedItem, LabeledItem
|
|
36
|
+
|
|
37
|
+
proposed = [ProposedItem(id="pref-1", category="pref", content={"key": "ci-merge-gate"})]
|
|
38
|
+
labels = [LabeledItem(id="pref-1", category="pref", content={"key": "ci-merge-gate"})]
|
|
39
|
+
|
|
40
|
+
report = compute_faithfulness(proposed, labels)
|
|
41
|
+
print(report.faithfulness_score) # 1.0
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Deterministic Gates
|
|
45
|
+
|
|
46
|
+
| Gate | What it checks |
|
|
47
|
+
|------|----------------|
|
|
48
|
+
| `secret_leak` | Forbidden patterns (API keys, DSNs) in evaluator output |
|
|
49
|
+
| `hash_determinism` | BOM/CRLF normalization produces consistent SHA-256 hashes |
|
|
50
|
+
|
|
51
|
+
## MCP Server
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
dream-eval-mcp
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Exposes tools: `dream_score`, `dream_check_secrets`, `dream_check_hash`, `dream_metrics_schema`.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "dream-eval",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Agent-agnostic faithfulness evaluation for agent memory — scoring, deterministic gates, and metrics.",
|
|
5
|
+
"author": "OnlineChefGroep",
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"skills": [
|
|
8
|
+
{
|
|
9
|
+
"name": "dream-eval",
|
|
10
|
+
"description": "Faithfulness evaluation pipeline for agent memory quality scoring",
|
|
11
|
+
"trigger": "Apply the dream-eval skill to score agent memory faithfulness",
|
|
12
|
+
"entry_point": "SKILL.md"
|
|
13
|
+
}
|
|
14
|
+
],
|
|
15
|
+
"tools": [
|
|
16
|
+
{
|
|
17
|
+
"name": "dream_score",
|
|
18
|
+
"description": "Score evaluator output against golden corpus labels",
|
|
19
|
+
"command": "uvx dream-eval score"
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"name": "dream_check_secrets",
|
|
23
|
+
"description": "Check text for leaked secrets",
|
|
24
|
+
"command": "uvx dream-eval gates --text"
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"name": "dream_check_hash",
|
|
28
|
+
"description": "Verify content produces a deterministic hash",
|
|
29
|
+
"command": "uvx dream-eval gates --file"
|
|
30
|
+
}
|
|
31
|
+
]
|
|
32
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.2.0 (2026-06-26)
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- **NLI claim verification** via Vectara HHEM-2.1-Open (`dream-eval[nli]`)
|
|
7
|
+
- `verify_claim()` for single claim verification
|
|
8
|
+
- `verify_content_nli()` for batch content matching
|
|
9
|
+
- Automatic fallback to fuzzy matching if NLI extra not installed
|
|
10
|
+
- **Fuzzy content matching** with `fuzzy=True` parameter
|
|
11
|
+
- Uses difflib SequenceMatcher (no LLM dependency)
|
|
12
|
+
- Configurable threshold (default 0.85)
|
|
13
|
+
- **Async parallel scoring** via `score_transcripts_parallel()`
|
|
14
|
+
- ThreadPoolExecutor-based parallelism
|
|
15
|
+
- Preserves input order
|
|
16
|
+
- **Hypothesis property-based tests** for scoring invariants
|
|
17
|
+
- Deterministic regression tests with `@example` decorator
|
|
18
|
+
- Bounds checking for all scoring functions
|
|
19
|
+
- **MCP server** for integration with Claude, Copilot, and other MCP clients (`dream-eval-mcp`)
|
|
20
|
+
- **Agent Skills plugins** (`.claude-plugin`, `.codex-plugin`, `.cursor-plugin`)
|
|
21
|
+
- **Materialized views** for dashboard metrics (schema_v3.sql)
|
|
22
|
+
|
|
23
|
+
### Changed
|
|
24
|
+
- Pool tuning: `min_size=2, max_size=20, max_waiting=100`
|
|
25
|
+
- Faithfulness baseline target: 0.63 → 0.75
|
|
26
|
+
|
|
27
|
+
## 0.1.0 (2026-06-26)
|
|
28
|
+
|
|
29
|
+
### Added
|
|
30
|
+
- Initial release
|
|
31
|
+
- Scoring algorithms: precision, recall, faithfulness, recurrence calibration
|
|
32
|
+
- Deterministic gates: secret_leak, hash_determinism
|
|
33
|
+
- Backends: JsonFileBackend, PostgresBackend
|
|
34
|
+
- CLI: `dream-eval run/gates/score/list/show`
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dream-eval
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Agent-agnostic faithfulness evaluation framework — evaluator → judge → curator pipeline for agent memory quality scoring
|
|
5
|
+
Author: OnlineChefGroep
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: agent-eval,faithfulness,llm-eval,memory-evaluation,quality-gate
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Classifier: Typing :: Typed
|
|
18
|
+
Requires-Python: <3.15,>=3.11
|
|
19
|
+
Requires-Dist: pydantic>=2.13
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: hypothesis>=6.100; extra == 'dev'
|
|
22
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
23
|
+
Requires-Dist: ruff>=0.15; extra == 'dev'
|
|
24
|
+
Provides-Extra: mcp
|
|
25
|
+
Requires-Dist: mcp>=1.0; extra == 'mcp'
|
|
26
|
+
Provides-Extra: nli
|
|
27
|
+
Requires-Dist: torch>=2.0; extra == 'nli'
|
|
28
|
+
Requires-Dist: transformers>=4.35; extra == 'nli'
|
|
29
|
+
Provides-Extra: postgres
|
|
30
|
+
Requires-Dist: psycopg-pool>=3.1; extra == 'postgres'
|
|
31
|
+
Requires-Dist: psycopg[binary]>=3.2; extra == 'postgres'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# dream-eval
|
|
35
|
+
|
|
36
|
+
Agent-agnostic faithfulness evaluation framework for agent memory quality scoring.
|
|
37
|
+
|
|
38
|
+
## What it does
|
|
39
|
+
|
|
40
|
+
dream-eval implements the **evaluator → judge → curator** pipeline pattern:
|
|
41
|
+
|
|
42
|
+
- **Evaluator** reads transcripts + soul (interpretive lens), proposes items
|
|
43
|
+
- **Judge** scores against labels WITHOUT reading soul (enforcing objectivity)
|
|
44
|
+
- **Curator** writes results (enforcing separation of concerns)
|
|
45
|
+
|
|
46
|
+
This pattern is unique in the agent memory space — no competitor (mem0, Cognee, LangMem) offers automated faithfulness evaluation.
|
|
47
|
+
|
|
48
|
+
## Install
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install dream-eval
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Quick start
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from dream_eval import compute_faithfulness
|
|
58
|
+
from dream_eval.types import ProposedItem, LabeledItem
|
|
59
|
+
|
|
60
|
+
proposed = [
|
|
61
|
+
ProposedItem(id="pref-1", category="pref", content={"key": "dark_mode"}),
|
|
62
|
+
ProposedItem(id="workflow-1", category="workflow", content={"key": "ci_merge"}),
|
|
63
|
+
]
|
|
64
|
+
labels = [
|
|
65
|
+
LabeledItem(id="pref-1", category="pref"),
|
|
66
|
+
LabeledItem(id="workflow-1", category="workflow"),
|
|
67
|
+
]
|
|
68
|
+
|
|
69
|
+
report = compute_faithfulness(proposed, labels)
|
|
70
|
+
print(f"Faithfulness: {report.faithfulness_score}")
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## CLI
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# Score evaluator report against labels
|
|
77
|
+
dream-eval score --report report.json --labels labels.json
|
|
78
|
+
|
|
79
|
+
# Run deterministic gates
|
|
80
|
+
dream-eval gates --file evaluator_output.txt
|
|
81
|
+
|
|
82
|
+
# Export to metrics.json format
|
|
83
|
+
dream-eval export --input eval_result.json --output metrics.json
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Deterministic gates
|
|
87
|
+
|
|
88
|
+
These fail the eval regardless of LLM scores:
|
|
89
|
+
|
|
90
|
+
- **secret_leak** — checks for forbidden patterns (API keys, tokens, passwords)
|
|
91
|
+
- **hash_determinism** — verifies BOM/CRLF normalization produces stable hashes
|
|
92
|
+
|
|
93
|
+
## Memory backend adapter
|
|
94
|
+
|
|
95
|
+
dream-eval works with any memory backend via `BaseMemoryBackend`:
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from dream_eval.adapter import BaseMemoryBackend
|
|
99
|
+
|
|
100
|
+
class MyBackend(BaseMemoryBackend):
|
|
101
|
+
def read_transcripts(self, corpus_path=None):
|
|
102
|
+
# Read from your storage
|
|
103
|
+
...
|
|
104
|
+
|
|
105
|
+
def read_labels(self, labels_path=None):
|
|
106
|
+
# Read ground truth labels
|
|
107
|
+
...
|
|
108
|
+
|
|
109
|
+
def write_eval_result(self, result):
|
|
110
|
+
# Write evaluation results
|
|
111
|
+
...
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Built-in `DictMemoryBackend` for testing.
|
|
115
|
+
|
|
116
|
+
## Architecture
|
|
117
|
+
|
|
118
|
+
```
|
|
119
|
+
dream-eval/
|
|
120
|
+
├── src/dream_eval/
|
|
121
|
+
│ ├── __init__.py # Package exports
|
|
122
|
+
│ ├── types.py # Pydantic models (EvalResult, FaithfulnessReport, etc.)
|
|
123
|
+
│ ├── scoring.py # Faithfulness, precision, recall algorithms
|
|
124
|
+
│ ├── gates.py # Deterministic gates (secret_leak, hash_determinism)
|
|
125
|
+
│ ├── backends.py # Abstract MemoryBackend + JsonFileBackend
|
|
126
|
+
│ └── cli.py # CLI entry point
|
|
127
|
+
└── tests/ # Test suite
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## License
|
|
131
|
+
|
|
132
|
+
MIT
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# dream-eval
|
|
2
|
+
|
|
3
|
+
Agent-agnostic faithfulness evaluation framework for agent memory quality scoring.
|
|
4
|
+
|
|
5
|
+
## What it does
|
|
6
|
+
|
|
7
|
+
dream-eval implements the **evaluator → judge → curator** pipeline pattern:
|
|
8
|
+
|
|
9
|
+
- **Evaluator** reads transcripts + soul (interpretive lens), proposes items
|
|
10
|
+
- **Judge** scores against labels WITHOUT reading soul (enforcing objectivity)
|
|
11
|
+
- **Curator** writes results (enforcing separation of concerns)
|
|
12
|
+
|
|
13
|
+
This pattern is unique in the agent memory space — no competitor (mem0, Cognee, LangMem) offers automated faithfulness evaluation.
|
|
14
|
+
|
|
15
|
+
## Install
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install dream-eval
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Quick start
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from dream_eval import compute_faithfulness
|
|
25
|
+
from dream_eval.types import ProposedItem, LabeledItem
|
|
26
|
+
|
|
27
|
+
proposed = [
|
|
28
|
+
ProposedItem(id="pref-1", category="pref", content={"key": "dark_mode"}),
|
|
29
|
+
ProposedItem(id="workflow-1", category="workflow", content={"key": "ci_merge"}),
|
|
30
|
+
]
|
|
31
|
+
labels = [
|
|
32
|
+
LabeledItem(id="pref-1", category="pref"),
|
|
33
|
+
LabeledItem(id="workflow-1", category="workflow"),
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
report = compute_faithfulness(proposed, labels)
|
|
37
|
+
print(f"Faithfulness: {report.faithfulness_score}")
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## CLI
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# Score evaluator report against labels
|
|
44
|
+
dream-eval score --report report.json --labels labels.json
|
|
45
|
+
|
|
46
|
+
# Run deterministic gates
|
|
47
|
+
dream-eval gates --file evaluator_output.txt
|
|
48
|
+
|
|
49
|
+
# Export to metrics.json format
|
|
50
|
+
dream-eval export --input eval_result.json --output metrics.json
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Deterministic gates
|
|
54
|
+
|
|
55
|
+
These fail the eval regardless of LLM scores:
|
|
56
|
+
|
|
57
|
+
- **secret_leak** — checks for forbidden patterns (API keys, tokens, passwords)
|
|
58
|
+
- **hash_determinism** — verifies BOM/CRLF normalization produces stable hashes
|
|
59
|
+
|
|
60
|
+
## Memory backend adapter
|
|
61
|
+
|
|
62
|
+
dream-eval works with any memory backend via `BaseMemoryBackend`:
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from dream_eval.adapter import BaseMemoryBackend
|
|
66
|
+
|
|
67
|
+
class MyBackend(BaseMemoryBackend):
|
|
68
|
+
def read_transcripts(self, corpus_path=None):
|
|
69
|
+
# Read from your storage
|
|
70
|
+
...
|
|
71
|
+
|
|
72
|
+
def read_labels(self, labels_path=None):
|
|
73
|
+
# Read ground truth labels
|
|
74
|
+
...
|
|
75
|
+
|
|
76
|
+
def write_eval_result(self, result):
|
|
77
|
+
# Write evaluation results
|
|
78
|
+
...
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Built-in `DictMemoryBackend` for testing.
|
|
82
|
+
|
|
83
|
+
## Architecture
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
dream-eval/
|
|
87
|
+
├── src/dream_eval/
|
|
88
|
+
│ ├── __init__.py # Package exports
|
|
89
|
+
│ ├── types.py # Pydantic models (EvalResult, FaithfulnessReport, etc.)
|
|
90
|
+
│ ├── scoring.py # Faithfulness, precision, recall algorithms
|
|
91
|
+
│ ├── gates.py # Deterministic gates (secret_leak, hash_determinism)
|
|
92
|
+
│ ├── backends.py # Abstract MemoryBackend + JsonFileBackend
|
|
93
|
+
│ └── cli.py # CLI entry point
|
|
94
|
+
└── tests/ # Test suite
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## License
|
|
98
|
+
|
|
99
|
+
MIT
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Quick start example for dream-eval."""
|
|
3
|
+
|
|
4
|
+
from dream_eval.scoring import compute_faithfulness
|
|
5
|
+
from dream_eval.gates import check_secret_leak, check_hash_determinism
|
|
6
|
+
from dream_eval.types import ProposedItem, LabeledItem
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main() -> None:
|
|
10
|
+
# --- 1. Faithfulness scoring ---
|
|
11
|
+
proposed = [
|
|
12
|
+
ProposedItem(
|
|
13
|
+
id="pref-1",
|
|
14
|
+
category="pref",
|
|
15
|
+
content={"key": "ci-merge-gate", "value": "require-reviews"},
|
|
16
|
+
recurrence=3,
|
|
17
|
+
),
|
|
18
|
+
ProposedItem(
|
|
19
|
+
id="rule-1",
|
|
20
|
+
category="rule",
|
|
21
|
+
content={"key": "no-secrets-in-code", "value": "true"},
|
|
22
|
+
),
|
|
23
|
+
ProposedItem(
|
|
24
|
+
id="unknown-1",
|
|
25
|
+
category="pref",
|
|
26
|
+
content={"key": "mystery-pref"},
|
|
27
|
+
),
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
labels = [
|
|
31
|
+
LabeledItem(
|
|
32
|
+
id="pref-1",
|
|
33
|
+
category="pref",
|
|
34
|
+
content={"key": "ci-merge-gate", "value": "require-reviews"},
|
|
35
|
+
max_recurrence=5,
|
|
36
|
+
),
|
|
37
|
+
LabeledItem(
|
|
38
|
+
id="rule-1",
|
|
39
|
+
category="rule",
|
|
40
|
+
content={"key": "no-secrets-in-code", "value": "true"},
|
|
41
|
+
),
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
report = compute_faithfulness(proposed, labels)
|
|
45
|
+
|
|
46
|
+
print("=== Faithfulness Report ===")
|
|
47
|
+
print(f" Score: {report.faithfulness_score:.3f}")
|
|
48
|
+
print(f" Precision: {report.precision:.3f}")
|
|
49
|
+
print(f" Recall: {report.recall:.3f}")
|
|
50
|
+
print(f" Proposed: {report.items_proposed}")
|
|
51
|
+
print(f" Supported: {report.items_fully_supported}")
|
|
52
|
+
print(f" Unsupported: {report.items_unsupported}")
|
|
53
|
+
print()
|
|
54
|
+
|
|
55
|
+
# --- 2. Secret leak detection ---
|
|
56
|
+
clean = check_secret_leak(
|
|
57
|
+
"The config uses environment variables for all secrets.",
|
|
58
|
+
forbidden_patterns=[r"sk-.*", r"password\s*=\s*\S+"],
|
|
59
|
+
)
|
|
60
|
+
leaked = check_secret_leak(
|
|
61
|
+
"Set password=secret123 in the config file.",
|
|
62
|
+
forbidden_patterns=[r"sk-.*", r"password\s*=\s*\S+"],
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
print("=== Secret Leak Gates ===")
|
|
66
|
+
print(f" Clean text: {clean.status.value}")
|
|
67
|
+
print(f" Leaked text: {leaked.status.value}")
|
|
68
|
+
print()
|
|
69
|
+
|
|
70
|
+
# --- 3. Hash determinism ---
|
|
71
|
+
content_v1 = "Hello World\n"
|
|
72
|
+
content_v2 = "Hello World\r\n" # CRLF variant
|
|
73
|
+
content_v3 = "\ufeffHello World\n" # BOM variant
|
|
74
|
+
|
|
75
|
+
h1 = check_hash_determinism(content_v1)
|
|
76
|
+
h2 = check_hash_determinism(content_v2)
|
|
77
|
+
h3 = check_hash_determinism(content_v3)
|
|
78
|
+
|
|
79
|
+
print("=== Hash Determinism ===")
|
|
80
|
+
print(f" LF: {h1.details['hash']}")
|
|
81
|
+
print(f" CRLF: {h2.details['hash']}")
|
|
82
|
+
print(f" BOM+LF: {h3.details['hash']}")
|
|
83
|
+
print(f" All match: {h1.details['hash'] == h2.details['hash'] == h3.details['hash']}")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
if __name__ == "__main__":
|
|
87
|
+
main()
|
|
File without changes
|