memledger 0.5.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evaluators/__init__.py +53 -0
- evaluators/agentcore_eval_runner.py +238 -0
- evaluators/agentcore_evaluator.py +76 -0
- evaluators/agentcore_harness.py +134 -0
- evaluators/attribution_integrity.py +191 -0
- evaluators/attribution_integrity_ragas.py +319 -0
- evaluators/attribution_integrity_structural.py +304 -0
- evaluators/calibration/dataset.jsonl +30 -0
- evaluators/calibration/replay.py +223 -0
- evaluators/calibration/replay_results.json +240 -0
- evaluators/calibration/results.csv +31 -0
- evaluators/calibration/results_summary.json +37 -0
- evaluators/calibration/score_agreement.py +228 -0
- evaluators/ragas_harness.py +240 -0
- memledger/__init__.py +66 -0
- memledger/backends/__init__.py +0 -0
- memledger/backends/base.py +125 -0
- memledger/backends/composition.py +270 -0
- memledger/backends/dynamodb.py +596 -0
- memledger/backends/opensearch.py +597 -0
- memledger/backends/pgvector.py +746 -0
- memledger/backends/sqlite.py +700 -0
- memledger/cli/__init__.py +1 -0
- memledger/cli/dashboard.py +601 -0
- memledger/cli/demo.py +442 -0
- memledger/cli/eval.py +659 -0
- memledger/cli/eval_report.py +336 -0
- memledger/cli/main.py +1153 -0
- memledger/demo/__init__.py +0 -0
- memledger/demo/app.py +353 -0
- memledger/embeddings.py +106 -0
- memledger/integrations/__init__.py +0 -0
- memledger/integrations/langgraph.py +126 -0
- memledger/mcp/__init__.py +1 -0
- memledger/mcp/server.py +331 -0
- memledger/memledger.py +1834 -0
- memledger/models.py +248 -0
- memledger/policies/__init__.py +5 -0
- memledger/policies/confidence_policy.py +98 -0
- memledger/policies/conflict_resolution.py +197 -0
- memledger/policies/namespace_rbac.py +149 -0
- memledger/policies/temporal_decay.py +67 -0
- memledger/provenance/__init__.py +3 -0
- memledger/provenance/chain.py +176 -0
- memledger/provenance/chain_store.py +113 -0
- memledger/provenance/effective_confidence.py +61 -0
- memledger/provenance/models.py +58 -0
- memledger/provenance/schema.sql +32 -0
- memledger/strategies/__init__.py +200 -0
- memledger/strategies/audit.py +108 -0
- memledger/strategies/llm.py +490 -0
- memledger/strategies/policies.py +238 -0
- memledger/strategies/retrieval.py +73 -0
- memledger/strategies/scoring.py +220 -0
- memledger/telemetry/__init__.py +8 -0
- memledger/telemetry/instrument.py +570 -0
- memledger/telemetry/memory_spans.py +203 -0
- memledger/telemetry/otel.py +549 -0
- memledger/telemetry/phoenix.py +141 -0
- memledger-0.5.0a0.dist-info/METADATA +169 -0
- memledger-0.5.0a0.dist-info/RECORD +64 -0
- memledger-0.5.0a0.dist-info/WHEEL +4 -0
- memledger-0.5.0a0.dist-info/entry_points.txt +2 -0
- memledger-0.5.0a0.dist-info/licenses/LICENSE +201 -0
evaluators/__init__.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Memledger evaluation suite.
|
|
2
|
+
|
|
3
|
+
Three tiers, ordered by install footprint:
|
|
4
|
+
|
|
5
|
+
Tier 1 — Deterministic Memory Attribution Integrity (MAI)
|
|
6
|
+
`from evaluators import evaluate_attribution_integrity`
|
|
7
|
+
Returns an AttributionIntegrityResult.
|
|
8
|
+
Pure stdlib. No deps beyond memledger core. Always available.
|
|
9
|
+
|
|
10
|
+
Tier 2 — RAGAS LLM-as-judge (OSS, provider-agnostic via LiteLLM)
|
|
11
|
+
`from evaluators import evaluate_mai_ragas`
|
|
12
|
+
Requires `pip install memledger[eval]`.
|
|
13
|
+
Set $MEMLEDGER_JUDGE_MODEL (e.g. `openai/gpt-4o-mini`,
|
|
14
|
+
`bedrock/anthropic.claude-...`, `anthropic/claude-...`,
|
|
15
|
+
`ollama/llama3.1`).
|
|
16
|
+
|
|
17
|
+
Tier 3 — AWS Bedrock AgentCore evaluator
|
|
18
|
+
`from evaluators import register_agentcore_evaluator`
|
|
19
|
+
Requires `pip install memledger[aws]`. AWS credentials needed.
|
|
20
|
+
|
|
21
|
+
The RAGAS and AgentCore entry points are lazy-imported so a clean
|
|
22
|
+
`pip install memledger` (no extras) can still load this module
|
|
23
|
+
and run the deterministic tier.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
from evaluators.attribution_integrity import (
|
|
29
|
+
AttributionIntegrityResult,
|
|
30
|
+
evaluate_attribution_integrity,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def evaluate_mai_ragas(*args, **kwargs):
    """Run the Tier 2 RAGAS LLM-as-judge evaluator.

    The implementation module is imported on the first call instead of at
    package import time, so this package can still be loaded when the
    optional ``[eval]`` dependencies are not installed. All positional and
    keyword arguments are forwarded unchanged, and whatever the
    implementation returns is handed straight back to the caller.
    """
    from evaluators import attribution_integrity_ragas as _ragas

    return _ragas.evaluate_mai_ragas(*args, **kwargs)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def register_agentcore_evaluator(*args, **kwargs):
    """Register the Tier 3 AWS Bedrock AgentCore evaluator.

    Thin forwarding wrapper: the AWS-specific module is only imported when
    this function is actually called, which keeps the package importable on
    installs without the ``[aws]`` extra. Arguments and return value are
    passed through untouched.
    """
    from evaluators import agentcore_evaluator as _agentcore

    return _agentcore.register_agentcore_evaluator(*args, **kwargs)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
__all__ = [
|
|
49
|
+
"AttributionIntegrityResult",
|
|
50
|
+
"evaluate_attribution_integrity",
|
|
51
|
+
"evaluate_mai_ragas",
|
|
52
|
+
"register_agentcore_evaluator",
|
|
53
|
+
]
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""F6 — AgentCore Evaluation Runner.
|
|
2
|
+
|
|
3
|
+
Runs built-in AgentCore evaluators against memory operation data exported
|
|
4
|
+
from Phoenix/Aurora. Converts memledger trace data into the format AgentCore
|
|
5
|
+
evaluators expect.
|
|
6
|
+
|
|
7
|
+
Since AgentCore CLI requires spans in `aws/spans` CloudWatch log group (which
|
|
8
|
+
uses AgentCore SDK's own instrumentation), this script invokes the evaluator
|
|
9
|
+
API programmatically with our trace data.
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python evaluators/agentcore_eval_runner.py
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
import sys
|
|
19
|
+
from datetime import datetime, timezone
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
23
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
async def fetch_memories_from_aurora():
    """Fetch active and deprecated memories from Aurora for evaluation.

    The PostgreSQL DSN is read from ``$MEMLEDGER_PG_DSN``. When it is unset
    or empty a notice is printed and an empty list is returned, so the
    caller can fall back to another data source.

    Returns:
        A list of plain dicts, one per memory (active first, then
        deprecated). ``content`` is truncated to its first 200 characters;
        ``record_type`` and ``status`` are flattened to their ``.value``.
    """
    dsn = os.environ.get("MEMLEDGER_PG_DSN", "")
    if not dsn:
        print("MEMLEDGER_PG_DSN not set — using Phoenix data only")
        return []

    # Imported only once a DSN is configured, so the "no database" path does
    # not require the memledger backend stack to be importable.
    from memledger import Memledger
    from memledger.models import EmbeddingConfig

    ml = await Memledger.create(
        backend_name="pgvector",
        connection_string=dsn,
        embedding_config=EmbeddingConfig(
            provider="bedrock",
            model="amazon.titan-embed-text-v2:0",
            dimensions=1024,
        ),
    )

    # Always release the client, even if one of the status queries fails.
    try:
        active = await ml.get_by_status("active")
        deprecated = await ml.get_by_status("deprecated")
    finally:
        await ml.close()

    return [
        {
            "id": m.id,
            "content": m.content[:200],
            "record_type": m.record_type.value,
            "created_by": m.created_by,
            "confidence": m.confidence,
            "importance": m.importance,
            "session_id": m.session_id,
            "derived_from": m.derived_from,
            "supersedes": m.supersedes,
            "hedged": m.hedged,
            "namespace": m.namespace,
            "status": m.status.value,
            "success_count": m.success_count,
            "failure_count": m.failure_count,
        }
        for m in active + deprecated
    ]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def run_option_a_eval(records: list[dict]) -> dict:
    """Run the Option A (deterministic) evaluator once per session.

    Records are grouped by their ``session_id``; records whose session id is
    missing, ``None`` or empty are grouped under ``"unknown"``. Each group is
    scored with ``evaluate_attribution_integrity``, with the session id
    passed as the decision-memory identifier.

    Args:
        records: Memory dicts to evaluate.

    Returns:
        ``{"evaluator": ..., "mean_score": ..., "sessions": {sid: {...}}}``
        where ``mean_score`` is the mean per-session score rounded to four
        decimals (``0`` when there are no records).
    """
    from evaluators.attribution_integrity import evaluate_attribution_integrity

    # Group by session. ``or`` (rather than a .get default) is used because
    # the key may be present with a None value, which a default would not
    # replace and which would serialize as a "null" JSON key.
    sessions = {}
    for r in records:
        sid = r.get("session_id") or "unknown"
        sessions.setdefault(sid, []).append(r)

    results = {}
    for sid, mems in sessions.items():
        result = evaluate_attribution_integrity(sid, mems)
        results[sid] = {
            "score": result.score,
            "passed": result.passed,
            "explanation": result.explanation,
        }

    mean_score = sum(r["score"] for r in results.values()) / len(results) if results else 0
    return {"evaluator": "Option A (deterministic)", "mean_score": round(mean_score, 4), "sessions": results}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def run_option_b_eval(records: list[dict]) -> dict:
    """Run the Option B (structural) evaluator once per session.

    Records are grouped by their ``session_id``; records whose session id is
    missing, ``None`` or empty are grouped under ``"unknown"``. Each group is
    scored with ``evaluate_from_memory_records``.

    Args:
        records: Memory dicts to evaluate.

    Returns:
        ``{"evaluator": ..., "mean_score": ..., "sessions": {sid: {...}}}``
        where ``mean_score`` is the mean per-session score rounded to four
        decimals (``0`` when there are no records).
    """
    from evaluators.attribution_integrity_structural import evaluate_from_memory_records

    # ``or`` (rather than a .get default) is used because the key may be
    # present with a None value, which a default would not replace and which
    # would serialize as a "null" JSON key.
    sessions = {}
    for r in records:
        sid = r.get("session_id") or "unknown"
        sessions.setdefault(sid, []).append(r)

    results = {}
    for sid, mems in sessions.items():
        result = evaluate_from_memory_records(mems)
        results[sid] = {
            "score": result.score,
            "passed": result.passed,
            "explanation": result.explanation,
        }

    mean_score = sum(r["score"] for r in results.values()) / len(results) if results else 0
    return {"evaluator": "Option B (structural)", "mean_score": round(mean_score, 4), "sessions": results}
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _format_record_line(r: dict) -> str:
    """Render one memory record as a single transcript line.

    Tolerates records that lack ``created_by``, ``record_type``, ``content``
    or a numeric ``confidence``; ``importance`` is used when ``confidence``
    is absent or ``None``.
    """
    conf = r.get("confidence")
    if conf is None:
        conf = r.get("importance")
    conf_text = f"{conf:.2f}" if isinstance(conf, (int, float)) else "n/a"
    return (
        f"[{r.get('created_by') or 'unknown'}] "
        f"({r.get('record_type') or 'unknown'}, conf={conf_text}): {r.get('content', '')}"
    )


def run_agentcore_builtin_eval(records: list[dict]) -> dict:
    """Run AgentCore built-in evaluators via SDK programmatic API.

    Uses the bedrock-agentcore SDK's evaluate API directly, bypassing
    the CloudWatch log group requirement. Only the first ten records are
    rendered into the synthetic assistant turn that is sent for judging.

    Args:
        records: Memory dicts; missing or ``None`` fields are rendered with
            placeholders instead of aborting the whole run.

    Returns:
        ``{"evaluator": "AgentCore Built-ins", "results": {...}}`` on
        success, where each entry holds either ``score``/``explanation`` or
        ``score: "N/A"`` plus a truncated ``error``. When the SDK cannot be
        imported, or anything else fails outside the per-evaluator loop, the
        dict carries an ``error`` key and an empty ``results``.
    """
    try:
        # BuiltinEvaluator is unused here but kept in the import so a
        # missing name still takes the "SDK not available" path.
        from bedrock_agentcore.evaluations import evaluate, BuiltinEvaluator  # noqa: F401

        # Format records as AgentCore evaluation input
        conversation = [
            {
                "role": "user",
                "content": "Assess the memory operations performed by the agent fleet",
            },
            {
                "role": "assistant",
                "content": "\n".join(_format_record_line(r) for r in records[:10]),
            },
        ]

        evaluator_names = [
            "Builtin.Correctness",
            "Builtin.Coherence",
            "Builtin.Completeness",
            "Builtin.Helpfulness",
        ]

        results = {}
        for eval_name in evaluator_names:
            # One failing evaluator must not prevent the others from running.
            try:
                result = evaluate(
                    evaluator_id=eval_name,
                    conversation=conversation,
                )
                results[eval_name] = {
                    "score": result.get("score", 0),
                    "explanation": result.get("explanation", ""),
                }
            except Exception as e:
                results[eval_name] = {"score": "N/A", "error": str(e)[:100]}

        return {"evaluator": "AgentCore Built-ins", "results": results}

    except ImportError:
        return {"evaluator": "AgentCore Built-ins", "error": "SDK not available", "results": {}}
    except Exception as e:
        return {"evaluator": "AgentCore Built-ins", "error": str(e)[:200], "results": {}}
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def main():
    """Run every evaluator over the available records and save a JSON report.

    Records come from ``fetch_memories_from_aurora()``; when that yields
    nothing, the ``retrieved_memories`` of every calibration example are
    used instead. Option A, Option B and the AgentCore built-ins are run in
    that order, the calibration summary is attached when its file exists,
    and the combined report is written to ``results/f6_complete.json`` next
    to this module. Progress and headline numbers are printed to stdout.
    """
    rule = "=" * 60
    module_dir = Path(__file__).parent

    print("F6 — AgentCore Evaluation Runner")
    print(rule)

    # Fetch data, falling back to the bundled calibration examples.
    records = asyncio.run(fetch_memories_from_aurora())
    if not records:
        print("No records found. Using calibration dataset as fallback.")
        from evaluators.calibration.score_agreement import load_dataset

        records = [
            memory
            for example in load_dataset()
            for memory in example["retrieved_memories"]
        ]

    print(f"Loaded {len(records)} memory records")

    # Run all evaluators
    print("\n--- Option A (deterministic) ---")
    a_results = run_option_a_eval(records)
    print(f"Mean score: {a_results['mean_score']}")

    print("\n--- Option B (structural) ---")
    b_results = run_option_b_eval(records)
    print(f"Mean score: {b_results['mean_score']}")

    print("\n--- AgentCore Built-ins ---")
    ac_results = run_agentcore_builtin_eval(records)
    if "error" not in ac_results:
        for name, outcome in ac_results.get("results", {}).items():
            print(f" {name}: {outcome.get('score', 'N/A')}")
    else:
        print(f"Error: {ac_results['error']}")

    # Calibration summary is optional; an absent file leaves it empty.
    cal_path = module_dir / "calibration" / "results_summary.json"
    cal_data = json.loads(cal_path.read_text()) if cal_path.exists() else {}

    # Compile final results
    position_bias = {
        "rate": "N/A — deterministic evaluators, no LLM judge calls",
        "note": "F12 position-bias tracker available for LLM-based conflict resolution",
    }
    final = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "record_count": len(records),
        "option_a": a_results,
        "option_b": b_results,
        "agentcore_builtins": ac_results,
        "calibration": cal_data,
        "position_bias": position_bias,
    }

    # Save
    out_path = module_dir / "results" / "f6_complete.json"
    out_path.parent.mkdir(exist_ok=True)
    with out_path.open("w") as handle:
        json.dump(final, handle, indent=2)

    print("\n" + rule)
    print(f"Results saved to {out_path}")
    print(f"Option A mean: {a_results['mean_score']}")
    print(f"Option B mean: {b_results['mean_score']}")
    if cal_data:
        kappa_a = cal_data.get("option_a", {}).get("cohens_kappa", "N/A")
        kappa_b = cal_data.get("option_b", {}).get("cohens_kappa", "N/A")
        print(f"Calibration κ (A): {kappa_a}")
        print(f"Calibration κ (B): {kappa_b}")
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
if __name__ == "__main__":
|
|
238
|
+
main()
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""AWS Bedrock AgentCore evaluator registration (AWS-only).
|
|
2
|
+
|
|
3
|
+
Requires the [aws] extra: `pip install memledger[aws]`.
|
|
4
|
+
|
|
5
|
+
This module is import-safe even without boto3 — the boto3 import is
|
|
6
|
+
deferred to call time so a clean OSS install can still load
|
|
7
|
+
`evaluators.__init__` and use the deterministic and RAGAS tiers.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
from typing import Optional
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def register_agentcore_evaluator(
    region: Optional[str] = None,
    model_id: str = "us.anthropic.claude-sonnet-4-5-20250929-v1:0",
) -> dict:
    """Create (or look up) the Memory Attribution Integrity evaluator.

    Region resolution: explicit arg > $AWS_REGION > "us-west-2".

    Args:
        region: AWS region for the ``bedrock-agentcore-control`` client.
        model_id: Bedrock model id used as the LLM judge.

    Returns:
        ``{"arn": ..., "status": "registered"}`` when the evaluator was
        created or already existed; ``{"error": ..., "status": "skipped"}``
        when an import fails; ``{"error": ..., "status": "failed"}`` for any
        other exception. The function does not raise.
    """
    try:
        import boto3

        resolved_region = region if region else os.getenv("AWS_REGION", "us-west-2")
        client = boto3.client("bedrock-agentcore-control", region_name=resolved_region)

        evaluator_name = "memory_attribution_integrity"

        # Three-point numeric scale the judge model picks from.
        rating_scale = [
            {"value": value, "label": label, "definition": definition}
            for value, label, definition in (
                (
                    0.0,
                    "Unattributed",
                    "Decision relies on memory with no provenance, or on memory with confidence < 0.4 treated as authoritative",
                ),
                (
                    0.5,
                    "Partially attributed",
                    "Some memories have attribution; low-confidence memories used but decision hedges",
                ),
                (
                    1.0,
                    "Well attributed",
                    "All retrieved memories have attribution, confidence >= 0.7, OR agent explicitly hedges on lower-confidence data",
                ),
            )
        ]

        judge_instructions = (
            "Given an agent decision and the memories it retrieved, score whether the decision relied on "
            "well-attributed, sufficiently confident memory. Score based on: (1) presence of attribution "
            "metadata on every retrieved memory, (2) confidence thresholds, (3) consistency of derivation "
            "chains, (4) whether contradictory memories were flagged or ignored. "
            "Context: {context}. Target to evaluate: {assistant_turn}."
        )

        model_config = {
            "bedrockEvaluatorModelConfig": {
                "modelId": model_id,
                "inferenceConfig": {"maxTokens": 500, "temperature": 0.0},
            }
        }

        evaluator_config = {
            "evaluatorName": evaluator_name,
            "description": "Scores whether agent decisions rely on well-attributed, confident memory",
            "level": "TRACE",
            "evaluatorConfig": {
                "llmAsAJudge": {
                    "modelConfig": model_config,
                    "ratingScale": {"numerical": rating_scale},
                    "instructions": judge_instructions,
                }
            },
        }

        try:
            created = client.create_evaluator(**evaluator_config)
            arn = created.get("evaluatorArn", "")
        except client.exceptions.ConflictException:
            # Already registered: fetch the existing definition instead.
            # NOTE(review): confirm that get_evaluator accepts evaluatorName
            # — verify against the boto3 bedrock-agentcore-control reference.
            existing = client.get_evaluator(evaluatorName=evaluator_name)
            arn = existing.get("evaluatorArn", "")

        return {"arn": arn, "status": "registered"}

    except ImportError:
        return {"error": "boto3/bedrock-agentcore SDK not installed (install with: pip install memledger[aws])", "status": "skipped"}
    except Exception as exc:
        return {"error": str(exc), "status": "failed"}
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""AgentCore Evaluations harness for memory trust evaluation.
|
|
2
|
+
|
|
3
|
+
Runs the 4 built-in evaluators + custom Memory Attribution Integrity
|
|
4
|
+
against traces in CloudWatch.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
from evaluators.agentcore_harness import run_agentcore_evaluation
|
|
8
|
+
|
|
9
|
+
results = await run_agentcore_evaluation(
|
|
10
|
+
log_group="/aws/otel/memledger",
|
|
11
|
+
session_id="demo-session-001",
|
|
12
|
+
)
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
import json
|
|
17
|
+
import logging
|
|
18
|
+
import os
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
async def run_agentcore_evaluation(
    log_group: str = "/aws/otel/memledger",
    session_id: str = "demo-session-001",
    region: str | None = None,
    builtin_evaluators: list[str] | None = None,
    custom_evaluator_arn: str | None = None,
) -> dict[str, Any]:
    """Configure an AgentCore online evaluation over CloudWatch traces.

    This only creates the evaluation configuration; it does not wait for or
    return scores. The body contains no ``await``: the boto3 calls run
    synchronously inside this coroutine.

    Region resolution: explicit arg > $AWS_REGION > "us-west-2".

    Args:
        log_group: CloudWatch log group with OTEL traces
        session_id: Session label; its first 20 characters are embedded in
            the configuration name and it is echoed back in the result
        region: AWS region
        builtin_evaluators: List of Builtin.* evaluator names; defaults to
            GoalSuccessRate, Correctness, ToolSelectionAccuracy and
            ContextRelevance
        custom_evaluator_arn: ARN of the custom evaluator (from register_agentcore_evaluator)

    Returns:
        A status dict. ``status`` is ``"configured"`` when the configuration
        was created, ``"error"`` (with an ``error`` message) when creation
        or anything else failed, and ``"skipped"`` when an import fails.
        Configured and creation-failure results also carry ``config_arn``,
        ``evaluators``, ``log_group`` and ``session_id``.
    """
    if builtin_evaluators is None:
        builtin_evaluators = [
            "Builtin.GoalSuccessRate",
            "Builtin.Correctness",
            "Builtin.ToolSelectionAccuracy",
            "Builtin.ContextRelevance",
        ]

    try:
        import boto3

        region = region or os.getenv("AWS_REGION", "us-west-2")
        client = boto3.client("bedrock-agentcore-control", region_name=region)

        # Build evaluator list
        evaluators = [{"evaluatorId": e} for e in builtin_evaluators]
        if custom_evaluator_arn:
            evaluators.append({"evaluatorId": custom_evaluator_arn})
        evaluator_ids = [e["evaluatorId"] for e in evaluators]

        # Create online evaluation config
        eval_config = {
            "evaluationConfigName": f"memledger-eval-{session_id[:20]}",
            "evaluationDataSource": {
                "cloudWatchLogGroup": {
                    "logGroupName": log_group,
                }
            },
            "evaluators": evaluators,
        }

        try:
            response = client.create_online_evaluation_config(**eval_config)
        except Exception as e:
            # Report the failure instead of claiming "configured" with an
            # empty ARN; the context keys are kept so callers reading them
            # keep working.
            logger.warning("Failed to create eval config: %s", e)
            return {
                "status": "error",
                "error": str(e),
                "config_arn": "",
                "evaluators": evaluator_ids,
                "log_group": log_group,
                "session_id": session_id,
            }

        config_arn = response.get("evaluationConfigArn", "")
        logger.info("Created AgentCore eval config: %s", config_arn)

        # Note: AgentCore Evaluations runs asynchronously.
        # Results appear in CloudWatch after 2-5 minutes.
        # For the demo, we'll pre-run and screenshot results.

        return {
            "status": "configured",
            "config_arn": config_arn,
            "evaluators": evaluator_ids,
            "log_group": log_group,
            "session_id": session_id,
            "note": "Results will appear in CloudWatch after 2-5 minutes. Pre-capture screenshots for demo.",
        }

    except ImportError:
        return {"status": "skipped", "reason": "bedrock-agentcore SDK not installed"}
    except Exception as e:
        return {"status": "error", "error": str(e)}
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def get_evaluation_results(
    session_id: str,
    log_group: str = "/aws/otel/memledger",
    region: str | None = None,
) -> dict[str, Any]:
    """Query CloudWatch for evaluation results.

    Region resolution: explicit arg > $AWS_REGION > "us-west-2".
    Call this 5+ minutes after running the evaluation.

    A single ``filter_log_events`` request (at most 50 events) is issued for
    events matching both ``"evaluation"`` and the session id. Each event
    message is parsed as JSON; messages that are not JSON objects carrying
    both ``evaluator`` and ``score`` are skipped.

    Args:
        session_id: Session whose evaluation events should be collected.
        log_group: CloudWatch log group to search.
        region: AWS region.

    Returns:
        ``{"session_id": ..., "results": {evaluator: score}, "raw_events": n}``
        on success, or ``{"error": message}`` if anything fails (including
        boto3 not being installed). The function does not raise.
    """
    try:
        import boto3

        region = region or os.getenv("AWS_REGION", "us-west-2")
        client = boto3.client("logs", region_name=region)

        # Query for evaluation results
        response = client.filter_log_events(
            logGroupName=log_group,
            filterPattern=f'"evaluation" "{session_id}"',
            limit=50,
        )
        events = response.get("events", [])

        results = {}
        for event in events:
            try:
                data = json.loads(event["message"])
                # Valid JSON is not necessarily an object: a bare string or
                # list must be skipped, not allowed to abort the whole query.
                if isinstance(data, dict) and "evaluator" in data and "score" in data:
                    results[data["evaluator"]] = data["score"]
            except (json.JSONDecodeError, KeyError, TypeError):
                # Unparseable, incomplete or oddly shaped event: skip it.
                continue

        return {"session_id": session_id, "results": results, "raw_events": len(events)}

    except Exception as e:
        return {"error": str(e)}
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""Memory Attribution Integrity Evaluator.
|
|
2
|
+
|
|
3
|
+
Scores 0.0-1.0: Did the agent's decision rely on well-attributed,
|
|
4
|
+
sufficiently confident memory?
|
|
5
|
+
|
|
6
|
+
Can run as:
|
|
7
|
+
1. A DeepEval G-Eval evaluator (LLM-as-judge)
|
|
8
|
+
2. A deterministic rule-based evaluator (no LLM needed)
|
|
9
|
+
3. An AgentCore custom evaluator (when available)
|
|
10
|
+
|
|
11
|
+
Score 1.0 when:
|
|
12
|
+
- Retrieved memories have attribution (source agent, confidence, session)
|
|
13
|
+
- Memory confidence >= 0.7 OR decision explicitly hedges on low-confidence data
|
|
14
|
+
- No memories in chain with confidence < 0.4 used as basis for decision
|
|
15
|
+
- Derivation chains are present and consistent
|
|
16
|
+
|
|
17
|
+
Score 0.0 when:
|
|
18
|
+
- Decision uses unattributed or low-confidence memory as ground truth
|
|
19
|
+
- Contradictory memories ignored
|
|
20
|
+
- Memory without session/turn context treated as authoritative
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
from typing import Any, Optional
|
|
26
|
+
import logging
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class AttributionIntegrityResult:
    """Result of a memory attribution integrity evaluation."""

    score: float  # 0.0-1.0
    passed: bool  # score >= threshold
    threshold: float = 0.7
    details: dict = field(default_factory=dict)  # per-criterion metrics and messages
    explanation: str = ""  # human-readable summary of strengths and issues


def evaluate_attribution_integrity(
    decision_memory_id: str,
    retrieved_memories: list[dict],  # Each: {id, content, created_by, importance, derived_from, session_id, ...}
    chain: Optional[dict] = None,  # Attribution chain from get_attribution_chain()
    threshold: float = 0.7,
) -> AttributionIntegrityResult:
    """Evaluate whether a decision was based on well-attributed memory.

    Deterministic scoring — no LLM needed. The score starts at 1.0; each
    criterion subtracts a penalty or adds a bonus, and the total is clamped
    to [0.0, 1.0] and rounded to four decimals. ``passed`` is decided on
    that rounded score so the reported score and verdict always agree.

    Criteria:
        1. Attribution: share of memories with a ``created_by`` value.
        2. Confidence: unhedged memories below 0.4, unhedged memories in
           [0.4, 0.6), and a mean confidence below 0.6.
        3. Session context: fewer than half the memories have a
           ``session_id``.
        4. Chain integrity (only when ``chain`` is given): a minimum chain
           confidence below 0.4, or a truncated chain.
        5. Source diversity: a small bonus for two or more distinct agents.

    Args:
        decision_memory_id: The memory that represents the agent's decision.
            It is accepted for interface symmetry; the scoring below does
            not read it.
        retrieved_memories: Memories that were retrieved before the decision
        chain: Attribution chain for the decision memory
        threshold: Minimum score to pass (default 0.7)

    Returns:
        An ``AttributionIntegrityResult``. With no retrieved memories the
        score is 0.0 and ``details`` is ``{"error": "no_retrieved_memories"}``.
    """
    if not retrieved_memories:
        return AttributionIntegrityResult(
            score=0.0,
            passed=False,
            threshold=threshold,
            details={"error": "no_retrieved_memories"},
            explanation="No retrieved memories to evaluate.",
        )

    score = 1.0
    penalties = []
    bonuses = []
    total = len(retrieved_memories)

    # Helper: read confidence from either 'confidence' or 'importance' field.
    # Only a missing/None value falls through; an explicit 0.0 is a real
    # (lowest possible) confidence and must not be promoted to the default.
    def _conf(m: dict) -> float:
        for key in ("confidence", "importance"):
            value = m.get(key)
            if value is not None:
                return value
        return 0.5

    def _unhedged(m: dict) -> bool:
        return not m.get("hedged", False)

    # Criterion 1: Attribution completeness (do memories have source agent?)
    attributed_count = sum(1 for m in retrieved_memories if m.get("created_by"))
    attribution_ratio = attributed_count / total
    if attribution_ratio < 1.0:
        penalty = (1.0 - attribution_ratio) * 0.4
        score -= penalty
        penalties.append(f"Attribution gap: {attributed_count}/{total} have source agent (-{penalty:.2f})")
    else:
        bonuses.append("All retrieved memories have source attribution")

    # Criterion 2: Confidence levels
    confidences = [_conf(m) for m in retrieved_memories]
    mean_conf = sum(confidences) / len(confidences)

    # 2a: Low-confidence memories (< 0.4) used without hedging
    low_conf_unhedged = sum(
        1 for m, c in zip(retrieved_memories, confidences)
        if c < 0.4 and _unhedged(m)
    )
    if low_conf_unhedged > 0:
        penalty = low_conf_unhedged * 0.25
        score -= penalty
        penalties.append(f"Low-confidence unhedged: {low_conf_unhedged} below 0.4 (-{penalty:.2f})")

    # 2b: Flagged-confidence (0.4-0.6) without hedging
    flagged_unhedged = sum(
        1 for m, c in zip(retrieved_memories, confidences)
        if 0.4 <= c < 0.6 and _unhedged(m)
    )
    if flagged_unhedged > 0:
        penalty = flagged_unhedged * 0.1
        score -= penalty
        penalties.append(f"Flagged-confidence unhedged: {flagged_unhedged} in 0.4-0.6 (-{penalty:.2f})")

    # 2c: Mean confidence penalty
    if mean_conf < 0.6:
        penalty = (0.6 - mean_conf) * 0.5
        score -= penalty
        penalties.append(f"Low mean confidence: {mean_conf:.2f} (-{penalty:.2f})")

    # Criterion 3: Session context
    session_count = sum(1 for m in retrieved_memories if m.get("session_id"))
    session_ratio = session_count / total
    if session_ratio < 0.5:
        penalty = (1.0 - session_ratio) * 0.15
        score -= penalty
        penalties.append(f"Weak session context: {session_count}/{total} have session IDs (-{penalty:.2f})")

    # Criterion 4: Chain integrity (if chain provided, check for gaps)
    if chain:
        chain_hops = chain.get("chain", [])
        if chain_hops:
            min_chain_conf = chain.get("min_confidence", 1.0)
            if min_chain_conf is None:
                # An explicit None means "not computed"; treat it like an
                # absent value rather than failing the comparison below.
                min_chain_conf = 1.0
            if min_chain_conf < 0.4:
                penalty = 0.2
                score -= penalty
                penalties.append(f"Chain contains low-confidence hop (min={min_chain_conf:.2f}) (-{penalty:.2f})")

            if chain.get("truncated"):
                penalty = 0.05
                score -= penalty
                penalties.append(f"Chain truncated (depth > max_hops) (-{penalty:.2f})")
        else:
            bonuses.append("No derivation chain (original memory)")

    # Criterion 5: Diversity of sources (multiple agents = more trustworthy)
    unique_agents = {m["created_by"] for m in retrieved_memories if m.get("created_by")}
    if len(unique_agents) >= 2:
        bonus = 0.05
        score += bonus
        bonuses.append(f"Multi-agent corroboration: {len(unique_agents)} agents (+{bonus:.2f})")

    # Clamp, then round once; the verdict uses the same rounded value that
    # is reported, so float noise cannot make them disagree.
    final_score = round(max(0.0, min(1.0, score)), 4)

    explanation_parts = []
    if bonuses:
        explanation_parts.append("Strengths: " + "; ".join(bonuses))
    if penalties:
        explanation_parts.append("Issues: " + "; ".join(penalties))

    result = AttributionIntegrityResult(
        score=final_score,
        passed=final_score >= threshold,
        threshold=threshold,
        details={
            "attributed_ratio": attribution_ratio,
            "low_confidence_unhedged": low_conf_unhedged,
            "mean_confidence": round(mean_conf, 4),
            "session_context_ratio": session_ratio,
            "unique_agents": len(unique_agents),
            "chain_min_confidence": chain.get("min_confidence") if chain else None,
            "penalties": penalties,
            "bonuses": bonuses,
        },
        explanation=". ".join(explanation_parts) if explanation_parts else "No issues detected.",
    )

    # Best-effort: push to Phoenix as an annotation on the current OTEL span.
    # Telemetry must never affect the evaluation outcome, so failures are
    # contained here; unexpected ones are logged at debug level.
    try:
        from memledger.telemetry.phoenix import log_evaluation

        log_evaluation(
            name="memory_attribution_integrity",
            score=result.score,
            label="passed" if result.passed else "failed",
            explanation=result.explanation[:500],
            annotator_kind="CODE",
        )
    except ImportError:
        # Telemetry module (or one of its dependencies) is not installed.
        pass
    except Exception:
        logging.getLogger(__name__).debug(
            "Skipping Phoenix annotation for memory_attribution_integrity",
            exc_info=True,
        )

    return result
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# AgentCore evaluator registration moved to evaluators/agentcore_evaluator.py
|
|
191
|
+
# (it's the only AWS-coupled piece; keeping this file pure-OSS / pure-stdlib).
|