contextforge-eval 0.1.0 (contextforge_eval-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- context_forge/__init__.py +95 -0
- context_forge/core/__init__.py +55 -0
- context_forge/core/trace.py +369 -0
- context_forge/core/types.py +121 -0
- context_forge/evaluation.py +267 -0
- context_forge/exceptions.py +56 -0
- context_forge/graders/__init__.py +44 -0
- context_forge/graders/base.py +264 -0
- context_forge/graders/deterministic/__init__.py +11 -0
- context_forge/graders/deterministic/memory_corruption.py +130 -0
- context_forge/graders/hybrid.py +190 -0
- context_forge/graders/judges/__init__.py +11 -0
- context_forge/graders/judges/backends/__init__.py +9 -0
- context_forge/graders/judges/backends/ollama.py +173 -0
- context_forge/graders/judges/base.py +158 -0
- context_forge/graders/judges/memory_hygiene_judge.py +332 -0
- context_forge/graders/judges/models.py +113 -0
- context_forge/harness/__init__.py +43 -0
- context_forge/harness/user_simulator/__init__.py +70 -0
- context_forge/harness/user_simulator/adapters/__init__.py +13 -0
- context_forge/harness/user_simulator/adapters/base.py +67 -0
- context_forge/harness/user_simulator/adapters/crewai.py +100 -0
- context_forge/harness/user_simulator/adapters/langgraph.py +157 -0
- context_forge/harness/user_simulator/adapters/pydanticai.py +105 -0
- context_forge/harness/user_simulator/llm/__init__.py +5 -0
- context_forge/harness/user_simulator/llm/ollama.py +119 -0
- context_forge/harness/user_simulator/models.py +103 -0
- context_forge/harness/user_simulator/persona.py +154 -0
- context_forge/harness/user_simulator/runner.py +342 -0
- context_forge/harness/user_simulator/scenario.py +95 -0
- context_forge/harness/user_simulator/simulator.py +307 -0
- context_forge/instrumentation/__init__.py +23 -0
- context_forge/instrumentation/base.py +307 -0
- context_forge/instrumentation/instrumentors/__init__.py +17 -0
- context_forge/instrumentation/instrumentors/langchain.py +671 -0
- context_forge/instrumentation/instrumentors/langgraph.py +534 -0
- context_forge/instrumentation/tracer.py +588 -0
- context_forge/py.typed +0 -0
- contextforge_eval-0.1.0.dist-info/METADATA +420 -0
- contextforge_eval-0.1.0.dist-info/RECORD +43 -0
- contextforge_eval-0.1.0.dist-info/WHEEL +5 -0
- contextforge_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
- contextforge_eval-0.1.0.dist-info/top_level.txt +1 -0
context_forge/evaluation.py
@@ -0,0 +1,267 @@
"""Simple evaluation API for ContextForge.

This module provides high-level functions for common evaluation patterns,
hiding the complexity of instrumentors, adapters, and graders.

Two usage levels:

Level 2 (Simple): Single-turn evaluation with minimal setup
    from context_forge.evaluation import evaluate_agent

    result = evaluate_agent(
        graph=my_graph,
        message="I work from home now",
        store=my_store,
    )
    result.print_report()

Level 3 (Simulation): Multi-turn with personas and scenarios
    from context_forge import SimulationRunner, LangGraphAdapter, Persona
    # ... full control over simulation
"""

from dataclasses import dataclass, field
from typing import Any, Optional

from context_forge.core.trace import TraceRun
from context_forge.graders import GraderResult, HybridMemoryHygieneGrader
from context_forge.graders.base import Evidence
from context_forge.instrumentation import LangGraphInstrumentor


@dataclass
class EvaluationResult:
    """Result from a simple evaluation run.

    Combines the agent's response with grader results for easy inspection.

    Attributes:
        response: The agent's final response
        trace: The captured trace (for debugging)
        grader_results: Results from each grader that was run
        passed: True if all graders passed
    """

    response: Any
    trace: TraceRun
    grader_results: list[GraderResult] = field(default_factory=list)

    @property
    def passed(self) -> bool:
        """True if all graders passed."""
        return all(r.passed for r in self.grader_results)

    @property
    def score(self) -> float:
        """Average score across all graders."""
        if not self.grader_results:
            return 1.0
        return sum(r.score for r in self.grader_results) / len(self.grader_results)

    @property
    def errors(self) -> list[Evidence]:
        """All errors from all graders."""
        errors = []
        for r in self.grader_results:
            errors.extend(r.errors)
        return errors

    def print_report(self, verbose: bool = False) -> None:
        """Print a combined report of all grader results."""
        print("\n" + "=" * 60)
        print("EVALUATION REPORT")
        print("=" * 60)

        status = "PASSED" if self.passed else "FAILED"
        print(f"\nOverall: {status} (score: {self.score:.2f})")
        print(f"Response: {str(self.response)[:100]}...")

        for result in self.grader_results:
            result.print_report(verbose=verbose)


def evaluate_agent(
    graph,
    message: str,
    store=None,
    user_id: str = "eval_user",
    session_id: str = "eval_session",
    graders: Optional[list[str]] = None,
    llm_model: str = "llama3.2",
    print_result: bool = True,
) -> EvaluationResult:
    """Evaluate a LangGraph agent with a single message.

    This is the simplest way to evaluate your agent. It:
    1. Instruments the agent to capture traces
    2. Runs your message through the agent
    3. Grades the trace with specified graders
    4. Returns a combined result

    Args:
        graph: Your compiled LangGraph graph
        message: The user message to send
        store: Optional LangGraph store (for memory operations)
        user_id: User ID for the session
        session_id: Session ID for the conversation
        graders: List of grader names to run. Default: ["memory_hygiene"]
            Available: "memory_hygiene", "memory_corruption"
        llm_model: Ollama model for LLM-based graders
        print_result: Whether to print the report automatically

    Returns:
        EvaluationResult with response, trace, and grader results

    Example:
        from context_forge.evaluation import evaluate_agent
        from langgraph.store.memory import InMemoryStore
        from my_agent import build_graph

        store = InMemoryStore()
        # ... populate store with user profile ...

        graph = build_graph(store=store)
        result = evaluate_agent(
            graph=graph,
            message="I switched to working from home",
            store=store,
        )

        if not result.passed:
            print("Agent failed evaluation!")
            for error in result.errors:
                print(f" - {error.description}")
    """
    graders = graders or ["memory_hygiene"]

    # Set up instrumentation
    instrumentor = LangGraphInstrumentor(
        agent_name="evaluated_agent",
        agent_version="1.0.0",
    )
    instrumentor.instrument()

    try:
        # Build initial state
        initial_state = {
            "user_id": user_id,
            "session_id": session_id,
            "message": message,
            "messages": [],
            "turn_count": 0,
            "user_profile": None,  # Will be loaded from store
            "response": None,
        }

        # Add store config if provided
        config = {}
        if store is not None:
            config["configurable"] = {"store": store}

        # Run the agent
        result = graph.invoke(initial_state, config=config)
        response = result.get("response", result)

        # Get the trace
        traces = instrumentor.get_traces()
        if not traces:
            raise RuntimeError("No trace captured. Is the graph using LangChain components?")
        trace = traces[0]

        # Run graders
        grader_results = []
        for grader_name in graders:
            grader_result = _run_grader(grader_name, trace, llm_model)
            grader_results.append(grader_result)

        # Build result
        eval_result = EvaluationResult(
            response=response,
            trace=trace,
            grader_results=grader_results,
        )

        if print_result:
            eval_result.print_report()

        return eval_result

    finally:
        instrumentor.uninstrument()


def evaluate_trace(
    trace: TraceRun,
    graders: Optional[list[str]] = None,
    llm_model: str = "llama3.2",
    print_result: bool = True,
) -> EvaluationResult:
    """Evaluate an existing trace.

    Use this when you already have a trace (e.g., loaded from a file)
    and just want to run graders on it.

    Args:
        trace: The trace to evaluate
        graders: List of grader names to run
        llm_model: Ollama model for LLM-based graders
        print_result: Whether to print the report automatically

    Returns:
        EvaluationResult with grader results

    Example:
        from context_forge.evaluation import evaluate_trace
        from context_forge import TraceRun
        import json

        with open("my_trace.json") as f:
            trace = TraceRun.model_validate(json.load(f))

        result = evaluate_trace(trace)
    """
    graders = graders or ["memory_hygiene"]

    grader_results = []
    for grader_name in graders:
        grader_result = _run_grader(grader_name, trace, llm_model)
        grader_results.append(grader_result)

    eval_result = EvaluationResult(
        response=None,
        trace=trace,
        grader_results=grader_results,
    )

    if print_result:
        eval_result.print_report()

    return eval_result


def _run_grader(grader_name: str, trace: TraceRun, llm_model: str) -> GraderResult:
    """Run a grader by name."""
    from context_forge.graders import MemoryCorruptionGrader
    from context_forge.graders.judges.backends import OllamaBackend

    if grader_name == "memory_hygiene":
        # Check if Ollama is available
        try:
            backend = OllamaBackend(model=llm_model)
            if backend.is_available():
                grader = HybridMemoryHygieneGrader(llm_backend=backend)
            else:
                # Fall back to deterministic only
                grader = HybridMemoryHygieneGrader()
        except Exception:
            grader = HybridMemoryHygieneGrader()
        return grader.grade(trace)

    elif grader_name == "memory_corruption":
        grader = MemoryCorruptionGrader()
        return grader.grade(trace)

    else:
        raise ValueError(
            f"Unknown grader: {grader_name}. "
            f"Available: memory_hygiene, memory_corruption"
        )
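
Taken together, evaluate_agent and evaluate_trace are the package's simple entry points. As a sketch of how this API could sit inside a test suite, the following assumes a user-supplied my_agent.build_graph factory (as in the docstring example above); the test name and module are hypothetical, not part of the package:

    # Hypothetical pytest-style check built on the simple API above.
    from langgraph.store.memory import InMemoryStore

    from context_forge.evaluation import evaluate_agent
    from my_agent import build_graph  # assumed user module, as in the docstring example


    def test_memory_hygiene() -> None:
        store = InMemoryStore()
        # ... populate store with a user profile if your agent expects one ...
        graph = build_graph(store=store)

        result = evaluate_agent(
            graph=graph,
            message="I switched to working from home",
            store=store,
            graders=["memory_hygiene", "memory_corruption"],
            print_result=False,  # keep test output quiet; assert on the result instead
        )

        # Surface grader evidence in the assertion message on failure.
        assert result.passed, "\n".join(e.description for e in result.errors)
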
context_forge/exceptions.py
@@ -0,0 +1,56 @@
"""Custom exceptions for ContextForge.

This module implements T025: Custom exceptions.
"""


class ContextForgeError(Exception):
    """Base exception for all ContextForge errors."""

    pass


class TraceValidationError(ContextForgeError):
    """Raised when trace validation fails."""

    def __init__(self, message: str, field: str | None = None):
        self.field = field
        super().__init__(message)


class InstrumentationError(ContextForgeError):
    """Raised when instrumentation fails."""

    pass


class InstrumentorAlreadyActiveError(InstrumentationError):
    """Raised when trying to instrument when already instrumented."""

    pass


class InstrumentorNotActiveError(InstrumentationError):
    """Raised when trying to uninstrument when not instrumented."""

    pass


class SpanConversionError(ContextForgeError):
    """Raised when span conversion fails."""

    def __init__(self, message: str, span_id: str | None = None):
        self.span_id = span_id
        super().__init__(message)


class TracerError(ContextForgeError):
    """Raised when tracer operations fail."""

    pass


class TracerNotActiveError(TracerError):
    """Raised when trying to record steps without an active tracer."""

    pass
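
All library errors derive from ContextForgeError, so callers can catch narrowly or broadly. A minimal sketch, assuming the instrumentor raises the "already active" error on a repeated instrument() call exactly as the exception docstring describes:

    # Sketch only: assumes LangGraphInstrumentor.instrument() raises
    # InstrumentorAlreadyActiveError on a second call, per the docstring above.
    from context_forge.exceptions import ContextForgeError, InstrumentorAlreadyActiveError
    from context_forge.instrumentation import LangGraphInstrumentor

    instrumentor = LangGraphInstrumentor(agent_name="demo_agent", agent_version="1.0.0")

    try:
        instrumentor.instrument()
        instrumentor.instrument()  # second call: instrumentation is already active
    except InstrumentorAlreadyActiveError:
        pass  # safe to ignore; tracing is already in place
    except ContextForgeError as exc:
        # Base-class catch-all for any other library error.
        raise RuntimeError(f"ContextForge setup failed: {exc}") from exc
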
context_forge/graders/__init__.py
@@ -0,0 +1,44 @@
"""ContextForge Graders - Evaluate agent trajectories.

Graders analyze traces to detect behavioral issues that output-only
evaluation would miss.

Two types of evaluation:
- Deterministic (MemoryCorruptionGrader): Checks INVARIANTS that are always wrong
- LLM Judges (MemoryHygieneJudge): SEMANTIC evaluation requiring understanding
- Hybrid: Combines both for comprehensive analysis

Usage:
    from context_forge.graders import HybridMemoryHygieneGrader
    from context_forge.graders.judges.backends import OllamaBackend

    # Full evaluation (recommended)
    grader = HybridMemoryHygieneGrader(
        llm_backend=OllamaBackend(model="llama3.2")
    )
    result = grader.grade(trace)

    if not result.passed:
        for error in result.errors:
            print(f"Issue: {error.description}")
"""

from context_forge.graders.base import Evidence, Grader, GraderResult, Severity
from context_forge.graders.deterministic import MemoryCorruptionGrader
from context_forge.graders.hybrid import HybridMemoryHygieneGrader
from context_forge.graders.judges import LLMJudge, MemoryHygieneJudge

__all__ = [
    # Base classes
    "Grader",
    "GraderResult",
    "Evidence",
    "Severity",
    # Deterministic graders (invariant checks)
    "MemoryCorruptionGrader",
    # LLM judges (semantic evaluation)
    "LLMJudge",
    "MemoryHygieneJudge",
    # Hybrid graders (recommended)
    "HybridMemoryHygieneGrader",
]
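
The hybrid grader shown in the docstring needs a reachable Ollama server; when one is not available, the deterministic grader can be run on its own. A sketch of that path, reusing the trace-loading pattern from evaluate_trace (the "my_trace.json" filename is illustrative):

    # Deterministic-only sketch for environments without an Ollama server.
    import json

    from context_forge import TraceRun
    from context_forge.graders import MemoryCorruptionGrader

    with open("my_trace.json") as f:
        trace = TraceRun.model_validate(json.load(f))

    grader = MemoryCorruptionGrader()
    result = grader.grade(trace)

    print(result)               # short summary, e.g. GraderResult(<name>: PASSED, score=1.00)
    payload = result.to_dict()  # JSON-serializable dict including the evidence items
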
context_forge/graders/base.py
@@ -0,0 +1,264 @@
"""Base classes for ContextForge graders.

This module implements the core grader interface that all graders
(deterministic and LLM judges) must implement.

Key design principles:
- Graders operate ONLY on traces, never on framework objects
- Results include evidence with step_ids for traceability
- Deterministic graders are stateless and reproducible
- LLM judges include full reproducibility metadata
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Optional

from context_forge.core.trace import TraceRun


class Severity(str, Enum):
    """Severity level for evidence items."""

    INFO = "info"    # Informational, not a problem
    WARN = "warn"    # Potential issue, doesn't fail the grader
    ERROR = "error"  # Definite issue, fails the grader


@dataclass
class Evidence:
    """Proof of what was evaluated by a grader.

    Every grader result must include evidence explaining what was
    checked and why the result was pass/fail.

    Attributes:
        check_name: Name of the specific check (e.g., "redundant_write")
        description: Human-readable explanation of the finding
        severity: How serious this finding is
        step_ids: Which trace steps were examined
        details: Additional structured data about the finding
    """

    check_name: str
    description: str
    severity: Severity = Severity.INFO
    step_ids: list[str] = field(default_factory=list)
    details: Optional[dict[str, Any]] = None


@dataclass
class GraderResult:
    """Result from any grader (deterministic or LLM judge).

    Attributes:
        grader_name: Name of the grader that produced this result
        passed: Whether the trace passed all checks
        score: Numeric score from 0.0 (worst) to 1.0 (best)
        evidence: List of evidence items explaining the result
        timestamp: When the grading was performed
        metadata: Additional grader-specific data (LLM judges add prompt/response)
    """

    grader_name: str
    passed: bool
    score: float
    evidence: list[Evidence] = field(default_factory=list)
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    metadata: Optional[dict[str, Any]] = None

    def __post_init__(self):
        """Validate score is in valid range."""
        if not 0.0 <= self.score <= 1.0:
            raise ValueError(f"Score must be between 0.0 and 1.0, got {self.score}")

    @property
    def errors(self) -> list[Evidence]:
        """Get all evidence items with ERROR severity."""
        return [e for e in self.evidence if e.severity == Severity.ERROR]

    @property
    def warnings(self) -> list[Evidence]:
        """Get all evidence items with WARN severity."""
        return [e for e in self.evidence if e.severity == Severity.WARN]

    def to_dict(self) -> dict[str, Any]:
        """Convert result to dictionary for serialization."""
        return {
            "grader_name": self.grader_name,
            "passed": self.passed,
            "score": self.score,
            "evidence": [
                {
                    "check_name": e.check_name,
                    "description": e.description,
                    "severity": e.severity.value,
                    "step_ids": e.step_ids,
                    "details": e.details,
                }
                for e in self.evidence
            ],
            "timestamp": self.timestamp.isoformat(),
            "metadata": self.metadata,
        }

    def format_report(self, verbose: bool = False) -> str:
        """Format the grader result as a human-readable report.

        Args:
            verbose: Include additional details and metadata

        Returns:
            Formatted string report
        """
        lines = []

        # Header
        lines.append("")
        lines.append("=" * 60)
        lines.append(f"GRADER REPORT: {self.grader_name}")
        lines.append("=" * 60)

        # Result summary
        status = "PASSED" if self.passed else "FAILED"
        status_icon = "[OK]" if self.passed else "[FAIL]"
        lines.append("")
        lines.append(f"Result: {status_icon} {status}")
        lines.append(f"Score: {self.score:.2f} / 1.00")

        # Errors (always show)
        errors = self.errors
        if errors:
            lines.append("")
            lines.append(f"ERRORS ({len(errors)}):")
            for e in errors:
                lines.append(f" [ERROR] {e.check_name}")
                lines.append(f" {e.description}")
                if verbose and e.details:
                    for k, v in e.details.items():
                        lines.append(f" {k}: {v}")

        # Warnings (always show)
        warnings = self.warnings
        if warnings:
            lines.append("")
            lines.append(f"WARNINGS ({len(warnings)}):")
            for e in warnings:
                lines.append(f" [WARN] {e.check_name}")
                lines.append(f" {e.description}")

        # Info items (show summary only, or all if verbose)
        info_items = [e for e in self.evidence if e.severity == Severity.INFO]
        if info_items:
            # Always show the summary if present
            summary = next(
                (e for e in info_items if e.check_name == "llm_summary"),
                None,
            )
            if summary:
                lines.append("")
                lines.append("SUMMARY:")
                lines.append(f" {summary.description}")

            # Show correct saves
            correct_saves = [e for e in info_items if e.check_name == "correct_save"]
            if correct_saves:
                lines.append("")
                lines.append(f"CORRECTLY SAVED ({len(correct_saves)}):")
                for e in correct_saves:
                    lines.append(f" [OK] {e.description}")

            # Verbose: show all info items
            if verbose:
                other_info = [
                    e for e in info_items
                    if e.check_name not in ("llm_summary", "correct_save")
                ]
                if other_info:
                    lines.append("")
                    lines.append("ADDITIONAL INFO:")
                    for e in other_info:
                        lines.append(f" [{e.check_name}] {e.description}")

        lines.append("")
        lines.append("-" * 60)

        return "\n".join(lines)

    def print_report(self, verbose: bool = False) -> None:
        """Print the grader result as a human-readable report.

        Args:
            verbose: Include additional details and metadata
        """
        print(self.format_report(verbose=verbose))

    def __str__(self) -> str:
        """Short string representation."""
        status = "PASSED" if self.passed else "FAILED"
        return f"GraderResult({self.grader_name}: {status}, score={self.score:.2f})"


class Grader(ABC):
    """Base class for all graders.

    Graders evaluate traces and return structured results with evidence.
    All graders must implement the `grade` method.

    Attributes:
        name: Human-readable name for this grader
        deterministic: Whether this grader produces identical results on repeated runs
        required_step_types: Step types this grader needs to function
    """

    name: str = "base_grader"
    deterministic: bool = True
    required_step_types: list[str] = []

    @abstractmethod
    def grade(self, trace: TraceRun) -> GraderResult:
        """Evaluate a trace and return a grading result.

        Args:
            trace: The trace to evaluate

        Returns:
            GraderResult with pass/fail, score, and evidence
        """
        pass

    def validate_trace(self, trace: TraceRun) -> list[str]:
        """Check if trace has required step types.

        Args:
            trace: The trace to validate

        Returns:
            List of missing step types (empty if all present)
        """
        present_types = {step.step_type for step in trace.steps}
        missing = [st for st in self.required_step_types if st not in present_types]
        return missing

    def check_required_steps(self, trace: TraceRun) -> None:
        """Validate trace has required step types, raising if not.

        Call this at the start of grade() to fail fast on invalid traces.

        Args:
            trace: The trace to validate

        Raises:
            ValueError: If required step types are missing
        """
        missing = self.validate_trace(trace)
        if missing:
            raise ValueError(
                f"Grader '{self.name}' requires step types {self.required_step_types}, "
                f"but trace is missing: {missing}"
            )

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name!r}, deterministic={self.deterministic})"
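
The Grader ABC above is the extension point for project-specific checks. Below is a minimal custom grader sketch; it relies only on trace.steps and step.step_type as used by validate_trace above, while the "tool_call" step type and the step_id attribute on steps are assumptions made for illustration:

    # Hypothetical custom grader written against the Grader contract above.
    # Assumes steps expose a step_id and that "tool_call" is a valid step_type.
    from context_forge.core.trace import TraceRun
    from context_forge.graders.base import Evidence, Grader, GraderResult, Severity


    class ToolCallPresenceGrader(Grader):
        """Fail the trace if it contains no tool-call steps."""

        name = "tool_call_presence"
        deterministic = True
        required_step_types: list[str] = []  # accepts any trace

        def grade(self, trace: TraceRun) -> GraderResult:
            self.check_required_steps(trace)  # fail-fast hook (no-op with no requirements)

            tool_steps = [s for s in trace.steps if s.step_type == "tool_call"]
            evidence = [
                Evidence(
                    check_name="tool_call_count",
                    description=f"Found {len(tool_steps)} tool-call step(s)",
                    severity=Severity.INFO if tool_steps else Severity.ERROR,
                    step_ids=[s.step_id for s in tool_steps],  # assumed attribute
                )
            ]
            return GraderResult(
                grader_name=self.name,
                passed=bool(tool_steps),
                score=1.0 if tool_steps else 0.0,
                evidence=evidence,
            )
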
context_forge/graders/deterministic/__init__.py
@@ -0,0 +1,11 @@
"""Deterministic graders for ContextForge.

These graders check for INVARIANTS - things that are always wrong
regardless of the agent's non-deterministic path.

For semantic evaluation (understanding), use LLM judges instead.
"""

from context_forge.graders.deterministic.memory_corruption import MemoryCorruptionGrader

__all__ = ["MemoryCorruptionGrader"]