contextforge_eval-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. context_forge/__init__.py +95 -0
  2. context_forge/core/__init__.py +55 -0
  3. context_forge/core/trace.py +369 -0
  4. context_forge/core/types.py +121 -0
  5. context_forge/evaluation.py +267 -0
  6. context_forge/exceptions.py +56 -0
  7. context_forge/graders/__init__.py +44 -0
  8. context_forge/graders/base.py +264 -0
  9. context_forge/graders/deterministic/__init__.py +11 -0
  10. context_forge/graders/deterministic/memory_corruption.py +130 -0
  11. context_forge/graders/hybrid.py +190 -0
  12. context_forge/graders/judges/__init__.py +11 -0
  13. context_forge/graders/judges/backends/__init__.py +9 -0
  14. context_forge/graders/judges/backends/ollama.py +173 -0
  15. context_forge/graders/judges/base.py +158 -0
  16. context_forge/graders/judges/memory_hygiene_judge.py +332 -0
  17. context_forge/graders/judges/models.py +113 -0
  18. context_forge/harness/__init__.py +43 -0
  19. context_forge/harness/user_simulator/__init__.py +70 -0
  20. context_forge/harness/user_simulator/adapters/__init__.py +13 -0
  21. context_forge/harness/user_simulator/adapters/base.py +67 -0
  22. context_forge/harness/user_simulator/adapters/crewai.py +100 -0
  23. context_forge/harness/user_simulator/adapters/langgraph.py +157 -0
  24. context_forge/harness/user_simulator/adapters/pydanticai.py +105 -0
  25. context_forge/harness/user_simulator/llm/__init__.py +5 -0
  26. context_forge/harness/user_simulator/llm/ollama.py +119 -0
  27. context_forge/harness/user_simulator/models.py +103 -0
  28. context_forge/harness/user_simulator/persona.py +154 -0
  29. context_forge/harness/user_simulator/runner.py +342 -0
  30. context_forge/harness/user_simulator/scenario.py +95 -0
  31. context_forge/harness/user_simulator/simulator.py +307 -0
  32. context_forge/instrumentation/__init__.py +23 -0
  33. context_forge/instrumentation/base.py +307 -0
  34. context_forge/instrumentation/instrumentors/__init__.py +17 -0
  35. context_forge/instrumentation/instrumentors/langchain.py +671 -0
  36. context_forge/instrumentation/instrumentors/langgraph.py +534 -0
  37. context_forge/instrumentation/tracer.py +588 -0
  38. context_forge/py.typed +0 -0
  39. contextforge_eval-0.1.0.dist-info/METADATA +420 -0
  40. contextforge_eval-0.1.0.dist-info/RECORD +43 -0
  41. contextforge_eval-0.1.0.dist-info/WHEEL +5 -0
  42. contextforge_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  43. contextforge_eval-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,267 @@
+"""Simple evaluation API for ContextForge.
+
+This module provides high-level functions for common evaluation patterns,
+hiding the complexity of instrumentors, adapters, and graders.
+
+Two usage levels:
+
+Level 2 (Simple): Single-turn evaluation with minimal setup
+    from context_forge.evaluation import evaluate_agent
+
+    result = evaluate_agent(
+        graph=my_graph,
+        message="I work from home now",
+        store=my_store,
+    )
+    result.print_report()
+
+Level 3 (Simulation): Multi-turn with personas and scenarios
+    from context_forge import SimulationRunner, LangGraphAdapter, Persona
+    # ... full control over simulation
+"""
+
+from dataclasses import dataclass, field
+from typing import Any, Optional
+
+from context_forge.core.trace import TraceRun
+from context_forge.graders import GraderResult, HybridMemoryHygieneGrader
+from context_forge.graders.base import Evidence
+from context_forge.instrumentation import LangGraphInstrumentor
+
+
+@dataclass
+class EvaluationResult:
+    """Result from a simple evaluation run.
+
+    Combines the agent's response with grader results for easy inspection.
+
+    Attributes:
+        response: The agent's final response
+        trace: The captured trace (for debugging)
+        grader_results: Results from each grader that was run
+        passed: True if all graders passed
+    """
+
+    response: Any
+    trace: TraceRun
+    grader_results: list[GraderResult] = field(default_factory=list)
+
+    @property
+    def passed(self) -> bool:
+        """True if all graders passed."""
+        return all(r.passed for r in self.grader_results)
+
+    @property
+    def score(self) -> float:
+        """Average score across all graders."""
+        if not self.grader_results:
+            return 1.0
+        return sum(r.score for r in self.grader_results) / len(self.grader_results)
+
+    @property
+    def errors(self) -> list[Evidence]:
+        """All errors from all graders."""
+        errors = []
+        for r in self.grader_results:
+            errors.extend(r.errors)
+        return errors
+
+    def print_report(self, verbose: bool = False) -> None:
+        """Print a combined report of all grader results."""
+        print("\n" + "=" * 60)
+        print("EVALUATION REPORT")
+        print("=" * 60)
+
+        status = "PASSED" if self.passed else "FAILED"
+        print(f"\nOverall: {status} (score: {self.score:.2f})")
+        print(f"Response: {str(self.response)[:100]}...")
+
+        for result in self.grader_results:
+            result.print_report(verbose=verbose)
+
+
+def evaluate_agent(
+    graph,
+    message: str,
+    store=None,
+    user_id: str = "eval_user",
+    session_id: str = "eval_session",
+    graders: Optional[list[str]] = None,
+    llm_model: str = "llama3.2",
+    print_result: bool = True,
+) -> EvaluationResult:
+    """Evaluate a LangGraph agent with a single message.
+
+    This is the simplest way to evaluate your agent. It:
+    1. Instruments the agent to capture traces
+    2. Runs your message through the agent
+    3. Grades the trace with the specified graders
+    4. Returns a combined result
+
+    Args:
+        graph: Your compiled LangGraph graph
+        message: The user message to send
+        store: Optional LangGraph store (for memory operations)
+        user_id: User ID for the session
+        session_id: Session ID for the conversation
+        graders: List of grader names to run. Default: ["memory_hygiene"]
+            Available: "memory_hygiene", "memory_corruption"
+        llm_model: Ollama model for LLM-based graders
+        print_result: Whether to print the report automatically
+
+    Returns:
+        EvaluationResult with response, trace, and grader results
+
+    Example:
+        from context_forge.evaluation import evaluate_agent
+        from langgraph.store.memory import InMemoryStore
+        from my_agent import build_graph
+
+        store = InMemoryStore()
+        # ... populate store with user profile ...
+
+        graph = build_graph(store=store)
+        result = evaluate_agent(
+            graph=graph,
+            message="I switched to working from home",
+            store=store,
+        )
+
+        if not result.passed:
+            print("Agent failed evaluation!")
+            for error in result.errors:
+                print(f"  - {error.description}")
+    """
+    graders = graders or ["memory_hygiene"]
+
+    # Set up instrumentation
+    instrumentor = LangGraphInstrumentor(
+        agent_name="evaluated_agent",
+        agent_version="1.0.0",
+    )
+    instrumentor.instrument()
+
+    try:
+        # Build initial state
+        initial_state = {
+            "user_id": user_id,
+            "session_id": session_id,
+            "message": message,
+            "messages": [],
+            "turn_count": 0,
+            "user_profile": None,  # Will be loaded from store
+            "response": None,
+        }
+
+        # Add store config if provided
+        config = {}
+        if store is not None:
+            config["configurable"] = {"store": store}
+
+        # Run the agent
+        result = graph.invoke(initial_state, config=config)
+        response = result.get("response", result)
+
+        # Get the trace
+        traces = instrumentor.get_traces()
+        if not traces:
+            raise RuntimeError("No trace captured. Is the graph using LangChain components?")
+        trace = traces[0]
+
+        # Run graders
+        grader_results = []
+        for grader_name in graders:
+            grader_result = _run_grader(grader_name, trace, llm_model)
+            grader_results.append(grader_result)
+
+        # Build result
+        eval_result = EvaluationResult(
+            response=response,
+            trace=trace,
+            grader_results=grader_results,
+        )
+
+        if print_result:
+            eval_result.print_report()
+
+        return eval_result
+
+    finally:
+        instrumentor.uninstrument()
+
+
+def evaluate_trace(
+    trace: TraceRun,
+    graders: Optional[list[str]] = None,
+    llm_model: str = "llama3.2",
+    print_result: bool = True,
+) -> EvaluationResult:
+    """Evaluate an existing trace.
+
+    Use this when you already have a trace (e.g., loaded from a file)
+    and just want to run graders on it.
+
+    Args:
+        trace: The trace to evaluate
+        graders: List of grader names to run
+        llm_model: Ollama model for LLM-based graders
+        print_result: Whether to print the report automatically
+
+    Returns:
+        EvaluationResult with grader results
+
+    Example:
+        from context_forge.evaluation import evaluate_trace
+        from context_forge import TraceRun
+        import json
+
+        with open("my_trace.json") as f:
+            trace = TraceRun.model_validate(json.load(f))
+
+        result = evaluate_trace(trace)
+    """
+    graders = graders or ["memory_hygiene"]
+
+    grader_results = []
+    for grader_name in graders:
+        grader_result = _run_grader(grader_name, trace, llm_model)
+        grader_results.append(grader_result)
+
+    eval_result = EvaluationResult(
+        response=None,
+        trace=trace,
+        grader_results=grader_results,
+    )
+
+    if print_result:
+        eval_result.print_report()
+
+    return eval_result
+
+
+def _run_grader(grader_name: str, trace: TraceRun, llm_model: str) -> GraderResult:
+    """Run a grader by name."""
+    from context_forge.graders import MemoryCorruptionGrader
+    from context_forge.graders.judges.backends import OllamaBackend
+
+    if grader_name == "memory_hygiene":
+        # Check if Ollama is available
+        try:
+            backend = OllamaBackend(model=llm_model)
+            if backend.is_available():
+                grader = HybridMemoryHygieneGrader(llm_backend=backend)
+            else:
+                # Fall back to deterministic only
+                grader = HybridMemoryHygieneGrader()
+        except Exception:
+            grader = HybridMemoryHygieneGrader()
+        return grader.grade(trace)
+
+    elif grader_name == "memory_corruption":
+        grader = MemoryCorruptionGrader()
+        return grader.grade(trace)
+
+    else:
+        raise ValueError(
+            f"Unknown grader: {grader_name}. "
+            f"Available: memory_hygiene, memory_corruption"
+        )
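As a quick orientation to the module above, here is a minimal sketch of how evaluate_agent could gate a CI run. It only uses names defined in this diff (evaluate_agent, EvaluationResult.passed and .errors, the "memory_hygiene" and "memory_corruption" grader names); the my_agent/build_graph import is a hypothetical placeholder for your own agent factory.

    # Illustrative CI gate built on the API above (my_agent/build_graph are placeholders).
    import sys
    from context_forge.evaluation import evaluate_agent
    from my_agent import build_graph  # assumption: your own agent factory

    graph = build_graph()
    result = evaluate_agent(
        graph=graph,
        message="I switched to working from home",
        graders=["memory_hygiene", "memory_corruption"],
        print_result=False,
    )
    if not result.passed:
        for error in result.errors:
            print(f"[FAIL] {error.check_name}: {error.description}")
        sys.exit(1)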
@@ -0,0 +1,56 @@
+"""Custom exceptions for ContextForge.
+
+This module implements T025: Custom exceptions.
+"""
+
+
+class ContextForgeError(Exception):
+    """Base exception for all ContextForge errors."""
+
+    pass
+
+
+class TraceValidationError(ContextForgeError):
+    """Raised when trace validation fails."""
+
+    def __init__(self, message: str, field: str | None = None):
+        self.field = field
+        super().__init__(message)
+
+
+class InstrumentationError(ContextForgeError):
+    """Raised when instrumentation fails."""
+
+    pass
+
+
+class InstrumentorAlreadyActiveError(InstrumentationError):
+    """Raised when trying to instrument when already instrumented."""
+
+    pass
+
+
+class InstrumentorNotActiveError(InstrumentationError):
+    """Raised when trying to uninstrument when not instrumented."""
+
+    pass
+
+
+class SpanConversionError(ContextForgeError):
+    """Raised when span conversion fails."""
+
+    def __init__(self, message: str, span_id: str | None = None):
+        self.span_id = span_id
+        super().__init__(message)
+
+
+class TracerError(ContextForgeError):
+    """Raised when tracer operations fail."""
+
+    pass
+
+
+class TracerNotActiveError(TracerError):
+    """Raised when trying to record steps without an active tracer."""
+
+    pass
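A small usage sketch for the hierarchy above: since ContextForgeError is the base class, one except clause can cover any library failure. This is illustrative only; the diff does not show the instrumentor internals, so the assumption that a second instrument() call raises InstrumentorAlreadyActiveError is inferred from the exception's name.

    # Illustrative only; the double-instrument behavior is an assumption.
    from context_forge.exceptions import ContextForgeError, InstrumentorAlreadyActiveError
    from context_forge.instrumentation import LangGraphInstrumentor

    instrumentor = LangGraphInstrumentor(agent_name="demo", agent_version="1.0.0")
    try:
        instrumentor.instrument()
        instrumentor.instrument()  # assumed to raise InstrumentorAlreadyActiveError
    except InstrumentorAlreadyActiveError:
        pass  # already active; safe to continue
    except ContextForgeError as exc:
        print(f"ContextForge failure: {exc}")
    finally:
        instrumentor.uninstrument()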
@@ -0,0 +1,44 @@
+"""ContextForge Graders - Evaluate agent trajectories.
+
+Graders analyze traces to detect behavioral issues that output-only
+evaluation would miss.
+
+Two types of evaluation, plus a hybrid of both:
+- Deterministic (MemoryCorruptionGrader): Checks INVARIANTS that are always wrong
+- LLM Judges (MemoryHygieneJudge): SEMANTIC evaluation requiring understanding
+- Hybrid (HybridMemoryHygieneGrader): Combines both for comprehensive analysis
+
+Usage:
+    from context_forge.graders import HybridMemoryHygieneGrader
+    from context_forge.graders.judges.backends import OllamaBackend
+
+    # Full evaluation (recommended)
+    grader = HybridMemoryHygieneGrader(
+        llm_backend=OllamaBackend(model="llama3.2")
+    )
+    result = grader.grade(trace)
+
+    if not result.passed:
+        for error in result.errors:
+            print(f"Issue: {error.description}")
+"""
+
+from context_forge.graders.base import Evidence, Grader, GraderResult, Severity
+from context_forge.graders.deterministic import MemoryCorruptionGrader
+from context_forge.graders.hybrid import HybridMemoryHygieneGrader
+from context_forge.graders.judges import LLMJudge, MemoryHygieneJudge
+
+__all__ = [
+    # Base classes
+    "Grader",
+    "GraderResult",
+    "Evidence",
+    "Severity",
+    # Deterministic graders (invariant checks)
+    "MemoryCorruptionGrader",
+    # LLM judges (semantic evaluation)
+    "LLMJudge",
+    "MemoryHygieneJudge",
+    # Hybrid graders (recommended)
+    "HybridMemoryHygieneGrader",
+]
@@ -0,0 +1,264 @@
+"""Base classes for ContextForge graders.
+
+This module implements the core grader interface that all graders
+(deterministic and LLM judges) must implement.
+
+Key design principles:
+- Graders operate ONLY on traces, never on framework objects
+- Results include evidence with step_ids for traceability
+- Deterministic graders are stateless and reproducible
+- LLM judges include full reproducibility metadata
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any, Optional
+
+from context_forge.core.trace import TraceRun
+
+
+class Severity(str, Enum):
+    """Severity level for evidence items."""
+
+    INFO = "info"    # Informational, not a problem
+    WARN = "warn"    # Potential issue, doesn't fail the grader
+    ERROR = "error"  # Definite issue, fails the grader
+
+
+@dataclass
+class Evidence:
+    """Proof of what was evaluated by a grader.
+
+    Every grader result must include evidence explaining what was
+    checked and why the result was pass/fail.
+
+    Attributes:
+        check_name: Name of the specific check (e.g., "redundant_write")
+        description: Human-readable explanation of the finding
+        severity: How serious this finding is
+        step_ids: Which trace steps were examined
+        details: Additional structured data about the finding
+    """
+
+    check_name: str
+    description: str
+    severity: Severity = Severity.INFO
+    step_ids: list[str] = field(default_factory=list)
+    details: Optional[dict[str, Any]] = None
+
+
+@dataclass
+class GraderResult:
+    """Result from any grader (deterministic or LLM judge).
+
+    Attributes:
+        grader_name: Name of the grader that produced this result
+        passed: Whether the trace passed all checks
+        score: Numeric score from 0.0 (worst) to 1.0 (best)
+        evidence: List of evidence items explaining the result
+        timestamp: When the grading was performed
+        metadata: Additional grader-specific data (LLM judges add prompt/response)
+    """
+
+    grader_name: str
+    passed: bool
+    score: float
+    evidence: list[Evidence] = field(default_factory=list)
+    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+    metadata: Optional[dict[str, Any]] = None
+
+    def __post_init__(self):
+        """Validate score is in valid range."""
+        if not 0.0 <= self.score <= 1.0:
+            raise ValueError(f"Score must be between 0.0 and 1.0, got {self.score}")
+
+    @property
+    def errors(self) -> list[Evidence]:
+        """Get all evidence items with ERROR severity."""
+        return [e for e in self.evidence if e.severity == Severity.ERROR]
+
+    @property
+    def warnings(self) -> list[Evidence]:
+        """Get all evidence items with WARN severity."""
+        return [e for e in self.evidence if e.severity == Severity.WARN]
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert result to dictionary for serialization."""
+        return {
+            "grader_name": self.grader_name,
+            "passed": self.passed,
+            "score": self.score,
+            "evidence": [
+                {
+                    "check_name": e.check_name,
+                    "description": e.description,
+                    "severity": e.severity.value,
+                    "step_ids": e.step_ids,
+                    "details": e.details,
+                }
+                for e in self.evidence
+            ],
+            "timestamp": self.timestamp.isoformat(),
+            "metadata": self.metadata,
+        }
+
+    def format_report(self, verbose: bool = False) -> str:
+        """Format the grader result as a human-readable report.
+
+        Args:
+            verbose: Include additional details and metadata
+
+        Returns:
+            Formatted string report
+        """
+        lines = []
+
+        # Header
+        lines.append("")
+        lines.append("=" * 60)
+        lines.append(f"GRADER REPORT: {self.grader_name}")
+        lines.append("=" * 60)
+
+        # Result summary
+        status = "PASSED" if self.passed else "FAILED"
+        status_icon = "[OK]" if self.passed else "[FAIL]"
+        lines.append("")
+        lines.append(f"Result: {status_icon} {status}")
+        lines.append(f"Score: {self.score:.2f} / 1.00")
+
+        # Errors (always show)
+        errors = self.errors
+        if errors:
+            lines.append("")
+            lines.append(f"ERRORS ({len(errors)}):")
+            for e in errors:
+                lines.append(f"  [ERROR] {e.check_name}")
+                lines.append(f"    {e.description}")
+                if verbose and e.details:
+                    for k, v in e.details.items():
+                        lines.append(f"    {k}: {v}")
+
+        # Warnings (always show)
+        warnings = self.warnings
+        if warnings:
+            lines.append("")
+            lines.append(f"WARNINGS ({len(warnings)}):")
+            for e in warnings:
+                lines.append(f"  [WARN] {e.check_name}")
+                lines.append(f"    {e.description}")
+
+        # Info items (show summary only, or all if verbose)
+        info_items = [e for e in self.evidence if e.severity == Severity.INFO]
+        if info_items:
+            # Always show the summary if present
+            summary = next(
+                (e for e in info_items if e.check_name == "llm_summary"),
+                None,
+            )
+            if summary:
+                lines.append("")
+                lines.append("SUMMARY:")
+                lines.append(f"  {summary.description}")
+
+            # Show correct saves
+            correct_saves = [e for e in info_items if e.check_name == "correct_save"]
+            if correct_saves:
+                lines.append("")
+                lines.append(f"CORRECTLY SAVED ({len(correct_saves)}):")
+                for e in correct_saves:
+                    lines.append(f"  [OK] {e.description}")
+
+            # Verbose: show all info items
+            if verbose:
+                other_info = [
+                    e for e in info_items
+                    if e.check_name not in ("llm_summary", "correct_save")
+                ]
+                if other_info:
+                    lines.append("")
+                    lines.append("ADDITIONAL INFO:")
+                    for e in other_info:
+                        lines.append(f"  [{e.check_name}] {e.description}")
+
+        lines.append("")
+        lines.append("-" * 60)
+
+        return "\n".join(lines)
+
+    def print_report(self, verbose: bool = False) -> None:
+        """Print the grader result as a human-readable report.
+
+        Args:
+            verbose: Include additional details and metadata
+        """
+        print(self.format_report(verbose=verbose))
+
+    def __str__(self) -> str:
+        """Short string representation."""
+        status = "PASSED" if self.passed else "FAILED"
+        return f"GraderResult({self.grader_name}: {status}, score={self.score:.2f})"
+
+
+class Grader(ABC):
+    """Base class for all graders.
+
+    Graders evaluate traces and return structured results with evidence.
+    All graders must implement the `grade` method.
+
+    Attributes:
+        name: Human-readable name for this grader
+        deterministic: Whether this grader produces identical results on repeated runs
+        required_step_types: Step types this grader needs to function
+    """
+
+    name: str = "base_grader"
+    deterministic: bool = True
+    required_step_types: list[str] = []
+
+    @abstractmethod
+    def grade(self, trace: TraceRun) -> GraderResult:
+        """Evaluate a trace and return a grading result.
+
+        Args:
+            trace: The trace to evaluate
+
+        Returns:
+            GraderResult with pass/fail, score, and evidence
+        """
+        pass
+
+    def validate_trace(self, trace: TraceRun) -> list[str]:
+        """Check if trace has required step types.
+
+        Args:
+            trace: The trace to validate
+
+        Returns:
+            List of missing step types (empty if all present)
+        """
+        present_types = {step.step_type for step in trace.steps}
+        missing = [st for st in self.required_step_types if st not in present_types]
+        return missing
+
+    def check_required_steps(self, trace: TraceRun) -> None:
+        """Validate trace has required step types, raising if not.
+
+        Call this at the start of grade() to fail fast on invalid traces.
+
+        Args:
+            trace: The trace to validate
+
+        Raises:
+            ValueError: If required step types are missing
+        """
+        missing = self.validate_trace(trace)
+        if missing:
+            raise ValueError(
+                f"Grader '{self.name}' requires step types {self.required_step_types}, "
+                f"but trace is missing: {missing}"
+            )
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(name={self.name!r}, deterministic={self.deterministic})"
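To make the Grader contract above concrete, here is a minimal sketch of a custom deterministic grader. It uses only classes defined in this file (Grader, GraderResult, Evidence, Severity); the "llm" step type label, the step.step_id attribute, and the ten-call budget are illustrative assumptions rather than part of the package.

    # Illustrative custom grader; "llm" as a step type, step.step_id, and the
    # call budget are assumptions made for this example only.
    from context_forge.core.trace import TraceRun
    from context_forge.graders.base import Evidence, Grader, GraderResult, Severity


    class ExcessiveLLMCallsGrader(Grader):
        """Fail a trace that makes more than max_calls LLM calls."""

        name = "excessive_llm_calls"
        deterministic = True
        required_step_types = ["llm"]  # assumed step type label

        def __init__(self, max_calls: int = 10):
            self.max_calls = max_calls

        def grade(self, trace: TraceRun) -> GraderResult:
            self.check_required_steps(trace)  # fail fast on unusable traces
            llm_steps = [s for s in trace.steps if s.step_type == "llm"]
            passed = len(llm_steps) <= self.max_calls
            evidence = [
                Evidence(
                    check_name="llm_call_budget",
                    description=f"{len(llm_steps)} LLM calls (budget: {self.max_calls})",
                    severity=Severity.INFO if passed else Severity.ERROR,
                    step_ids=[getattr(s, "step_id", "") for s in llm_steps],
                )
            ]
            return GraderResult(
                grader_name=self.name,
                passed=passed,
                score=1.0 if passed else 0.0,
                evidence=evidence,
            )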
@@ -0,0 +1,11 @@
+"""Deterministic graders for ContextForge.
+
+These graders check for INVARIANTS - things that are always wrong
+regardless of the agent's non-deterministic path.
+
+For semantic evaluation (understanding), use LLM judges instead.
+"""
+
+from context_forge.graders.deterministic.memory_corruption import MemoryCorruptionGrader
+
+__all__ = ["MemoryCorruptionGrader"]