contextforge-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. context_forge/__init__.py +95 -0
  2. context_forge/core/__init__.py +55 -0
  3. context_forge/core/trace.py +369 -0
  4. context_forge/core/types.py +121 -0
  5. context_forge/evaluation.py +267 -0
  6. context_forge/exceptions.py +56 -0
  7. context_forge/graders/__init__.py +44 -0
  8. context_forge/graders/base.py +264 -0
  9. context_forge/graders/deterministic/__init__.py +11 -0
  10. context_forge/graders/deterministic/memory_corruption.py +130 -0
  11. context_forge/graders/hybrid.py +190 -0
  12. context_forge/graders/judges/__init__.py +11 -0
  13. context_forge/graders/judges/backends/__init__.py +9 -0
  14. context_forge/graders/judges/backends/ollama.py +173 -0
  15. context_forge/graders/judges/base.py +158 -0
  16. context_forge/graders/judges/memory_hygiene_judge.py +332 -0
  17. context_forge/graders/judges/models.py +113 -0
  18. context_forge/harness/__init__.py +43 -0
  19. context_forge/harness/user_simulator/__init__.py +70 -0
  20. context_forge/harness/user_simulator/adapters/__init__.py +13 -0
  21. context_forge/harness/user_simulator/adapters/base.py +67 -0
  22. context_forge/harness/user_simulator/adapters/crewai.py +100 -0
  23. context_forge/harness/user_simulator/adapters/langgraph.py +157 -0
  24. context_forge/harness/user_simulator/adapters/pydanticai.py +105 -0
  25. context_forge/harness/user_simulator/llm/__init__.py +5 -0
  26. context_forge/harness/user_simulator/llm/ollama.py +119 -0
  27. context_forge/harness/user_simulator/models.py +103 -0
  28. context_forge/harness/user_simulator/persona.py +154 -0
  29. context_forge/harness/user_simulator/runner.py +342 -0
  30. context_forge/harness/user_simulator/scenario.py +95 -0
  31. context_forge/harness/user_simulator/simulator.py +307 -0
  32. context_forge/instrumentation/__init__.py +23 -0
  33. context_forge/instrumentation/base.py +307 -0
  34. context_forge/instrumentation/instrumentors/__init__.py +17 -0
  35. context_forge/instrumentation/instrumentors/langchain.py +671 -0
  36. context_forge/instrumentation/instrumentors/langgraph.py +534 -0
  37. context_forge/instrumentation/tracer.py +588 -0
  38. context_forge/py.typed +0 -0
  39. contextforge_eval-0.1.0.dist-info/METADATA +420 -0
  40. contextforge_eval-0.1.0.dist-info/RECORD +43 -0
  41. contextforge_eval-0.1.0.dist-info/WHEEL +5 -0
  42. contextforge_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  43. contextforge_eval-0.1.0.dist-info/top_level.txt +1 -0
context_forge/graders/deterministic/memory_corruption.py
@@ -0,0 +1,130 @@
+"""Deterministic Memory Corruption Grader.
+
+Checks for TRUE INVARIANTS that are ALWAYS wrong regardless of agent path:
+- Data corruption: Existing correct data was deleted or overwritten with null
+- Schema violations: Wrong types, malformed data structures
+
+These checks detect hard failures that should never happen, regardless of
+the non-deterministic path the agent takes.
+
+For semantic evaluation (did the agent save the right facts?), use the
+MemoryHygieneJudge LLM-based grader instead.
+"""
+
+from context_forge.core.trace import MemoryWriteStep, TraceRun
+from context_forge.graders.base import Evidence, Grader, GraderResult, Severity
+
+
+class MemoryCorruptionGrader(Grader):
+    """Deterministic grader for memory corruption detection.
+
+    Checks for invariant violations that are ALWAYS wrong:
+    1. Data corruption: Existing data deleted without replacement
+    2. Field deletion: Required fields removed
+
+    These are hard constraints - if violated, something is broken.
+
+    For semantic checks (missed facts, hallucinations), use the
+    MemoryHygieneJudge LLM-based grader.
+
+    Usage:
+        grader = MemoryCorruptionGrader()
+        result = grader.grade(trace)
+        if not result.passed:
+            for error in result.errors:
+                print(f"Corruption detected: {error.description}")
+    """
+
+    name = "memory_corruption"
+    deterministic = True
+    required_step_types = []  # Can run on any trace
+
+    def __init__(self, fail_on_data_loss: bool = True):
+        """Initialize the grader.
+
+        Args:
+            fail_on_data_loss: Treat data loss as errors (default: True)
+        """
+        self.fail_on_data_loss = fail_on_data_loss
+
+    def grade(self, trace: TraceRun) -> GraderResult:
+        """Check for memory corruption in a trace.
+
+        Args:
+            trace: The trace to evaluate
+
+        Returns:
+            GraderResult with corruption findings
+        """
+        evidence: list[Evidence] = []
+
+        # Get memory write steps
+        memory_writes = [s for s in trace.steps if isinstance(s, MemoryWriteStep)]
+
+        # Check for data corruption (deletion of existing values)
+        evidence.extend(self._check_data_corruption(memory_writes))
+
+        # Calculate score and pass/fail
+        errors = [e for e in evidence if e.severity == Severity.ERROR]
+
+        # Score: 1.0 - 0.5 per corruption error
+        score = max(0.0, 1.0 - (len(errors) * 0.5))
+        passed = len(errors) == 0
+
+        return GraderResult(
+            grader_name=self.name,
+            passed=passed,
+            score=score,
+            evidence=evidence,
+            metadata={
+                "total_memory_writes": len(memory_writes),
+                "corruption_errors": len(errors),
+            },
+        )
+
+    def _check_data_corruption(
+        self, memory_writes: list[MemoryWriteStep]
+    ) -> list[Evidence]:
+        """Check for data corruption: existing data deleted or nullified.
+
+        Data corruption occurs when:
+        - old_value exists (not None)
+        - new_value is None (data deleted)
+
+        This is an invariant violation - correct user data should never
+        be deleted without explicit user request.
+        """
+        evidence = []
+
+        for write in memory_writes:
+            if not write.changes:
+                continue
+
+            corrupted_fields = []
+            for change in write.changes:
+                # Corruption: had value, now null (data lost)
+                if change.old_value is not None and change.new_value is None:
+                    corrupted_fields.append(change)
+
+            if corrupted_fields:
+                severity = Severity.ERROR if self.fail_on_data_loss else Severity.WARN
+                paths = [c.path for c in corrupted_fields]
+                evidence.append(
+                    Evidence(
+                        check_name="data_corruption",
+                        description=f"Existing data was deleted: {paths}",
+                        severity=severity,
+                        step_ids=[write.step_id],
+                        details={
+                            "corrupted_fields": [
+                                {
+                                    "path": c.path,
+                                    "lost_value": c.old_value,
+                                }
+                                for c in corrupted_fields
+                            ],
+                        },
+                    )
+                )
+
+        return evidence
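
A minimal usage sketch for the grader above. It assumes a TraceRun named trace has already been captured elsewhere (for example via the package's instrumentation modules, whose constructors are not shown in this diff); the result fields used below (passed, score, evidence) come from the GraderResult construction visible in the file.

    # Sketch only: `trace` is assumed to be a TraceRun produced elsewhere.
    from context_forge.graders.deterministic.memory_corruption import MemoryCorruptionGrader

    grader = MemoryCorruptionGrader(fail_on_data_loss=True)
    result = grader.grade(trace)

    # Score is 1.0 minus 0.5 per corruption error, floored at 0.0.
    print(f"passed={result.passed} score={result.score:.2f}")
    for item in result.evidence:
        # Evidence entries carry check_name, description, severity, and affected step_ids.
        print(f"[{item.severity}] {item.check_name}: {item.description}")
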
context_forge/graders/hybrid.py
@@ -0,0 +1,190 @@
+"""Hybrid graders that combine deterministic and LLM-based evaluation.
+
+Hybrid graders leverage the strengths of both approaches:
+- Deterministic: Fast, cheap, catches invariant violations (corruption)
+- LLM Judge: Semantic understanding, catches meaning-related issues
+"""
+
+from typing import Optional
+
+from context_forge.core.trace import TraceRun
+from context_forge.graders.base import Evidence, Grader, GraderResult, Severity
+from context_forge.graders.deterministic.memory_corruption import MemoryCorruptionGrader
+from context_forge.graders.judges.base import LLMBackend
+from context_forge.graders.judges.memory_hygiene_judge import MemoryHygieneJudge
+
+
+class HybridMemoryHygieneGrader(Grader):
+    """Hybrid grader combining corruption detection and semantic evaluation.
+
+    Layer 1 - Deterministic (MemoryCorruptionGrader):
+        Checks for INVARIANTS that are always wrong:
+        - Data corruption: Existing data deleted without replacement
+
+    Layer 2 - LLM Judge (MemoryHygieneJudge):
+        Checks for SEMANTIC issues requiring understanding:
+        - Missed facts: User stated something, agent didn't save it
+        - Hallucinations: Agent saved something user never said
+        - Contradictions: Saved data conflicts with user statements
+
+    The deterministic layer catches hard failures (corruption).
+    The LLM layer catches semantic failures (wrong understanding).
+
+    Usage:
+        # With LLM (recommended - full semantic analysis)
+        from context_forge.graders.judges.backends import OllamaBackend
+
+        grader = HybridMemoryHygieneGrader(
+            llm_backend=OllamaBackend(model="llama3.2")
+        )
+        result = grader.grade(trace)
+
+        # Without LLM (only corruption detection)
+        grader = HybridMemoryHygieneGrader()
+        result = grader.grade(trace)  # Only checks for data corruption
+    """
+
+    name = "hybrid_memory_hygiene"
+    deterministic = False  # Because LLM layer is non-deterministic
+
+    def __init__(
+        self,
+        llm_backend: Optional[LLMBackend] = None,
+        skip_llm_on_corruption: bool = True,
+        llm_temperature: float = 0.0,
+    ):
+        """Initialize the hybrid grader.
+
+        Args:
+            llm_backend: Optional LLM backend for semantic checks.
+                If None, only corruption detection runs.
+            skip_llm_on_corruption: If True, skip LLM when corruption
+                is detected (saves tokens, corruption is fatal).
+            llm_temperature: Temperature for LLM calls (0.0 recommended).
+        """
+        self.llm_backend = llm_backend
+        self.skip_llm_on_corruption = skip_llm_on_corruption
+
+        # Layer 1: Corruption detection (invariants)
+        self.corruption_grader = MemoryCorruptionGrader()
+
+        # Layer 2: Semantic evaluation (understanding)
+        self.llm_judge: Optional[MemoryHygieneJudge] = None
+        if llm_backend:
+            self.llm_judge = MemoryHygieneJudge(
+                backend=llm_backend,
+                temperature=llm_temperature,
+            )
+
+    def grade(self, trace: TraceRun) -> GraderResult:
+        """Run hybrid evaluation on a trace.
+
+        1. Run corruption detection (always)
+        2. Run LLM judge (if configured and no corruption found)
+        3. Combine results
+
+        Args:
+            trace: The trace to evaluate
+
+        Returns:
+            Combined GraderResult from both layers
+        """
+        all_evidence: list[Evidence] = []
+
+        # Layer 1: Corruption detection
+        corruption_result = self.corruption_grader.grade(trace)
+        all_evidence.extend(corruption_result.evidence)
+
+        # Add layer marker
+        all_evidence.append(
+            Evidence(
+                check_name="layer_1_complete",
+                description=f"Corruption check: {'PASSED' if corruption_result.passed else 'FAILED'} (score: {corruption_result.score:.2f})",
+                severity=Severity.INFO,
+            )
+        )
+
+        # Layer 2: LLM Semantic Judge (if configured)
+        llm_result: Optional[GraderResult] = None
+
+        if self.llm_judge:
+            # Skip LLM if corruption detected (corruption is fatal)
+            if self.skip_llm_on_corruption and not corruption_result.passed:
+                all_evidence.append(
+                    Evidence(
+                        check_name="layer_2_skipped",
+                        description="Semantic evaluation skipped: data corruption detected",
+                        severity=Severity.INFO,
+                    )
+                )
+            else:
+                try:
+                    llm_result = self.llm_judge.grade(trace)
+                    all_evidence.extend(llm_result.evidence)
+
+                    all_evidence.append(
+                        Evidence(
+                            check_name="layer_2_complete",
+                            description=f"Semantic evaluation: {'PASSED' if llm_result.passed else 'FAILED'} (score: {llm_result.score:.2f})",
+                            severity=Severity.INFO,
+                        )
+                    )
+                except Exception as e:
+                    all_evidence.append(
+                        Evidence(
+                            check_name="layer_2_error",
+                            description=f"Semantic evaluation failed: {e}",
+                            severity=Severity.WARN,
+                        )
+                    )
+
+        # Combine results
+        combined_result = self._combine_results(
+            corruption_result, llm_result, all_evidence
+        )
+        return combined_result
+
+    def _combine_results(
+        self,
+        corruption_result: GraderResult,
+        llm_result: Optional[GraderResult],
+        all_evidence: list[Evidence],
+    ) -> GraderResult:
+        """Combine corruption and semantic results into final result.
+
+        Scoring:
+        - If LLM ran: average of both scores
+        - If LLM didn't run: corruption score only
+
+        Passing:
+        - Must pass BOTH layers to pass overall
+        - Corruption failure is fatal (always fails)
+        """
+        if llm_result:
+            # Both layers ran - combine
+            combined_score = (corruption_result.score + llm_result.score) / 2
+            combined_passed = corruption_result.passed and llm_result.passed
+
+            # Merge metadata
+            metadata = {
+                "corruption": corruption_result.metadata,
+                "semantic": llm_result.metadata,
+                "layers_run": ["corruption", "semantic"],
+            }
+        else:
+            # Only corruption check ran
+            combined_score = corruption_result.score
+            combined_passed = corruption_result.passed
+
+            metadata = {
+                "corruption": corruption_result.metadata,
+                "layers_run": ["corruption"],
+            }
+
+        return GraderResult(
+            grader_name=self.name,
+            passed=combined_passed,
+            score=combined_score,
+            evidence=all_evidence,
+            metadata=metadata,
+        )
+
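
A hedged sketch of how the two-layer result might be inspected, based on the metadata keys ("layers_run", "corruption", "semantic") written by _combine_results above and the usage shown in the class docstring; as before, trace is assumed to be a previously captured TraceRun.

    from context_forge.graders.hybrid import HybridMemoryHygieneGrader
    from context_forge.graders.judges.backends import OllamaBackend

    # Layer 2 only runs when a backend is supplied and no corruption was found.
    grader = HybridMemoryHygieneGrader(
        llm_backend=OllamaBackend(model="llama3.2"),
        skip_llm_on_corruption=True,
    )
    result = grader.grade(trace)  # `trace` assumed captured elsewhere

    print("layers run:", result.metadata["layers_run"])          # ["corruption"] or ["corruption", "semantic"]
    print("corruption metadata:", result.metadata["corruption"])  # counts from MemoryCorruptionGrader
    if "semantic" in result.metadata:
        print("semantic metadata:", result.metadata["semantic"])
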
context_forge/graders/judges/__init__.py
@@ -0,0 +1,11 @@
+"""LLM-based judges for ContextForge.
+
+These judges use language models to evaluate semantic aspects of traces
+that rule-based graders cannot assess. They include full reproducibility
+metadata (prompt, response, model parameters).
+"""
+
+from context_forge.graders.judges.base import LLMJudge
+from context_forge.graders.judges.memory_hygiene_judge import MemoryHygieneJudge
+
+__all__ = ["LLMJudge", "MemoryHygieneJudge"]
context_forge/graders/judges/backends/__init__.py
@@ -0,0 +1,9 @@
+"""LLM backends for ContextForge judges.
+
+Backends provide the connection to LLM providers. Ollama is the
+primary/default backend for local execution.
+"""
+
+from context_forge.graders.judges.backends.ollama import OllamaBackend
+
+__all__ = ["OllamaBackend"]
context_forge/graders/judges/backends/ollama.py
@@ -0,0 +1,173 @@
+"""Ollama backend for LLM judges.
+
+Provides local LLM execution via Ollama. This is the primary backend
+for ContextForge, enabling evaluation without sending data to cloud APIs.
+
+Supports structured output via JSON schema - pass a Pydantic model
+to get validated, typed responses.
+
+Uses the official Ollama Python SDK for cleaner, more maintainable code.
+"""
+
+import logging
+from typing import TypeVar
+
+import ollama
+from pydantic import BaseModel
+
+logger = logging.getLogger(__name__)
+
+T = TypeVar("T", bound=BaseModel)
+
+
+class OllamaBackend:
+    """Ollama backend for local LLM execution with structured output support.
+
+    Usage:
+        backend = OllamaBackend(model="llama3.2")
+
+        # Basic completion (returns string)
+        response = backend.complete("Evaluate this trace...")
+
+        # Structured output with Pydantic model (returns validated object)
+        result = backend.complete_structured(
+            prompt="Evaluate this trace...",
+            response_model=MemoryHygieneEvaluation,
+        )
+
+    Requires Ollama to be running at localhost:11434 (default).
+    """
+
+    def __init__(
+        self,
+        model: str = "llama3.2",
+        host: str = "http://localhost:11434",
+        timeout: float = 120.0,
+    ):
+        """Initialize the Ollama backend.
+
+        Args:
+            model: Ollama model to use (e.g., "llama3.2", "mistral")
+            host: Ollama host URL
+            timeout: Request timeout in seconds
+        """
+        self.model = model
+        self.host = host
+        self.timeout = timeout
+        self._client = ollama.Client(host=host, timeout=timeout)
+
+    @property
+    def model_id(self) -> str:
+        """The model identifier."""
+        return f"ollama/{self.model}"
+
+    def complete(
+        self,
+        prompt: str,
+        temperature: float = 0.0,
+        json_mode: bool = False,
+    ) -> str:
+        """Generate a completion using Ollama.
+
+        Args:
+            prompt: The prompt to complete
+            temperature: Sampling temperature (0.0 for deterministic)
+            json_mode: If True, enforce JSON output format
+
+        Returns:
+            The model's response text
+
+        Raises:
+            ollama.ResponseError: If the request fails
+            ValueError: If Ollama is not running
+        """
+        try:
+            response = self._client.generate(
+                model=self.model,
+                prompt=prompt,
+                format="json" if json_mode else None,
+                options={"temperature": temperature},
+            )
+            return response.get("response", "")
+
+        except ollama.ResponseError as e:
+            logger.error(f"Ollama request failed: {e}")
+            raise
+
+        except Exception as e:
+            if "connection" in str(e).lower() or "refused" in str(e).lower():
+                logger.error(f"Failed to connect to Ollama at {self.host}: {e}")
+                raise ValueError(
+                    f"Cannot connect to Ollama at {self.host}. "
+                    "Is Ollama running? Start it with: ollama serve"
+                ) from e
+            raise
+
+    def complete_structured(
+        self,
+        prompt: str,
+        response_model: type[T],
+        temperature: float = 0.0,
+    ) -> T:
+        """Generate a structured completion with Pydantic validation.
+
+        Uses Ollama's structured output feature to enforce a JSON schema,
+        then validates with Pydantic for type safety.
+
+        Args:
+            prompt: The prompt to complete
+            response_model: Pydantic model class for the response
+            temperature: Sampling temperature (0.0 for deterministic)
+
+        Returns:
+            Validated Pydantic model instance
+
+        Raises:
+            ollama.ResponseError: If the request fails
+            pydantic.ValidationError: If response doesn't match schema
+            ValueError: If Ollama connection fails
+        """
+        # Get JSON schema from Pydantic model
+        schema = response_model.model_json_schema()
+
+        try:
+            response = self._client.generate(
+                model=self.model,
+                prompt=prompt,
+                format=schema,  # Ollama enforces this schema
+                options={"temperature": temperature},
+            )
+            response_text = response.get("response", "")
+
+            # Parse and validate with Pydantic
+            return response_model.model_validate_json(response_text)
+
+        except ollama.ResponseError as e:
+            logger.error(f"Ollama request failed: {e}")
+            raise
+
+        except Exception as e:
+            if "connection" in str(e).lower() or "refused" in str(e).lower():
+                logger.error(f"Failed to connect to Ollama at {self.host}: {e}")
+                raise ValueError(
+                    f"Cannot connect to Ollama at {self.host}. "
+                    "Is Ollama running? Start it with: ollama serve"
+                ) from e
+            raise
+
+    def is_available(self) -> bool:
+        """Check if Ollama is running and the model is available.
+
+        Returns:
+            True if Ollama is accessible and model is pulled
+        """
+        try:
+            response = self._client.list()
+            # SDK returns ListResponse with .models attribute containing Model objects
+            model_names = [m.model.split(":")[0] for m in response.models]
+            return self.model.split(":")[0] in model_names
+        except Exception:
+            return False
+
+    def __repr__(self) -> str:
+        return f"OllamaBackend(model={self.model!r}, host={self.host!r})"
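
A short sketch of the backend's structured-output path. The Verdict model below is a hypothetical schema invented for illustration (it is not part of the package), and the example assumes a local Ollama server with the llama3.2 model already pulled; only methods visible in the file above (is_available, complete_structured, model_id) are used.

    from pydantic import BaseModel

    from context_forge.graders.judges.backends import OllamaBackend


    class Verdict(BaseModel):
        # Hypothetical response schema, for illustration only.
        passed: bool
        reasoning: str


    backend = OllamaBackend(model="llama3.2")
    if backend.is_available():
        # Ollama enforces the model's JSON schema; the reply is validated into a Verdict.
        verdict = backend.complete_structured(
            prompt="Did the agent save the user's stated preferences? Answer as JSON.",
            response_model=Verdict,
        )
        print(backend.model_id, verdict.passed, verdict.reasoning)
    else:
        print("Ollama is not running or the model is not pulled (try: ollama serve).")
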
context_forge/graders/judges/base.py
@@ -0,0 +1,158 @@
+"""Base class for LLM-based judges.
+
+LLM judges evaluate semantic aspects of traces that require
+natural language understanding. They are marked as non-deterministic
+and include full reproducibility metadata.
+
+Supports structured output with Pydantic models for reliable parsing.
+"""
+
+from abc import abstractmethod
+from typing import Any, Optional, Protocol, TypeVar
+
+from pydantic import BaseModel
+
+from context_forge.core.trace import TraceRun
+from context_forge.graders.base import Grader, GraderResult
+
+T = TypeVar("T", bound=BaseModel)
+
+
+class LLMBackend(Protocol):
+    """Protocol for LLM backends (Ollama, OpenAI, etc.)."""
+
+    @property
+    def model_id(self) -> str:
+        """The model identifier."""
+        ...
+
+    def complete(self, prompt: str, temperature: float = 0.0) -> str:
+        """Generate a completion for the given prompt.
+
+        Args:
+            prompt: The prompt to complete
+            temperature: Sampling temperature (0.0 for deterministic)
+
+        Returns:
+            The model's response text
+        """
+        ...
+
+    def complete_structured(
+        self,
+        prompt: str,
+        response_model: type[T],
+        temperature: float = 0.0,
+    ) -> T:
+        """Generate a structured completion with Pydantic validation.
+
+        Args:
+            prompt: The prompt to complete
+            response_model: Pydantic model class for the response
+            temperature: Sampling temperature (0.0 for deterministic)
+
+        Returns:
+            Validated Pydantic model instance
+        """
+        ...
+
+
+class LLMJudge(Grader):
+    """Base class for LLM-based judges.
+
+    LLM judges use language models to evaluate aspects of traces that
+    require semantic understanding. Unlike deterministic graders, they:
+    - Are marked as non-deterministic
+    - Include full reproducibility metadata (prompt, response, model)
+    - Support configurable backends (Ollama, OpenAI, etc.)
+
+    Subclasses must implement:
+    - _build_prompt(): Construct the evaluation prompt
+    - _parse_response(): Parse the LLM response into a GraderResult
+
+    Usage:
+        class MyJudge(LLMJudge):
+            def _build_prompt(self, trace):
+                return f"Evaluate this trace: {trace}"
+
+            def _parse_response(self, response, trace):
+                return GraderResult(...)
+
+        judge = MyJudge(backend=OllamaBackend(model="llama3.2"))
+        result = judge.grade(trace)
+    """
+
+    name = "llm_judge"
+    deterministic = False  # LLM outputs can vary
+
+    def __init__(
+        self,
+        backend: LLMBackend,
+        temperature: float = 0.0,
+    ):
+        """Initialize the judge with an LLM backend.
+
+        Args:
+            backend: LLM backend to use for evaluation
+            temperature: Sampling temperature (default 0.0 for consistency)
+        """
+        self.backend = backend
+        self.temperature = temperature
+
+    def grade(self, trace: TraceRun) -> GraderResult:
+        """Evaluate a trace using the LLM.
+
+        Args:
+            trace: The trace to evaluate
+
+        Returns:
+            GraderResult with LLM evaluation and reproducibility metadata
+        """
+        # Build prompt
+        prompt = self._build_prompt(trace)
+
+        # Call LLM
+        response = self.backend.complete(prompt, temperature=self.temperature)
+
+        # Parse response into result
+        result = self._parse_response(response, trace)
+
+        # Add reproducibility metadata
+        if result.metadata is None:
+            result.metadata = {}
+
+        result.metadata.update({
+            "llm": {
+                "model_id": self.backend.model_id,
+                "temperature": self.temperature,
+                "prompt": prompt,
+                "raw_response": response,
+            }
+        })
+
+        return result
+
+    @abstractmethod
+    def _build_prompt(self, trace: TraceRun) -> str:
+        """Construct the evaluation prompt for the LLM.
+
+        Args:
+            trace: The trace to evaluate
+
+        Returns:
+            The prompt string to send to the LLM
+        """
+        pass
+
+    @abstractmethod
+    def _parse_response(self, response: str, trace: TraceRun) -> GraderResult:
+        """Parse the LLM response into a GraderResult.
+
+        Args:
+            response: Raw LLM response text
+            trace: The original trace (for context)
+
+        Returns:
+            GraderResult with parsed findings
+        """
+        pass
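
A minimal subclass sketch that fills in the two abstract hooks. ToyJudge, its prompt wording, and the hard-coded PASS/FAIL scoring are illustrative assumptions, not package code; a real judge such as MemoryHygieneJudge (included in this release) builds a richer prompt and parses the model's reply instead.

    from context_forge.core.trace import TraceRun
    from context_forge.graders.base import GraderResult
    from context_forge.graders.judges.base import LLMJudge


    class ToyJudge(LLMJudge):
        # Illustrative only: real judges build richer prompts and parse structured output.
        name = "toy_judge"

        def _build_prompt(self, trace: TraceRun) -> str:
            return f"Reply PASS or FAIL for this trace with {len(trace.steps)} steps."

        def _parse_response(self, response: str, trace: TraceRun) -> GraderResult:
            passed = "PASS" in response.upper()
            return GraderResult(
                grader_name=self.name,
                passed=passed,
                score=1.0 if passed else 0.0,
                evidence=[],
                metadata={},
            )

Grading with judge = ToyJudge(backend=OllamaBackend(model="llama3.2")) then judge.grade(trace) would attach the reproducibility metadata (model_id, temperature, prompt, raw_response) automatically, as the grade() method above shows.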