contextforge_eval-0.1.0-py3-none-any.whl
This diff shows the contents of a publicly released package version as published to its public registry, and is provided for informational purposes only.
- context_forge/__init__.py +95 -0
- context_forge/core/__init__.py +55 -0
- context_forge/core/trace.py +369 -0
- context_forge/core/types.py +121 -0
- context_forge/evaluation.py +267 -0
- context_forge/exceptions.py +56 -0
- context_forge/graders/__init__.py +44 -0
- context_forge/graders/base.py +264 -0
- context_forge/graders/deterministic/__init__.py +11 -0
- context_forge/graders/deterministic/memory_corruption.py +130 -0
- context_forge/graders/hybrid.py +190 -0
- context_forge/graders/judges/__init__.py +11 -0
- context_forge/graders/judges/backends/__init__.py +9 -0
- context_forge/graders/judges/backends/ollama.py +173 -0
- context_forge/graders/judges/base.py +158 -0
- context_forge/graders/judges/memory_hygiene_judge.py +332 -0
- context_forge/graders/judges/models.py +113 -0
- context_forge/harness/__init__.py +43 -0
- context_forge/harness/user_simulator/__init__.py +70 -0
- context_forge/harness/user_simulator/adapters/__init__.py +13 -0
- context_forge/harness/user_simulator/adapters/base.py +67 -0
- context_forge/harness/user_simulator/adapters/crewai.py +100 -0
- context_forge/harness/user_simulator/adapters/langgraph.py +157 -0
- context_forge/harness/user_simulator/adapters/pydanticai.py +105 -0
- context_forge/harness/user_simulator/llm/__init__.py +5 -0
- context_forge/harness/user_simulator/llm/ollama.py +119 -0
- context_forge/harness/user_simulator/models.py +103 -0
- context_forge/harness/user_simulator/persona.py +154 -0
- context_forge/harness/user_simulator/runner.py +342 -0
- context_forge/harness/user_simulator/scenario.py +95 -0
- context_forge/harness/user_simulator/simulator.py +307 -0
- context_forge/instrumentation/__init__.py +23 -0
- context_forge/instrumentation/base.py +307 -0
- context_forge/instrumentation/instrumentors/__init__.py +17 -0
- context_forge/instrumentation/instrumentors/langchain.py +671 -0
- context_forge/instrumentation/instrumentors/langgraph.py +534 -0
- context_forge/instrumentation/tracer.py +588 -0
- context_forge/py.typed +0 -0
- contextforge_eval-0.1.0.dist-info/METADATA +420 -0
- contextforge_eval-0.1.0.dist-info/RECORD +43 -0
- contextforge_eval-0.1.0.dist-info/WHEEL +5 -0
- contextforge_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
- contextforge_eval-0.1.0.dist-info/top_level.txt +1 -0
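
The graders in the file diffs below form a two-layer evaluation pipeline: a deterministic corruption check plus an optional local LLM judge. A minimal wiring sketch, assembled from the usage examples in the docstrings that follow; the `trace` object is assumed to be a `TraceRun` produced by the instrumentation layer, which is not shown in this diff:

```python
# Minimal wiring sketch based on the docstrings below; `trace` is assumed to be
# a TraceRun captured by context_forge.instrumentation (not shown in this diff).
from context_forge.graders.hybrid import HybridMemoryHygieneGrader
from context_forge.graders.judges.backends import OllamaBackend

grader = HybridMemoryHygieneGrader(llm_backend=OllamaBackend(model="llama3.2"))
result = grader.grade(trace)

if not result.passed:
    for finding in result.evidence:
        print(f"[{finding.severity}] {finding.check_name}: {finding.description}")
```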

context_forge/graders/deterministic/memory_corruption.py
@@ -0,0 +1,130 @@
"""Deterministic Memory Corruption Grader.

Checks for TRUE INVARIANTS that are ALWAYS wrong regardless of agent path:
- Data corruption: Existing correct data was deleted or overwritten with null
- Schema violations: Wrong types, malformed data structures

These checks detect hard failures that should never happen, regardless of
the non-deterministic path the agent takes.

For semantic evaluation (did the agent save the right facts?), use the
MemoryHygieneJudge LLM-based grader instead.
"""

from context_forge.core.trace import MemoryWriteStep, TraceRun
from context_forge.graders.base import Evidence, Grader, GraderResult, Severity


class MemoryCorruptionGrader(Grader):
    """Deterministic grader for memory corruption detection.

    Checks for invariant violations that are ALWAYS wrong:
    1. Data corruption: Existing data deleted without replacement
    2. Field deletion: Required fields removed

    These are hard constraints - if violated, something is broken.

    For semantic checks (missed facts, hallucinations), use the
    MemoryHygieneJudge LLM-based grader.

    Usage:
        grader = MemoryCorruptionGrader()
        result = grader.grade(trace)
        if not result.passed:
            for error in result.errors:
                print(f"Corruption detected: {error.description}")
    """

    name = "memory_corruption"
    deterministic = True
    required_step_types = []  # Can run on any trace

    def __init__(self, fail_on_data_loss: bool = True):
        """Initialize the grader.

        Args:
            fail_on_data_loss: Treat data loss as errors (default: True)
        """
        self.fail_on_data_loss = fail_on_data_loss

    def grade(self, trace: TraceRun) -> GraderResult:
        """Check for memory corruption in a trace.

        Args:
            trace: The trace to evaluate

        Returns:
            GraderResult with corruption findings
        """
        evidence: list[Evidence] = []

        # Get memory write steps
        memory_writes = [s for s in trace.steps if isinstance(s, MemoryWriteStep)]

        # Check for data corruption (deletion of existing values)
        evidence.extend(self._check_data_corruption(memory_writes))

        # Calculate score and pass/fail
        errors = [e for e in evidence if e.severity == Severity.ERROR]

        # Score: 1.0 - 0.5 per corruption error
        score = max(0.0, 1.0 - (len(errors) * 0.5))
        passed = len(errors) == 0

        return GraderResult(
            grader_name=self.name,
            passed=passed,
            score=score,
            evidence=evidence,
            metadata={
                "total_memory_writes": len(memory_writes),
                "corruption_errors": len(errors),
            },
        )

    def _check_data_corruption(
        self, memory_writes: list[MemoryWriteStep]
    ) -> list[Evidence]:
        """Check for data corruption: existing data deleted or nullified.

        Data corruption occurs when:
        - old_value exists (not None)
        - new_value is None (data deleted)

        This is an invariant violation - correct user data should never
        be deleted without explicit user request.
        """
        evidence = []

        for write in memory_writes:
            if not write.changes:
                continue

            corrupted_fields = []
            for change in write.changes:
                # Corruption: had value, now null (data lost)
                if change.old_value is not None and change.new_value is None:
                    corrupted_fields.append(change)

            if corrupted_fields:
                severity = Severity.ERROR if self.fail_on_data_loss else Severity.WARN
                paths = [c.path for c in corrupted_fields]
                evidence.append(
                    Evidence(
                        check_name="data_corruption",
                        description=f"Existing data was deleted: {paths}",
                        severity=severity,
                        step_ids=[write.step_id],
                        details={
                            "corrupted_fields": [
                                {
                                    "path": c.path,
                                    "lost_value": c.old_value,
                                }
                                for c in corrupted_fields
                            ],
                        },
                    )
                )

        return evidence

context_forge/graders/hybrid.py
@@ -0,0 +1,190 @@
"""Hybrid graders that combine deterministic and LLM-based evaluation.

Hybrid graders leverage the strengths of both approaches:
- Deterministic: Fast, cheap, catches invariant violations (corruption)
- LLM Judge: Semantic understanding, catches meaning-related issues
"""

from typing import Optional

from context_forge.core.trace import TraceRun
from context_forge.graders.base import Evidence, Grader, GraderResult, Severity
from context_forge.graders.deterministic.memory_corruption import MemoryCorruptionGrader
from context_forge.graders.judges.base import LLMBackend
from context_forge.graders.judges.memory_hygiene_judge import MemoryHygieneJudge


class HybridMemoryHygieneGrader(Grader):
    """Hybrid grader combining corruption detection and semantic evaluation.

    Layer 1 - Deterministic (MemoryCorruptionGrader):
        Checks for INVARIANTS that are always wrong:
        - Data corruption: Existing data deleted without replacement

    Layer 2 - LLM Judge (MemoryHygieneJudge):
        Checks for SEMANTIC issues requiring understanding:
        - Missed facts: User stated something, agent didn't save it
        - Hallucinations: Agent saved something user never said
        - Contradictions: Saved data conflicts with user statements

    The deterministic layer catches hard failures (corruption).
    The LLM layer catches semantic failures (wrong understanding).

    Usage:
        # With LLM (recommended - full semantic analysis)
        from context_forge.graders.judges.backends import OllamaBackend

        grader = HybridMemoryHygieneGrader(
            llm_backend=OllamaBackend(model="llama3.2")
        )
        result = grader.grade(trace)

        # Without LLM (only corruption detection)
        grader = HybridMemoryHygieneGrader()
        result = grader.grade(trace)  # Only checks for data corruption
    """

    name = "hybrid_memory_hygiene"
    deterministic = False  # Because LLM layer is non-deterministic

    def __init__(
        self,
        llm_backend: Optional[LLMBackend] = None,
        skip_llm_on_corruption: bool = True,
        llm_temperature: float = 0.0,
    ):
        """Initialize the hybrid grader.

        Args:
            llm_backend: Optional LLM backend for semantic checks.
                If None, only corruption detection runs.
            skip_llm_on_corruption: If True, skip LLM when corruption
                is detected (saves tokens, corruption is fatal).
            llm_temperature: Temperature for LLM calls (0.0 recommended).
        """
        self.llm_backend = llm_backend
        self.skip_llm_on_corruption = skip_llm_on_corruption

        # Layer 1: Corruption detection (invariants)
        self.corruption_grader = MemoryCorruptionGrader()

        # Layer 2: Semantic evaluation (understanding)
        self.llm_judge: Optional[MemoryHygieneJudge] = None
        if llm_backend:
            self.llm_judge = MemoryHygieneJudge(
                backend=llm_backend,
                temperature=llm_temperature,
            )

    def grade(self, trace: TraceRun) -> GraderResult:
        """Run hybrid evaluation on a trace.

        1. Run corruption detection (always)
        2. Run LLM judge (if configured and no corruption found)
        3. Combine results

        Args:
            trace: The trace to evaluate

        Returns:
            Combined GraderResult from both layers
        """
        all_evidence: list[Evidence] = []

        # Layer 1: Corruption detection
        corruption_result = self.corruption_grader.grade(trace)
        all_evidence.extend(corruption_result.evidence)

        # Add layer marker
        all_evidence.append(
            Evidence(
                check_name="layer_1_complete",
                description=f"Corruption check: {'PASSED' if corruption_result.passed else 'FAILED'} (score: {corruption_result.score:.2f})",
                severity=Severity.INFO,
            )
        )

        # Layer 2: LLM Semantic Judge (if configured)
        llm_result: Optional[GraderResult] = None

        if self.llm_judge:
            # Skip LLM if corruption detected (corruption is fatal)
            if self.skip_llm_on_corruption and not corruption_result.passed:
                all_evidence.append(
                    Evidence(
                        check_name="layer_2_skipped",
                        description="Semantic evaluation skipped: data corruption detected",
                        severity=Severity.INFO,
                    )
                )
            else:
                try:
                    llm_result = self.llm_judge.grade(trace)
                    all_evidence.extend(llm_result.evidence)

                    all_evidence.append(
                        Evidence(
                            check_name="layer_2_complete",
                            description=f"Semantic evaluation: {'PASSED' if llm_result.passed else 'FAILED'} (score: {llm_result.score:.2f})",
                            severity=Severity.INFO,
                        )
                    )
                except Exception as e:
                    all_evidence.append(
                        Evidence(
                            check_name="layer_2_error",
                            description=f"Semantic evaluation failed: {e}",
                            severity=Severity.WARN,
                        )
                    )

        # Combine results
        combined_result = self._combine_results(
            corruption_result, llm_result, all_evidence
        )
        return combined_result

    def _combine_results(
        self,
        corruption_result: GraderResult,
        llm_result: Optional[GraderResult],
        all_evidence: list[Evidence],
    ) -> GraderResult:
        """Combine corruption and semantic results into final result.

        Scoring:
        - If LLM ran: average of both scores
        - If LLM didn't run: corruption score only

        Passing:
        - Must pass BOTH layers to pass overall
        - Corruption failure is fatal (always fails)
        """
        if llm_result:
            # Both layers ran - combine
            combined_score = (corruption_result.score + llm_result.score) / 2
            combined_passed = corruption_result.passed and llm_result.passed

            # Merge metadata
            metadata = {
                "corruption": corruption_result.metadata,
                "semantic": llm_result.metadata,
                "layers_run": ["corruption", "semantic"],
            }
        else:
            # Only corruption check ran
            combined_score = corruption_result.score
            combined_passed = corruption_result.passed

            metadata = {
                "corruption": corruption_result.metadata,
                "layers_run": ["corruption"],
            }

        return GraderResult(
            grader_name=self.name,
            passed=combined_passed,
            score=combined_score,
            evidence=all_evidence,
            metadata=metadata,
        )
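
Continuing the wiring sketch from the top of the diff, a short illustration of inspecting the combined result; it uses only the metadata keys and layer-marker evidence built by `grade()` and `_combine_results()` above, and assumes `grader` and `trace` from the earlier sketch:

```python
# Illustrative inspection of a hybrid result, using only fields set above.
result = grader.grade(trace)

print(result.passed, round(result.score, 2))
print(result.metadata["layers_run"])  # ["corruption"] or ["corruption", "semantic"]

# Layer markers are appended as INFO evidence by grade() above.
for item in result.evidence:
    if item.check_name in ("layer_1_complete", "layer_2_complete", "layer_2_skipped"):
        print(item.description)
```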

context_forge/graders/judges/__init__.py
@@ -0,0 +1,11 @@
"""LLM-based judges for ContextForge.

These judges use language models to evaluate semantic aspects of traces
that rule-based graders cannot assess. They include full reproducibility
metadata (prompt, response, model parameters).
"""

from context_forge.graders.judges.base import LLMJudge
from context_forge.graders.judges.memory_hygiene_judge import MemoryHygieneJudge

__all__ = ["LLMJudge", "MemoryHygieneJudge"]

context_forge/graders/judges/backends/__init__.py
@@ -0,0 +1,9 @@
"""LLM backends for ContextForge judges.

Backends provide the connection to LLM providers. Ollama is the
primary/default backend for local execution.
"""

from context_forge.graders.judges.backends.ollama import OllamaBackend

__all__ = ["OllamaBackend"]

context_forge/graders/judges/backends/ollama.py
@@ -0,0 +1,173 @@
"""Ollama backend for LLM judges.

Provides local LLM execution via Ollama. This is the primary backend
for ContextForge, enabling evaluation without sending data to cloud APIs.

Supports structured output via JSON schema - pass a Pydantic model
to get validated, typed responses.

Uses the official Ollama Python SDK for cleaner, more maintainable code.
"""

import logging
from typing import TypeVar

import ollama
from pydantic import BaseModel

logger = logging.getLogger(__name__)

T = TypeVar("T", bound=BaseModel)


class OllamaBackend:
    """Ollama backend for local LLM execution with structured output support.

    Usage:
        backend = OllamaBackend(model="llama3.2")

        # Basic completion (returns string)
        response = backend.complete("Evaluate this trace...")

        # Structured output with Pydantic model (returns validated object)
        result = backend.complete_structured(
            prompt="Evaluate this trace...",
            response_model=MemoryHygieneEvaluation,
        )

    Requires Ollama to be running at localhost:11434 (default).
    """

    def __init__(
        self,
        model: str = "llama3.2",
        host: str = "http://localhost:11434",
        timeout: float = 120.0,
    ):
        """Initialize the Ollama backend.

        Args:
            model: Ollama model to use (e.g., "llama3.2", "mistral")
            host: Ollama host URL
            timeout: Request timeout in seconds
        """
        self.model = model
        self.host = host
        self.timeout = timeout
        self._client = ollama.Client(host=host, timeout=timeout)

    @property
    def model_id(self) -> str:
        """The model identifier."""
        return f"ollama/{self.model}"

    def complete(
        self,
        prompt: str,
        temperature: float = 0.0,
        json_mode: bool = False,
    ) -> str:
        """Generate a completion using Ollama.

        Args:
            prompt: The prompt to complete
            temperature: Sampling temperature (0.0 for deterministic)
            json_mode: If True, enforce JSON output format

        Returns:
            The model's response text

        Raises:
            ollama.ResponseError: If the request fails
            ValueError: If Ollama is not running
        """
        try:
            response = self._client.generate(
                model=self.model,
                prompt=prompt,
                format="json" if json_mode else None,
                options={"temperature": temperature},
            )
            return response.get("response", "")

        except ollama.ResponseError as e:
            logger.error(f"Ollama request failed: {e}")
            raise

        except Exception as e:
            if "connection" in str(e).lower() or "refused" in str(e).lower():
                logger.error(f"Failed to connect to Ollama at {self.host}: {e}")
                raise ValueError(
                    f"Cannot connect to Ollama at {self.host}. "
                    "Is Ollama running? Start it with: ollama serve"
                ) from e
            raise

    def complete_structured(
        self,
        prompt: str,
        response_model: type[T],
        temperature: float = 0.0,
    ) -> T:
        """Generate a structured completion with Pydantic validation.

        Uses Ollama's structured output feature to enforce a JSON schema,
        then validates with Pydantic for type safety.

        Args:
            prompt: The prompt to complete
            response_model: Pydantic model class for the response
            temperature: Sampling temperature (0.0 for deterministic)

        Returns:
            Validated Pydantic model instance

        Raises:
            ollama.ResponseError: If the request fails
            pydantic.ValidationError: If response doesn't match schema
            ValueError: If Ollama connection fails
        """
        # Get JSON schema from Pydantic model
        schema = response_model.model_json_schema()

        try:
            response = self._client.generate(
                model=self.model,
                prompt=prompt,
                format=schema,  # Ollama enforces this schema
                options={"temperature": temperature},
            )
            response_text = response.get("response", "")

            # Parse and validate with Pydantic
            return response_model.model_validate_json(response_text)

        except ollama.ResponseError as e:
            logger.error(f"Ollama request failed: {e}")
            raise

        except Exception as e:
            if "connection" in str(e).lower() or "refused" in str(e).lower():
                logger.error(f"Failed to connect to Ollama at {self.host}: {e}")
                raise ValueError(
                    f"Cannot connect to Ollama at {self.host}. "
                    "Is Ollama running? Start it with: ollama serve"
                ) from e
            raise

    def is_available(self) -> bool:
        """Check if Ollama is running and the model is available.

        Returns:
            True if Ollama is accessible and model is pulled
        """
        try:
            response = self._client.list()
            # SDK returns ListResponse with .models attribute containing Model objects
            model_names = [m.model.split(":")[0] for m in response.models]
            return self.model.split(":")[0] in model_names
        except Exception:
            return False

    def __repr__(self) -> str:
        return f"OllamaBackend(model={self.model!r}, host={self.host!r})"
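
To illustrate the structured-output path above, a short sketch pairing `complete_structured` with a caller-defined Pydantic model; `JudgeVerdict` is a hypothetical response model used only for this example and is not defined anywhere in the package:

```python
# Hypothetical example of the structured-output path; JudgeVerdict is
# illustrative only and not part of the package.
from pydantic import BaseModel

from context_forge.graders.judges.backends import OllamaBackend


class JudgeVerdict(BaseModel):
    passed: bool
    reasoning: str


backend = OllamaBackend(model="llama3.2")
if backend.is_available():  # checks that Ollama is up and the model is pulled
    verdict = backend.complete_structured(
        prompt="Did the agent store the user's stated preferences? Reply as JSON.",
        response_model=JudgeVerdict,
    )
    print(verdict.passed, verdict.reasoning)
```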

context_forge/graders/judges/base.py
@@ -0,0 +1,158 @@
"""Base class for LLM-based judges.

LLM judges evaluate semantic aspects of traces that require
natural language understanding. They are marked as non-deterministic
and include full reproducibility metadata.

Supports structured output with Pydantic models for reliable parsing.
"""

from abc import abstractmethod
from typing import Any, Optional, Protocol, TypeVar

from pydantic import BaseModel

from context_forge.core.trace import TraceRun
from context_forge.graders.base import Grader, GraderResult

T = TypeVar("T", bound=BaseModel)


class LLMBackend(Protocol):
    """Protocol for LLM backends (Ollama, OpenAI, etc.)."""

    @property
    def model_id(self) -> str:
        """The model identifier."""
        ...

    def complete(self, prompt: str, temperature: float = 0.0) -> str:
        """Generate a completion for the given prompt.

        Args:
            prompt: The prompt to complete
            temperature: Sampling temperature (0.0 for deterministic)

        Returns:
            The model's response text
        """
        ...

    def complete_structured(
        self,
        prompt: str,
        response_model: type[T],
        temperature: float = 0.0,
    ) -> T:
        """Generate a structured completion with Pydantic validation.

        Args:
            prompt: The prompt to complete
            response_model: Pydantic model class for the response
            temperature: Sampling temperature (0.0 for deterministic)

        Returns:
            Validated Pydantic model instance
        """
        ...


class LLMJudge(Grader):
    """Base class for LLM-based judges.

    LLM judges use language models to evaluate aspects of traces that
    require semantic understanding. Unlike deterministic graders, they:
    - Are marked as non-deterministic
    - Include full reproducibility metadata (prompt, response, model)
    - Support configurable backends (Ollama, OpenAI, etc.)

    Subclasses must implement:
    - _build_prompt(): Construct the evaluation prompt
    - _parse_response(): Parse the LLM response into a GraderResult

    Usage:
        class MyJudge(LLMJudge):
            def _build_prompt(self, trace):
                return f"Evaluate this trace: {trace}"

            def _parse_response(self, response, trace):
                return GraderResult(...)

        judge = MyJudge(backend=OllamaBackend(model="llama3.2"))
        result = judge.grade(trace)
    """

    name = "llm_judge"
    deterministic = False  # LLM outputs can vary

    def __init__(
        self,
        backend: LLMBackend,
        temperature: float = 0.0,
    ):
        """Initialize the judge with an LLM backend.

        Args:
            backend: LLM backend to use for evaluation
            temperature: Sampling temperature (default 0.0 for consistency)
        """
        self.backend = backend
        self.temperature = temperature

    def grade(self, trace: TraceRun) -> GraderResult:
        """Evaluate a trace using the LLM.

        Args:
            trace: The trace to evaluate

        Returns:
            GraderResult with LLM evaluation and reproducibility metadata
        """
        # Build prompt
        prompt = self._build_prompt(trace)

        # Call LLM
        response = self.backend.complete(prompt, temperature=self.temperature)

        # Parse response into result
        result = self._parse_response(response, trace)

        # Add reproducibility metadata
        if result.metadata is None:
            result.metadata = {}

        result.metadata.update({
            "llm": {
                "model_id": self.backend.model_id,
                "temperature": self.temperature,
                "prompt": prompt,
                "raw_response": response,
            }
        })

        return result

    @abstractmethod
    def _build_prompt(self, trace: TraceRun) -> str:
        """Construct the evaluation prompt for the LLM.

        Args:
            trace: The trace to evaluate

        Returns:
            The prompt string to send to the LLM
        """
        pass

    @abstractmethod
    def _parse_response(self, response: str, trace: TraceRun) -> GraderResult:
        """Parse the LLM response into a GraderResult.

        Args:
            response: Raw LLM response text
            trace: The original trace (for context)
        """
        pass
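
A minimal sketch of a custom judge built on the `LLMJudge` base class above; the class, prompt, and pass/fail heuristic are illustrative only, and the `GraderResult`/`Evidence` constructor fields follow the usage shown in the graders earlier in this diff:

```python
# Illustrative LLMJudge subclass (not part of the package). Constructor fields
# mirror how GraderResult and Evidence are built in the graders above; the
# prompt and parsing heuristic are placeholders.
from context_forge.core.trace import TraceRun
from context_forge.graders.base import Evidence, GraderResult, Severity
from context_forge.graders.judges.base import LLMJudge
from context_forge.graders.judges.backends import OllamaBackend


class ToneJudge(LLMJudge):
    name = "tone_judge"

    def _build_prompt(self, trace: TraceRun) -> str:
        return f"Answer PASS or FAIL: did the assistant stay polite?\n\nTrace:\n{trace}"

    def _parse_response(self, response: str, trace: TraceRun) -> GraderResult:
        passed = "PASS" in response.upper()
        return GraderResult(
            grader_name=self.name,
            passed=passed,
            score=1.0 if passed else 0.0,
            evidence=[
                Evidence(
                    check_name="tone",
                    description=response.strip()[:200],
                    severity=Severity.INFO if passed else Severity.ERROR,
                )
            ],
        )


judge = ToneJudge(backend=OllamaBackend(model="llama3.2"))
```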