contextforge_eval-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. context_forge/__init__.py +95 -0
  2. context_forge/core/__init__.py +55 -0
  3. context_forge/core/trace.py +369 -0
  4. context_forge/core/types.py +121 -0
  5. context_forge/evaluation.py +267 -0
  6. context_forge/exceptions.py +56 -0
  7. context_forge/graders/__init__.py +44 -0
  8. context_forge/graders/base.py +264 -0
  9. context_forge/graders/deterministic/__init__.py +11 -0
  10. context_forge/graders/deterministic/memory_corruption.py +130 -0
  11. context_forge/graders/hybrid.py +190 -0
  12. context_forge/graders/judges/__init__.py +11 -0
  13. context_forge/graders/judges/backends/__init__.py +9 -0
  14. context_forge/graders/judges/backends/ollama.py +173 -0
  15. context_forge/graders/judges/base.py +158 -0
  16. context_forge/graders/judges/memory_hygiene_judge.py +332 -0
  17. context_forge/graders/judges/models.py +113 -0
  18. context_forge/harness/__init__.py +43 -0
  19. context_forge/harness/user_simulator/__init__.py +70 -0
  20. context_forge/harness/user_simulator/adapters/__init__.py +13 -0
  21. context_forge/harness/user_simulator/adapters/base.py +67 -0
  22. context_forge/harness/user_simulator/adapters/crewai.py +100 -0
  23. context_forge/harness/user_simulator/adapters/langgraph.py +157 -0
  24. context_forge/harness/user_simulator/adapters/pydanticai.py +105 -0
  25. context_forge/harness/user_simulator/llm/__init__.py +5 -0
  26. context_forge/harness/user_simulator/llm/ollama.py +119 -0
  27. context_forge/harness/user_simulator/models.py +103 -0
  28. context_forge/harness/user_simulator/persona.py +154 -0
  29. context_forge/harness/user_simulator/runner.py +342 -0
  30. context_forge/harness/user_simulator/scenario.py +95 -0
  31. context_forge/harness/user_simulator/simulator.py +307 -0
  32. context_forge/instrumentation/__init__.py +23 -0
  33. context_forge/instrumentation/base.py +307 -0
  34. context_forge/instrumentation/instrumentors/__init__.py +17 -0
  35. context_forge/instrumentation/instrumentors/langchain.py +671 -0
  36. context_forge/instrumentation/instrumentors/langgraph.py +534 -0
  37. context_forge/instrumentation/tracer.py +588 -0
  38. context_forge/py.typed +0 -0
  39. contextforge_eval-0.1.0.dist-info/METADATA +420 -0
  40. contextforge_eval-0.1.0.dist-info/RECORD +43 -0
  41. contextforge_eval-0.1.0.dist-info/WHEEL +5 -0
  42. contextforge_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  43. contextforge_eval-0.1.0.dist-info/top_level.txt +1 -0
context_forge/graders/judges/memory_hygiene_judge.py
@@ -0,0 +1,332 @@
+"""Memory Hygiene Judge - LLM-based semantic evaluation.
+
+Layer 2 of the hybrid Memory Hygiene Grader. Uses an LLM to evaluate:
+- Did the user provide new facts about themselves?
+- Do those facts contradict stored memory?
+- Was memory appropriately updated?
+
+These semantic checks require natural language understanding.
+
+Uses Ollama's structured output feature for reliable JSON parsing.
+"""
+
+import json
+import logging
+from typing import Any
+
+from pydantic import ValidationError
+
+from context_forge.core.trace import (
+    MemoryReadStep,
+    MemoryWriteStep,
+    TraceRun,
+    UserInputStep,
+)
+from context_forge.graders.base import Evidence, GraderResult, Severity
+from context_forge.graders.judges.base import LLMBackend, LLMJudge
+from context_forge.graders.judges.models import MemoryHygieneEvaluation
+
+logger = logging.getLogger(__name__)
+
+
+MEMORY_HYGIENE_PROMPT_TEMPLATE = '''You are evaluating an AI agent's memory management.
+
+## Task
+Analyze whether the agent correctly identified and saved ONLY facts the user explicitly stated.
+
+## Current Memory (what the agent knew at session start)
+{memory_state}
+
+## User Messages (what the user said during the session)
+{user_messages}
+
+## Memory Changes (field-level differences, old_value -> new_value)
+{memory_writes}
+
+## Step-by-Step Evaluation
+
+### Step 1: List user-stated facts
+Read the user messages carefully. What concrete facts did the user explicitly state about themselves?
+- Only include facts the user ACTUALLY said
+- Example: If user says "I have a Tesla Model 3", that's a user fact
+- Example: If user says "When should I charge?", that contains NO facts about themselves
+
+### Step 2: Check each memory change
+For EACH field in "Memory Changes", ask:
+- Did the user explicitly state this information? If yes → correct save
+- Did the user NOT mention this at all? If the agent invented it → HALLUCINATION
+- Was existing correct data deleted or overwritten incorrectly? → data loss
+
+### Step 3: Check for missed facts
+For each user-stated fact from Step 1, was it saved to memory? If not → missed fact
+
+## What IS a hallucination (flag these!)
+- Agent saves "user plans to buy solar" but user never mentioned solar → HALLUCINATION
+- Agent saves "user prefers morning charging" but user never stated a preference → HALLUCINATION
+- Agent saves ANY new semantic content that the user did not explicitly state → HALLUCINATION
+
+## What is NOT a hallucination (ignore these)
+- Timestamp/metadata changes (updated_at, created_at, IDs)
+- Preserving existing data that was already in memory
+- Reformatting user's words (e.g., "12kW" saved as "12000W")
+
+## Critical Rule
+If the agent writes NEW information to memory that the user did NOT say, that is a hallucination.
+The agent should ONLY save facts the user explicitly stated.
+
+Evaluate the memory management and provide your assessment.'''
+
+
+class MemoryHygieneJudge(LLMJudge):
+    """LLM-based judge for memory hygiene semantic evaluation.
+
+    Evaluates whether the agent correctly identified user facts and
+    updated memory appropriately. Catches issues that rule-based
+    checks cannot detect:
+    - User stated new fact but it wasn't saved
+    - Agent saved something user didn't say (hallucination)
+    - Contradictions between user statements and memory updates
+
+    Uses Pydantic models for structured output validation.
+
+    Usage:
+        from context_forge.graders.judges.backends import OllamaBackend
+
+        judge = MemoryHygieneJudge(backend=OllamaBackend(model="llama3.2"))
+        result = judge.grade(trace)
+
+        if not result.passed:
+            print("Issues found:")
+            for evidence in result.errors:
+                print(f"  - {evidence.description}")
+    """
+
+    name = "memory_hygiene_judge"
+    required_step_types = ["user_input"]
+
+    def _build_prompt(self, trace: TraceRun) -> str:
+        """Build the evaluation prompt from trace data.
+
+        Extracts user inputs, memory reads, and memory writes from
+        the trace and formats them for LLM evaluation.
+        """
+        # Extract relevant steps
+        user_inputs = [s for s in trace.steps if isinstance(s, UserInputStep)]
+        memory_reads = [s for s in trace.steps if isinstance(s, MemoryReadStep)]
+        memory_writes = [s for s in trace.steps if isinstance(s, MemoryWriteStep)]
+
+        # Format memory state (from reads)
+        if memory_reads:
+            memory_state = self._format_memory_state(memory_reads)
+        else:
+            memory_state = "No memory was read at session start."
+
+        # Format user messages
+        if user_inputs:
+            user_messages = self._format_user_messages(user_inputs)
+        else:
+            user_messages = "No user messages in this session."
+
+        # Format memory writes
+        if memory_writes:
+            memory_writes_text = self._format_memory_writes(memory_writes)
+        else:
+            memory_writes_text = "No memory updates were made."
+
+        # Build prompt
+        prompt = MEMORY_HYGIENE_PROMPT_TEMPLATE.format(
+            memory_state=memory_state,
+            user_messages=user_messages,
+            memory_writes=memory_writes_text,
+        )
+        return prompt
+
+    def _format_memory_state(self, memory_reads: list[MemoryReadStep]) -> str:
+        """Format memory read results for the prompt."""
+        parts = []
+        for i, read in enumerate(memory_reads, 1):
+            if read.results:
+                # Pretty print the results
+                results_str = json.dumps(read.results, indent=2, default=str)
+                parts.append(f"Read {i}:\n{results_str}")
+            else:
+                parts.append(f"Read {i}: (empty)")
+        return "\n\n".join(parts)
+
+    def _format_user_messages(self, user_inputs: list[UserInputStep]) -> str:
+        """Format user input messages for the prompt."""
+        parts = []
+        for i, inp in enumerate(user_inputs, 1):
+            parts.append(f"Message {i}: {inp.content}")
+        return "\n".join(parts)
+
+    def _format_memory_writes(self, memory_writes: list[MemoryWriteStep]) -> str:
+        """Format memory writes for the prompt."""
+        parts = []
+        for i, write in enumerate(memory_writes, 1):
+            if write.changes:
+                changes_str = "\n".join(
+                    f"  - {c.path}: {c.old_value} -> {c.new_value}"
+                    for c in write.changes
+                )
+                parts.append(f"Write {i} (to {write.namespace}):\n{changes_str}")
+            else:
+                parts.append(f"Write {i}: {write.data}")
+        return "\n\n".join(parts)
+
+    def grade(self, trace: TraceRun) -> GraderResult:
+        """Evaluate a trace using structured LLM output.
+
+        Overrides base class to use complete_structured for reliable parsing.
+
+        Args:
+            trace: The trace to evaluate
+
+        Returns:
+            GraderResult with LLM evaluation
+        """
+        prompt = self._build_prompt(trace)
+
+        try:
+            # Use structured output - Ollama enforces the schema
+            evaluation = self.backend.complete_structured(
+                prompt=prompt,
+                response_model=MemoryHygieneEvaluation,
+                temperature=self.temperature,
+            )
+
+            # Convert to GraderResult
+            evidence = self._evaluation_to_evidence(evaluation)
+            result = GraderResult(
+                grader_name=self.name,
+                passed=evaluation.passed,
+                score=evaluation.score,
+                evidence=evidence,
+            )
+
+            # Add reproducibility metadata
+            result.metadata = {
+                "llm": {
+                    "model_id": self.backend.model_id,
+                    "temperature": self.temperature,
+                    "prompt": prompt,
+                }
+            }
+
+            return result
+
+        except (ValidationError, ValueError) as e:
+            logger.warning(f"Structured output failed: {e}")
+
+            # Fallback: return a warning result
+            return GraderResult(
+                grader_name=self.name,
+                passed=True,  # Don't fail just because of LLM error
+                score=0.5,
+                evidence=[
+                    Evidence(
+                        check_name="llm_error",
+                        description=f"LLM evaluation failed: {e}",
+                        severity=Severity.WARN,
+                    )
+                ],
+                metadata={
+                    "llm": {
+                        "model_id": self.backend.model_id,
+                        "temperature": self.temperature,
+                        "prompt": prompt,
+                        "error": str(e),
+                    }
+                },
+            )
+
+    def _parse_response(self, response: str, trace: TraceRun) -> GraderResult:
+        """Parse LLM response (not used with structured output).
+
+        This method is kept for compatibility but the grade() method
+        uses complete_structured() instead.
+        """
+        # This shouldn't be called when using structured output
+        raise NotImplementedError(
+            "MemoryHygieneJudge uses structured output via grade() method"
+        )

+    def _evaluation_to_evidence(
+        self, evaluation: MemoryHygieneEvaluation
+    ) -> list[Evidence]:
+        """Convert a validated evaluation to evidence items."""
+        evidence: list[Evidence] = []
+
+        # Missed facts (ERROR)
+        for item in evaluation.facts_missed:
+            evidence.append(
+                Evidence(
+                    check_name="missed_fact",
+                    description=f"User stated '{item.fact}' but it was not saved",
+                    severity=Severity.ERROR,
+                    details={
+                        "fact": item.fact,
+                        "should_have_updated": item.should_have_updated,
+                    },
+                )
+            )
+
+        # Hallucinations (ERROR)
+        for item in evaluation.hallucinations:
+            evidence.append(
+                Evidence(
+                    check_name="hallucination",
+                    description=f"Agent saved '{item.saved}' which user did not state",
+                    severity=Severity.ERROR,
+                    details={
+                        "saved": item.saved,
+                        "reason": item.reason,
+                    },
+                )
+            )
+
+        # Data loss (ERROR)
+        for item in evaluation.data_incorrectly_lost:
+            evidence.append(
+                Evidence(
+                    check_name="incorrect_data_loss",
+                    description=f"Field '{item.field}' was incorrectly overwritten",
+                    severity=Severity.ERROR,
+                    details={
+                        "field": item.field,
+                        "old_value": item.old_value,
+                        "reason": item.reason,
+                    },
+                )
+            )
+
+        # Correctly saved facts (INFO - positive feedback)
+        for item in evaluation.facts_correctly_saved:
+            evidence.append(
+                Evidence(
+                    check_name="correct_save",
+                    description=f"Correctly saved: '{item.fact}'",
+                    severity=Severity.INFO,
+                    details={
+                        "fact": item.fact,
+                        "saved_as": item.saved_as,
+                    },
+                )
+            )
+
+        # Summary (INFO)
+        evidence.append(
+            Evidence(
+                check_name="llm_summary",
+                description=evaluation.summary,
+                severity=Severity.INFO,
+                details={
+                    "user_facts_count": len(evaluation.user_facts_stated),
+                    "correctly_saved_count": len(evaluation.facts_correctly_saved),
+                    "missed_count": len(evaluation.facts_missed),
+                    "hallucinations_count": len(evaluation.hallucinations),
+                },
+            )
+        )
+
+        return evidence
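For orientation, here is a minimal sketch of how a caller might consume this judge's output. It assumes a local Ollama server and a previously captured TraceRun named `trace`; all other names come from the file above or its class docstring.

    from context_forge.graders.base import Severity
    from context_forge.graders.judges.backends import OllamaBackend
    from context_forge.graders.judges.memory_hygiene_judge import MemoryHygieneJudge

    judge = MemoryHygieneJudge(backend=OllamaBackend(model="llama3.2"))
    result = judge.grade(trace)  # trace: a TraceRun captured elsewhere

    print(f"{result.grader_name}: passed={result.passed}, score={result.score:.2f}")
    for item in result.evidence:
        if item.severity == Severity.ERROR:
            # missed_fact, hallucination, or incorrect_data_loss
            print(f"  [{item.check_name}] {item.description}")

Note the fallback path in grade(): a structured-output failure degrades to a WARN with score 0.5 rather than a hard failure, so callers should not treat passed=True alone as a clean bill of health.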
context_forge/graders/judges/models.py
@@ -0,0 +1,113 @@
+"""Pydantic models for LLM judge responses.
+
+Using Pydantic models for LLM output provides:
+- Structured validation of responses
+- Clear schema documentation
+- Better error messages when parsing fails
+- Type safety throughout the codebase
+"""
+
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+
+class UserFact(BaseModel):
+    """A fact the user stated about themselves."""
+
+    fact: str = Field(description="Description of what the user stated")
+    topic: str = Field(description="Category: equipment, schedule, preference, household, location")
+
+
+class CorrectSave(BaseModel):
+    """A fact that was correctly saved to memory."""
+
+    fact: str = Field(description="What the user stated")
+    saved_as: str = Field(description="How it was saved to memory")
+
+
+class MissedFact(BaseModel):
+    """A fact the user stated but was not saved."""
+
+    fact: str = Field(description="What the user stated")
+    should_have_updated: str = Field(description="Which memory field should have been updated")
+
+
+class Hallucination(BaseModel):
+    """Something saved to memory that the user did not state."""
+
+    saved: str = Field(description="What was incorrectly saved")
+    reason: str = Field(description="Why this is considered a hallucination")
+
+
+class DataLoss(BaseModel):
+    """Correct data that was incorrectly lost or overwritten."""
+
+    field: str = Field(description="Which field was affected")
+    old_value: str = Field(description="The value that was lost")
+    reason: str = Field(description="Why this loss was incorrect")
+
+
+class MemoryHygieneEvaluation(BaseModel):
+    """Complete evaluation result from the Memory Hygiene Judge.
+
+    This model defines the expected structure of the LLM's response.
+    The LLM is prompted to return JSON matching this schema.
+    """
+
+    user_facts_stated: list[UserFact] = Field(
+        default_factory=list,
+        description="Facts the user stated about themselves during the session",
+    )
+    facts_correctly_saved: list[CorrectSave] = Field(
+        default_factory=list,
+        description="Facts that were correctly identified and saved",
+    )
+    facts_missed: list[MissedFact] = Field(
+        default_factory=list,
+        description="Facts the user stated but were not saved to memory",
+    )
+    hallucinations: list[Hallucination] = Field(
+        default_factory=list,
+        description="Things saved to memory that the user did not actually state",
+    )
+    data_incorrectly_lost: list[DataLoss] = Field(
+        default_factory=list,
+        description="Correct data that was incorrectly overwritten or deleted",
+    )
+    summary: str = Field(
+        description="One sentence summary of memory management quality"
+    )
+    score: float = Field(
+        ge=0.0,
+        le=1.0,
+        description="Quality score from 0.0 (worst) to 1.0 (best)",
+    )
+    passed: bool = Field(
+        description="Whether the memory management passed evaluation",
+    )
+
+    @classmethod
+    def get_json_schema_prompt(cls) -> str:
+        """Get a prompt-friendly description of the expected JSON schema."""
+        # Note: Using single braces - this string is NOT passed through .format()
+        return """{
+  "user_facts_stated": [
+    {"fact": "description of fact", "topic": "equipment|schedule|preference|household|location"}
+  ],
+  "facts_correctly_saved": [
+    {"fact": "what user stated", "saved_as": "how it was saved"}
+  ],
+  "facts_missed": [
+    {"fact": "what user stated", "should_have_updated": "which memory field"}
+  ],
+  "hallucinations": [
+    {"saved": "what was incorrectly saved", "reason": "why this is wrong"}
+  ],
+  "data_incorrectly_lost": [
+    {"field": "which field", "old_value": "what was lost", "reason": "why this was wrong"}
+  ],
+  "summary": "One sentence summary",
+  "score": 0.0 to 1.0,
+  "passed": true or false
+}"""
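These models are the contract for the judge's structured output. A minimal sketch of the validation they enable, assuming Pydantic v2 (model_validate_json); the raw payload below is invented for illustration and mirrors the shape documented by get_json_schema_prompt():

    from pydantic import ValidationError

    from context_forge.graders.judges.models import MemoryHygieneEvaluation

    # Invented payload for illustration only.
    raw = '''{
      "user_facts_stated": [{"fact": "Owns a Tesla Model 3", "topic": "equipment"}],
      "facts_correctly_saved": [{"fact": "Owns a Tesla Model 3", "saved_as": "vehicle: Tesla Model 3"}],
      "facts_missed": [],
      "hallucinations": [],
      "data_incorrectly_lost": [],
      "summary": "Memory was updated correctly.",
      "score": 1.0,
      "passed": true
    }'''

    try:
        evaluation = MemoryHygieneEvaluation.model_validate_json(raw)
        print(evaluation.summary, evaluation.score)
    except ValidationError as e:
        # Out-of-range scores, wrong types, or missing fields fail here
        # instead of propagating malformed data into grading.
        print(f"LLM response did not match the schema: {e}")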
context_forge/harness/__init__.py
@@ -0,0 +1,43 @@
+"""ContextForge harness module for evaluation and simulation."""
+
+from context_forge.harness.user_simulator import (
+    AgentAdapter,
+    BatchSimulationRunner,
+    CrewAIAdapter,
+    GenerativeScenario,
+    Goal,
+    LangGraphAdapter,
+    LLMUserSimulator,
+    Persona,
+    PydanticAIAdapter,
+    ScriptedScenario,
+    ScriptedUserSimulator,
+    SimulationResult,
+    SimulationRunner,
+    SimulationState,
+    UserSimulator,
+)
+
+__all__ = [
+    # Runner
+    "SimulationRunner",
+    "BatchSimulationRunner",
+    # State
+    "SimulationState",
+    "SimulationResult",
+    # Personas & Scenarios
+    "Persona",
+    "Goal",
+    "ScriptedScenario",
+    "GenerativeScenario",
+    # Protocols
+    "UserSimulator",
+    "AgentAdapter",
+    # Simulators
+    "LLMUserSimulator",
+    "ScriptedUserSimulator",
+    # Adapters
+    "LangGraphAdapter",
+    "CrewAIAdapter",
+    "PydanticAIAdapter",
+]
context_forge/harness/user_simulator/__init__.py
@@ -0,0 +1,70 @@
+"""User simulator module for generating multi-turn conversations with agents."""
+
+from .adapters import (
+    AgentAdapter,
+    CrewAIAdapter,
+    LangGraphAdapter,
+    PydanticAIAdapter,
+)
+from .llm import OllamaClient, OllamaConfig
+from .models import (
+    ConversationRole,
+    SimulationResult,
+    SimulationState,
+    SimulationTurn,
+)
+from .persona import (
+    Behavior,
+    CommunicationStyle,
+    Goal,
+    Persona,
+    TechnicalLevel,
+)
+from .runner import BatchSimulationRunner, SimulationRunner
+from .scenario import (
+    GenerativeScenario,
+    Scenario,
+    ScriptedScenario,
+    ScriptedTurn,
+    TerminationCondition,
+)
+from .simulator import (
+    LLMUserSimulator,
+    ScriptedUserSimulator,
+    UserSimulator,
+)
+
+__all__ = [
+    # Models
+    "SimulationState",
+    "SimulationResult",
+    "SimulationTurn",
+    "ConversationRole",
+    # Personas
+    "Persona",
+    "Behavior",
+    "Goal",
+    "CommunicationStyle",
+    "TechnicalLevel",
+    # Scenarios
+    "Scenario",
+    "ScriptedScenario",
+    "GenerativeScenario",
+    "ScriptedTurn",
+    "TerminationCondition",
+    # Simulators
+    "UserSimulator",
+    "LLMUserSimulator",
+    "ScriptedUserSimulator",
+    # Adapters
+    "AgentAdapter",
+    "LangGraphAdapter",
+    "CrewAIAdapter",
+    "PydanticAIAdapter",
+    # Runner
+    "SimulationRunner",
+    "BatchSimulationRunner",
+    # LLM
+    "OllamaClient",
+    "OllamaConfig",
+]
context_forge/harness/user_simulator/adapters/__init__.py
@@ -0,0 +1,13 @@
+"""Agent adapters for different frameworks."""
+
+from .base import AgentAdapter
+from .crewai import CrewAIAdapter
+from .langgraph import LangGraphAdapter
+from .pydanticai import PydanticAIAdapter
+
+__all__ = [
+    "AgentAdapter",
+    "LangGraphAdapter",
+    "CrewAIAdapter",
+    "PydanticAIAdapter",
+]
context_forge/harness/user_simulator/adapters/base.py
@@ -0,0 +1,67 @@
+"""Base protocol for agent adapters."""
+
+from typing import Any, Protocol, runtime_checkable
+
+from langchain_core.messages import BaseMessage
+
+from ..models import SimulationState
+
+
+@runtime_checkable
+class AgentAdapter(Protocol):
+    """Protocol for adapting different agent frameworks to the simulation harness.
+
+    Each adapter wraps a framework-specific agent and provides a uniform
+    interface for:
+    - Invoking the agent with user messages
+    - Extracting responses in BaseMessage format
+    - Managing agent state between turns
+    """
+
+    @property
+    def framework(self) -> str:
+        """Return the framework name (e.g., 'langgraph', 'crewai', 'pydanticai')."""
+        ...
+
+    @property
+    def agent_name(self) -> str:
+        """Return the agent's name/identifier."""
+        ...
+
+    async def invoke(
+        self,
+        message: BaseMessage,
+        state: SimulationState,
+    ) -> BaseMessage:
+        """Invoke the agent with a user message and return the response.
+
+        Args:
+            message: User's input message (HumanMessage)
+            state: Current simulation state for context
+
+        Returns:
+            Agent's response as AIMessage
+        """
+        ...
+
+    async def initialize(
+        self,
+        config: dict[str, Any] | None = None,
+    ) -> None:
+        """Initialize the agent before simulation starts.
+
+        Called once per simulation run. Use for setup that should
+        happen before the first turn.
+        """
+        ...
+
+    async def cleanup(self) -> None:
+        """Clean up agent resources after simulation ends."""
+        ...
+
+    def get_state(self) -> dict[str, Any]:
+        """Get the current internal state of the agent.
+
+        Used for trace capture and debugging.
+        """
+        ...
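To make the protocol concrete, here is a minimal sketch of a structurally conforming adapter. The EchoAdapter below is invented for illustration; a real adapter would delegate invoke() to a framework agent, as the LangGraph/CrewAI/PydanticAI adapters in this package do.

    from typing import Any

    from langchain_core.messages import AIMessage, BaseMessage

    from context_forge.harness.user_simulator.models import SimulationState


    class EchoAdapter:
        """Invented example: satisfies AgentAdapter structurally, no inheritance needed."""

        @property
        def framework(self) -> str:
            return "echo"

        @property
        def agent_name(self) -> str:
            return "echo-agent"

        async def invoke(
            self, message: BaseMessage, state: SimulationState
        ) -> BaseMessage:
            # A real adapter would call into its framework here (e.g., a
            # compiled LangGraph graph) and convert the reply to an AIMessage.
            return AIMessage(content=f"echo: {message.content}")

        async def initialize(self, config: dict[str, Any] | None = None) -> None:
            self._config = config or {}

        async def cleanup(self) -> None:
            pass

        def get_state(self) -> dict[str, Any]:
            return {"config": getattr(self, "_config", {})}

Because AgentAdapter is decorated with @runtime_checkable, isinstance(EchoAdapter(), AgentAdapter) returns True, which is useful for validating third-party adapters before a simulation run.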