agent-runtime-core 0.1.1-py3-none-any.whl → 0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_runtime/__init__.py +19 -1
- agent_runtime/testing.py +358 -0
- {agent_runtime_core-0.1.1.dist-info → agent_runtime_core-0.1.2.dist-info}/METADATA +1 -1
- {agent_runtime_core-0.1.1.dist-info → agent_runtime_core-0.1.2.dist-info}/RECORD +6 -5
- {agent_runtime_core-0.1.1.dist-info → agent_runtime_core-0.1.2.dist-info}/WHEEL +0 -0
- {agent_runtime_core-0.1.1.dist-info → agent_runtime_core-0.1.2.dist-info}/licenses/LICENSE +0 -0
agent_runtime/__init__.py
CHANGED
@@ -34,7 +34,7 @@ Example usage:
         return RunResult(final_output={"message": "Hello!"})
 """
 
-__version__ = "0.1.1"
+__version__ = "0.1.2"
 
 # Core interfaces
 from agent_runtime.interfaces import (
@@ -76,6 +76,17 @@ from agent_runtime.runner import (
     RunContextImpl,
 )
 
+
+# Testing utilities
+from agent_runtime.testing import (
+    MockRunContext,
+    MockLLMClient,
+    MockLLMResponse,
+    LLMEvaluator,
+    create_test_context,
+    run_agent_test,
+)
+
 __all__ = [
     # Version
     "__version__",
@@ -107,4 +118,11 @@ __all__ = [
     "AgentRunner",
    "RunnerConfig",
     "RunContextImpl",
+    # Testing
+    "MockRunContext",
+    "MockLLMClient",
+    "MockLLMResponse",
+    "LLMEvaluator",
+    "create_test_context",
+    "run_agent_test",
 ]
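The practical effect of this change is that the testing helpers become importable from the package root rather than only from agent_runtime.testing. A minimal sketch of the new import surface, using only names exported above (the context fields and methods are the ones defined in testing.py below):

    from agent_runtime import MockRunContext, __version__

    assert __version__ == "0.1.2"

    # Build a bare context by hand; no runner or event infrastructure required.
    ctx = MockRunContext(
        input_messages=[{"role": "user", "content": "Hello"}],
        metadata={"user_id": "123"},
    )
    print(ctx.run_id)       # auto-generated UUID
    print(ctx.cancelled())  # False until cancel() is called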
agent_runtime/testing.py
ADDED
@@ -0,0 +1,358 @@
+"""
+Testing utilities for agent runtimes.
+
+This module provides tools for testing agent implementations:
+- MockRunContext: A concrete RunContext for unit tests
+- MockLLMClient: A mock LLM client with predefined responses
+- AgentTestCase: Base test class with common helpers
+- LLMEvaluator: Use LLM to evaluate agent responses
+
+Example usage:
+    from agent_runtime.testing import MockRunContext, MockLLMClient, AgentTestCase
+
+    class TestMyAgent(AgentTestCase):
+        async def test_agent_responds(self):
+            ctx = self.create_context("Hello, agent!")
+            result = await self.agent.run(ctx)
+            self.assertIn("response", result.final_output)
+"""
+
+import asyncio
+from dataclasses import dataclass, field
+from typing import Any, Callable, Optional, AsyncIterator
+from uuid import UUID, uuid4
+import json
+
+from .interfaces import (
+    AgentRuntime,
+    EventType,
+    LLMClient,
+    LLMResponse,
+    LLMStreamChunk,
+    Message,
+    RunContext,
+    RunResult,
+    Tool,
+    ToolRegistry,
+)
+
+
+@dataclass
+class MockRunContext:
+    """
+    A concrete implementation of RunContext for testing.
+
+    Use this in unit tests to provide a context to your agent
+    without needing the full runtime infrastructure.
+
+    Example:
+        ctx = MockRunContext(
+            input_messages=[{"role": "user", "content": "Hello"}],
+            metadata={"user_id": "123"}
+        )
+        result = await my_agent.run(ctx)
+    """
+
+    input_messages: list[Message] = field(default_factory=list)
+    params: dict = field(default_factory=dict)
+    metadata: dict = field(default_factory=dict)
+    run_id: UUID = field(default_factory=uuid4)
+    conversation_id: Optional[UUID] = None
+    tool_registry: ToolRegistry = field(default_factory=ToolRegistry)
+
+    # Internal state
+    _events: list[tuple[str, dict]] = field(default_factory=list)
+    _checkpoints: list[dict] = field(default_factory=list)
+    _cancelled: bool = False
+
+    async def emit(self, event_type: EventType | str, payload: dict) -> None:
+        """Record emitted events for later inspection."""
+        event_name = event_type.value if isinstance(event_type, EventType) else event_type
+        self._events.append((event_name, payload))
+
+    async def checkpoint(self, state: dict) -> None:
+        """Save a checkpoint."""
+        self._checkpoints.append(state)
+
+    async def get_state(self) -> Optional[dict]:
+        """Get the last checkpoint."""
+        return self._checkpoints[-1] if self._checkpoints else None
+
+    def cancelled(self) -> bool:
+        """Check if cancelled."""
+        return self._cancelled
+
+    def cancel(self) -> None:
+        """Request cancellation."""
+        self._cancelled = True
+
+    # Test helpers
+    def get_events(self, event_type: Optional[str] = None) -> list[tuple[str, dict]]:
+        """Get recorded events, optionally filtered by type."""
+        if event_type is None:
+            return self._events
+        return [(t, p) for t, p in self._events if t == event_type]
+
+    def get_checkpoints(self) -> list[dict]:
+        """Get all checkpoints."""
+        return self._checkpoints
+
+    def clear(self) -> None:
+        """Clear recorded events and checkpoints."""
+        self._events.clear()
+        self._checkpoints.clear()
+        self._cancelled = False
+
+
+@dataclass
+class MockLLMResponse:
+    """A predefined response for MockLLMClient."""
+    content: str
+    tool_calls: Optional[list[dict]] = None
+    finish_reason: str = "stop"
+
+
+class MockLLMClient(LLMClient):
+    """
+    A mock LLM client for testing.
+
+    Configure with predefined responses or a response function.
+
+    Example:
+        # Simple predefined responses
+        client = MockLLMClient(responses=[
+            MockLLMResponse(content="Hello!"),
+            MockLLMResponse(content="How can I help?"),
+        ])
+
+        # Dynamic responses based on input
+        def respond(messages):
+            if "weather" in messages[-1]["content"].lower():
+                return MockLLMResponse(content="It's sunny!")
+            return MockLLMResponse(content="I don't know.")
+
+        client = MockLLMClient(response_fn=respond)
+    """
+
+    def __init__(
+        self,
+        responses: Optional[list[MockLLMResponse]] = None,
+        response_fn: Optional[Callable[[list[Message]], MockLLMResponse]] = None,
+        default_response: str = "Mock response",
+    ):
+        self._responses = responses or []
+        self._response_fn = response_fn
+        self._default_response = default_response
+        self._call_count = 0
+        self._calls: list[dict] = []
+
+    async def generate(
+        self,
+        messages: list[Message],
+        *,
+        model: Optional[str] = None,
+        stream: bool = False,
+        tools: Optional[list[dict]] = None,
+        temperature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        **kwargs,
+    ) -> LLMResponse:
+        """Generate a mock response."""
+        # Record the call
+        self._calls.append({
+            "messages": messages,
+            "model": model,
+            "tools": tools,
+            "kwargs": kwargs,
+        })
+
+        # Get response
+        if self._response_fn:
+            mock_resp = self._response_fn(messages)
+        elif self._call_count < len(self._responses):
+            mock_resp = self._responses[self._call_count]
+        else:
+            mock_resp = MockLLMResponse(content=self._default_response)
+
+        self._call_count += 1
+
+        # Build message
+        message: Message = {
+            "role": "assistant",
+            "content": mock_resp.content,
+        }
+        if mock_resp.tool_calls:
+            message["tool_calls"] = mock_resp.tool_calls
+
+        return LLMResponse(
+            message=message,
+            model=model or "mock-model",
+            finish_reason=mock_resp.finish_reason,
+            usage={"prompt_tokens": 10, "completion_tokens": 20},
+        )
+
+    async def stream(
+        self,
+        messages: list[Message],
+        *,
+        model: Optional[str] = None,
+        tools: Optional[list[dict]] = None,
+        **kwargs,
+    ) -> AsyncIterator[LLMStreamChunk]:
+        """Stream a mock response (yields content in chunks)."""
+        response = await self.generate(messages, model=model, tools=tools, **kwargs)
+        content = response.message.get("content", "")
+
+        # Yield content in chunks
+        for i in range(0, len(content), 10):
+            yield LLMStreamChunk(delta=content[i:i+10])
+
+        yield LLMStreamChunk(finish_reason="stop", usage=response.usage)
+
+    # Test helpers
+    def get_calls(self) -> list[dict]:
+        """Get all recorded calls."""
+        return self._calls
+
+    def get_call_count(self) -> int:
+        """Get the number of calls made."""
+        return self._call_count
+
+    def reset(self) -> None:
+        """Reset call tracking."""
+        self._call_count = 0
+        self._calls.clear()
+
+
+class LLMEvaluator:
+    """
+    Use an LLM to evaluate agent responses.
+
+    This is useful for testing that agent responses meet certain criteria
+    without having to write brittle string matching tests.
+
+    Example:
+        evaluator = LLMEvaluator(openai_client)
+
+        passed, explanation = await evaluator.evaluate(
+            user_query="What's the weather?",
+            agent_response="It's currently 72°F and sunny in San Francisco.",
+            criteria="The response should include temperature and weather conditions"
+        )
+
+        assert passed, f"Evaluation failed: {explanation}"
+    """
+
+    def __init__(self, llm_client: LLMClient, model: str = "gpt-4o-mini"):
+        self._client = llm_client
+        self._model = model
+
+    async def evaluate(
+        self,
+        user_query: str,
+        agent_response: str,
+        criteria: str,
+    ) -> tuple[bool, str]:
+        """
+        Evaluate an agent response against criteria.
+
+        Args:
+            user_query: The original user query
+            agent_response: The agent's response
+            criteria: What the response should satisfy
+
+        Returns:
+            Tuple of (passed: bool, explanation: str)
+        """
+        eval_prompt = f"""You are evaluating an AI assistant's response.
+
+User Query: {user_query}
+
+Agent Response: {agent_response}
+
+Evaluation Criteria: {criteria}
+
+Does the response meet the criteria? Answer with just "PASS" or "FAIL" followed by a brief explanation."""
+
+        response = await self._client.generate(
+            messages=[{"role": "user", "content": eval_prompt}],
+            model=self._model,
+            temperature=0,
+        )
+
+        result = response.message.get("content", "FAIL Unknown error")
+        passed = result.strip().upper().startswith("PASS")
+        return passed, result
+
+    async def evaluate_tool_usage(
+        self,
+        user_query: str,
+        tool_calls: list[dict],
+        expected_tools: list[str],
+    ) -> tuple[bool, str]:
+        """
+        Evaluate whether the agent used the expected tools.
+
+        Args:
+            user_query: The original user query
+            tool_calls: List of tool calls made by the agent
+            expected_tools: List of tool names that should have been called
+
+        Returns:
+            Tuple of (passed: bool, explanation: str)
+        """
+        tool_names = [tc.get("function", {}).get("name", tc.get("name", "unknown"))
+                      for tc in tool_calls]
+
+        missing = set(expected_tools) - set(tool_names)
+        if missing:
+            return False, f"Missing expected tools: {missing}. Called: {tool_names}"
+
+        return True, f"All expected tools were called: {tool_names}"
+
+
+def create_test_context(
+    message: str,
+    *,
+    tools: Optional[list[Tool]] = None,
+    metadata: Optional[dict] = None,
+    params: Optional[dict] = None,
+) -> MockRunContext:
+    """
+    Convenience function to create a test context.
+
+    Example:
+        ctx = create_test_context("Hello, agent!", tools=[my_tool])
+        result = await agent.run(ctx)
+    """
+    registry = ToolRegistry()
+    if tools:
+        for tool in tools:
+            registry.register(tool)
+
+    return MockRunContext(
+        input_messages=[{"role": "user", "content": message}],
+        tool_registry=registry,
+        metadata=metadata or {},
+        params=params or {},
+    )
+
+
+async def run_agent_test(
+    agent: AgentRuntime,
+    message: str,
+    *,
+    tools: Optional[list[Tool]] = None,
+    metadata: Optional[dict] = None,
+) -> tuple[RunResult, MockRunContext]:
+    """
+    Run an agent with a test message and return both result and context.
+
+    Example:
+        result, ctx = await run_agent_test(my_agent, "Hello!")
+        assert "greeting" in result.final_output
+        assert len(ctx.get_events()) > 0
+    """
+    ctx = create_test_context(message, tools=tools, metadata=metadata)
+    result = await agent.run(ctx)
+    return result, ctx
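Taken together, these helpers let an agent be exercised end to end without calling a real model. The sketch below is illustrative only: WeatherAgent stands in for a hypothetical AgentRuntime implementation from your own project, and the pytest-asyncio marker is an assumed test runner; the mock and helper APIs are the ones added in this file.

    import pytest

    from agent_runtime.testing import MockLLMClient, MockLLMResponse, run_agent_test
    from my_project.agents import WeatherAgent  # hypothetical agent under test


    @pytest.mark.asyncio
    async def test_weather_agent_uses_the_llm_once():
        # Scripted response: the first generate() call returns this content.
        llm = MockLLMClient(responses=[MockLLMResponse(content="It's sunny!")])
        agent = WeatherAgent(llm_client=llm)  # constructor shape is an assumption

        # run_agent_test wraps the message in a MockRunContext and runs the agent.
        result, ctx = await run_agent_test(agent, "What's the weather in Berlin?")

        assert llm.get_call_count() == 1
        assert "sunny" in str(result.final_output).lower()
        # Every emit() made during the run is recorded on the mock context.
        assert isinstance(ctx.get_events(), list)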
{agent_runtime_core-0.1.1.dist-info → agent_runtime_core-0.1.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: agent-runtime-core
-Version: 0.1.1
+Version: 0.1.2
 Summary: Framework-agnostic Python library for executing AI agents with consistent patterns
 Project-URL: Homepage, https://github.com/colstrom/agent_runtime
 Project-URL: Repository, https://github.com/colstrom/agent_runtime
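A quick way to confirm which version an environment actually resolved, shown as a sketch that relies only on the standard library (importlib.metadata is not part of this package's API):

    from importlib.metadata import version

    import agent_runtime

    # Both should report 0.1.2 once the new wheel is installed.
    print(version("agent-runtime-core"))
    print(agent_runtime.__version__)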
{agent_runtime_core-0.1.1.dist-info → agent_runtime_core-0.1.2.dist-info}/RECORD
CHANGED
@@ -1,8 +1,9 @@
-agent_runtime/__init__.py,sha256=
+agent_runtime/__init__.py,sha256=4d_QOOgxuntU8u6ltwJ__BNFzO-tAKfv0nHM8dCqf5g,2430
 agent_runtime/config.py,sha256=ZRjpILjsjeh_kl7873DtV2g_zaTrfdkb3NgdQ6ndb5Y,4981
 agent_runtime/interfaces.py,sha256=AGDY0w6muQnNiice9O3ogb8fRdgMRW6wqXpxcuyn0N0,10103
 agent_runtime/registry.py,sha256=sa0speDFxFCZlXoCge8cPNqWYUeWHyazs6tBer5Jg1w,1471
 agent_runtime/runner.py,sha256=Sb2FfSJvATaL7ideQZy2JhVZp0sSYGVIov93E-gxODU,12741
+agent_runtime/testing.py,sha256=aqN67RdbTdYf_rJfp5pEpn2s_tkeU-3oSpzTdADxH5g,11126
 agent_runtime/events/__init__.py,sha256=JNH-D40O6yz2evIf1_r2o3w7OQjLt4Yebn-sBNLzzh8,1550
 agent_runtime/events/base.py,sha256=NfHYyoczxr40Er5emROi_aY_07m5hDrKsn31pdWY2DY,1950
 agent_runtime/events/memory.py,sha256=7qseR6RtdaP833FxEHwyPw5TC7l4brJHr8uEx0mLc1Y,2486
@@ -25,7 +26,7 @@ agent_runtime/state/sqlite.py,sha256=NwuiTBXELb2tyOoH91MZqRJaCk9h8PskyY2VUc5EMr0
 agent_runtime/tracing/__init__.py,sha256=m4WzfgJpnV5XCCoMpBYZdJU_JTkAdhEhl7M7tpf62RY,1246
 agent_runtime/tracing/langfuse.py,sha256=uThF0P6f1VJ1l1b7UuiFQ-oHZ-tCa9MbbHvTqkSuQ2A,3650
 agent_runtime/tracing/noop.py,sha256=MOm5eTrnf3d4WhiWrwVU5Kd3GmJ1903V0U7U3Qwho7U,746
-agent_runtime_core-0.1.
-agent_runtime_core-0.1.
-agent_runtime_core-0.1.
-agent_runtime_core-0.1.
+agent_runtime_core-0.1.2.dist-info/METADATA,sha256=md3hU685e-GWY_TCri5W768EKzkto0j4IpG_j70Kx7s,12478
+agent_runtime_core-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+agent_runtime_core-0.1.2.dist-info/licenses/LICENSE,sha256=PcOO8aiOZ4H2MWYeKIis3o6xTCT1hNkDyCxHZhh1NeM,1070
+agent_runtime_core-0.1.2.dist-info/RECORD,,
{agent_runtime_core-0.1.1.dist-info → agent_runtime_core-0.1.2.dist-info}/WHEEL
File without changes
{agent_runtime_core-0.1.1.dist-info → agent_runtime_core-0.1.2.dist-info}/licenses/LICENSE
File without changes