understudy 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- understudy-0.1.0/PKG-INFO +212 -0
- understudy-0.1.0/README.md +168 -0
- understudy-0.1.0/pyproject.toml +67 -0
- understudy-0.1.0/src/understudy/__init__.py +67 -0
- understudy-0.1.0/src/understudy/adk/__init__.py +194 -0
- understudy-0.1.0/src/understudy/check.py +138 -0
- understudy-0.1.0/src/understudy/cli.py +258 -0
- understudy-0.1.0/src/understudy/http/__init__.py +352 -0
- understudy-0.1.0/src/understudy/judges.py +105 -0
- understudy-0.1.0/src/understudy/loaders/__init__.py +0 -0
- understudy-0.1.0/src/understudy/mocks.py +58 -0
- understudy-0.1.0/src/understudy/models.py +157 -0
- understudy-0.1.0/src/understudy/personas/__init__.py +0 -0
- understudy-0.1.0/src/understudy/prompts/__init__.py +21 -0
- understudy-0.1.0/src/understudy/prompts/rubrics.py +51 -0
- understudy-0.1.0/src/understudy/reports.py +191 -0
- understudy-0.1.0/src/understudy/runner.py +171 -0
- understudy-0.1.0/src/understudy/simulator.py +71 -0
- understudy-0.1.0/src/understudy/storage.py +206 -0
- understudy-0.1.0/src/understudy/suite.py +187 -0
- understudy-0.1.0/src/understudy/templates/base.html +248 -0
- understudy-0.1.0/src/understudy/templates/index.html +110 -0
- understudy-0.1.0/src/understudy/templates/metrics.html +128 -0
- understudy-0.1.0/src/understudy/templates/run_detail.html +167 -0
- understudy-0.1.0/src/understudy/trace.py +147 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: understudy
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Simulation and trace-based evaluation for agentic systems
|
|
5
|
+
Keywords: agent,evaluation,simulation,testing,llm
|
|
6
|
+
Author: Gaurav Sood
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Topic :: Software Development :: Testing
|
|
15
|
+
Requires-Dist: pydantic>=2.0
|
|
16
|
+
Requires-Dist: pyyaml>=6.0
|
|
17
|
+
Requires-Dist: google-adk>=1.0 ; extra == 'adk'
|
|
18
|
+
Requires-Dist: understudy[adk,judges,http,reports] ; extra == 'all'
|
|
19
|
+
Requires-Dist: understudy[all,docs] ; extra == 'dev'
|
|
20
|
+
Requires-Dist: pytest>=8.0 ; extra == 'dev'
|
|
21
|
+
Requires-Dist: pytest-asyncio>=0.23 ; extra == 'dev'
|
|
22
|
+
Requires-Dist: ruff>=0.4 ; extra == 'dev'
|
|
23
|
+
Requires-Dist: sphinx>=7.0 ; extra == 'docs'
|
|
24
|
+
Requires-Dist: furo>=2024.0 ; extra == 'docs'
|
|
25
|
+
Requires-Dist: myst-parser>=3.0 ; extra == 'docs'
|
|
26
|
+
Requires-Dist: sphinx-autodoc-typehints>=2.0 ; extra == 'docs'
|
|
27
|
+
Requires-Dist: httpx>=0.27 ; extra == 'http'
|
|
28
|
+
Requires-Dist: litellm>=1.50 ; extra == 'judges'
|
|
29
|
+
Requires-Dist: jinja2>=3.0 ; extra == 'reports'
|
|
30
|
+
Requires-Dist: click>=8.0 ; extra == 'reports'
|
|
31
|
+
Requires-Python: >=3.12
|
|
32
|
+
Project-URL: Homepage, https://github.com/gojiplus/understudy
|
|
33
|
+
Project-URL: Documentation, https://gojiplus.github.io/understudy
|
|
34
|
+
Project-URL: Repository, https://github.com/gojiplus/understudy
|
|
35
|
+
Project-URL: Issues, https://github.com/gojiplus/understudy/issues
|
|
36
|
+
Provides-Extra: adk
|
|
37
|
+
Provides-Extra: all
|
|
38
|
+
Provides-Extra: dev
|
|
39
|
+
Provides-Extra: docs
|
|
40
|
+
Provides-Extra: http
|
|
41
|
+
Provides-Extra: judges
|
|
42
|
+
Provides-Extra: reports
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
|
|
45
|
+
# understudy
|
|
46
|
+
|
|
47
|
+
[](https://badge.fury.io/py/understudy)
|
|
48
|
+
[](https://www.python.org/downloads/)
|
|
49
|
+
[](https://opensource.org/licenses/MIT)
|
|
50
|
+
|
|
51
|
+
Test your AI agents with simulated users.
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install "understudy[all]"
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Quick Start
|
|
60
|
+
|
|
61
|
+
### 1. Wrap your agent
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from understudy.adk import ADKApp
|
|
65
|
+
from my_agent import agent
|
|
66
|
+
|
|
67
|
+
app = ADKApp(agent=agent)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### 2. Mock your tools
|
|
71
|
+
|
|
72
|
+
Your agent has tools that call external services. Mock them for testing:
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from understudy.mocks import MockToolkit
|
|
76
|
+
|
|
77
|
+
mocks = MockToolkit()
|
|
78
|
+
|
|
79
|
+
@mocks.handle("lookup_order")
|
|
80
|
+
def lookup_order(order_id: str) -> dict:
|
|
81
|
+
return {"order_id": order_id, "items": [...], "status": "delivered"}
|
|
82
|
+
|
|
83
|
+
@mocks.handle("create_return")
|
|
84
|
+
def create_return(order_id: str, item_sku: str, reason: str) -> dict:
|
|
85
|
+
return {"return_id": "RET-001", "status": "created"}
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### 3. Write a scene
|
|
89
|
+
|
|
90
|
+
Create `scenes/return_backpack.yaml`:
|
|
91
|
+
|
|
92
|
+
```yaml
|
|
93
|
+
id: return_eligible_backpack
|
|
94
|
+
description: Customer wants to return a backpack
|
|
95
|
+
|
|
96
|
+
starting_prompt: "I'd like to return an item please."
|
|
97
|
+
conversation_plan: |
|
|
98
|
+
Goal: Return the hiking backpack from order ORD-10031.
|
|
99
|
+
- Provide order ID when asked
|
|
100
|
+
- Return reason: too small
|
|
101
|
+
|
|
102
|
+
persona: cooperative
|
|
103
|
+
max_turns: 15
|
|
104
|
+
|
|
105
|
+
expectations:
|
|
106
|
+
required_tools:
|
|
107
|
+
- lookup_order
|
|
108
|
+
- create_return
|
|
109
|
+
allowed_terminal_states:
|
|
110
|
+
- return_created
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### 4. Run simulation
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from understudy import Scene, run, check
|
|
117
|
+
|
|
118
|
+
scene = Scene.from_file("scenes/return_backpack.yaml")
|
|
119
|
+
trace = run(app, scene, mocks=mocks)
|
|
120
|
+
|
|
121
|
+
assert trace.called("lookup_order")
|
|
122
|
+
assert trace.called("create_return")
|
|
123
|
+
assert trace.terminal_state == "return_created"
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Or with pytest (define `app` and `mocks` fixtures in conftest.py):
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
pytest test_returns.py -v
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## CLI Commands
|
|
133
|
+
|
|
134
|
+
After running simulations, use the CLI to inspect results:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
# List all saved runs
|
|
138
|
+
understudy list
|
|
139
|
+
|
|
140
|
+
# Show aggregate metrics (pass rate, avg turns, tool usage, terminal states)
|
|
141
|
+
understudy summary
|
|
142
|
+
|
|
143
|
+
# Show details for a specific run
|
|
144
|
+
understudy show <run_id>
|
|
145
|
+
|
|
146
|
+
# Generate static HTML report
|
|
147
|
+
understudy report --output report.html
|
|
148
|
+
|
|
149
|
+
# Start interactive report browser
|
|
150
|
+
understudy serve --port 8080
|
|
151
|
+
|
|
152
|
+
# Delete runs
|
|
153
|
+
understudy delete <run_id>
|
|
154
|
+
understudy clear
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## LLM Judges
|
|
158
|
+
|
|
159
|
+
For qualities that can't be checked deterministically:
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
from understudy.judges import Judge
|
|
163
|
+
|
|
164
|
+
empathy_judge = Judge(
|
|
165
|
+
rubric="The agent acknowledged frustration and was empathetic while enforcing policy.",
|
|
166
|
+
samples=5,
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
result = empathy_judge.evaluate(trace)
|
|
170
|
+
assert result.score == 1
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Built-in rubrics:
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
from understudy.judges import (
|
|
177
|
+
TOOL_USAGE_CORRECTNESS,
|
|
178
|
+
POLICY_COMPLIANCE,
|
|
179
|
+
TONE_EMPATHY,
|
|
180
|
+
ADVERSARIAL_ROBUSTNESS,
|
|
181
|
+
TASK_COMPLETION,
|
|
182
|
+
)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## Report Contents
|
|
186
|
+
|
|
187
|
+
The `understudy summary` command shows:
|
|
188
|
+
- **Pass rate** - percentage of scenes that passed all expectations
|
|
189
|
+
- **Avg turns** - average conversation length
|
|
190
|
+
- **Tool usage** - distribution of tool calls across runs
|
|
191
|
+
- **Terminal states** - breakdown of how conversations ended
|
|
192
|
+
- **Agents** - which agents were invoked
|
|
193
|
+
|
|
194
|
+
The HTML report (`understudy report`) includes:
|
|
195
|
+
- All metrics above
|
|
196
|
+
- Full conversation transcripts
|
|
197
|
+
- Tool call details with arguments
|
|
198
|
+
- Expectation check results
|
|
199
|
+
- Judge evaluation results (when used)
|
|
200
|
+
|
|
201
|
+
## Documentation
|
|
202
|
+
|
|
203
|
+
See the [full documentation](https://gojiplus.github.io/understudy) for:
|
|
204
|
+
- [Installation guide](https://gojiplus.github.io/understudy/installation.html)
|
|
205
|
+
- [Writing scenes](https://gojiplus.github.io/understudy/tutorial/scenes.html)
|
|
206
|
+
- [ADK integration](https://gojiplus.github.io/understudy/adk-integration.html)
|
|
207
|
+
- [HTTP client for deployed agents](https://gojiplus.github.io/understudy/tutorial/http.html)
|
|
208
|
+
- [API reference](https://gojiplus.github.io/understudy/api/index.html)
|
|
209
|
+
|
|
210
|
+
## License
|
|
211
|
+
|
|
212
|
+
MIT
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# understudy
|
|
2
|
+
|
|
3
|
+
[](https://badge.fury.io/py/understudy)
|
|
4
|
+
[](https://www.python.org/downloads/)
|
|
5
|
+
[](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
Test your AI agents with simulated users.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install "understudy[all]"
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
### 1. Wrap your agent
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
from understudy.adk import ADKApp
|
|
21
|
+
from my_agent import agent
|
|
22
|
+
|
|
23
|
+
app = ADKApp(agent=agent)
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### 2. Mock your tools
|
|
27
|
+
|
|
28
|
+
Your agent has tools that call external services. Mock them for testing:
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
from understudy.mocks import MockToolkit
|
|
32
|
+
|
|
33
|
+
mocks = MockToolkit()
|
|
34
|
+
|
|
35
|
+
@mocks.handle("lookup_order")
|
|
36
|
+
def lookup_order(order_id: str) -> dict:
|
|
37
|
+
return {"order_id": order_id, "items": [...], "status": "delivered"}
|
|
38
|
+
|
|
39
|
+
@mocks.handle("create_return")
|
|
40
|
+
def create_return(order_id: str, item_sku: str, reason: str) -> dict:
|
|
41
|
+
return {"return_id": "RET-001", "status": "created"}
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### 3. Write a scene
|
|
45
|
+
|
|
46
|
+
Create `scenes/return_backpack.yaml`:
|
|
47
|
+
|
|
48
|
+
```yaml
|
|
49
|
+
id: return_eligible_backpack
|
|
50
|
+
description: Customer wants to return a backpack
|
|
51
|
+
|
|
52
|
+
starting_prompt: "I'd like to return an item please."
|
|
53
|
+
conversation_plan: |
|
|
54
|
+
Goal: Return the hiking backpack from order ORD-10031.
|
|
55
|
+
- Provide order ID when asked
|
|
56
|
+
- Return reason: too small
|
|
57
|
+
|
|
58
|
+
persona: cooperative
|
|
59
|
+
max_turns: 15
|
|
60
|
+
|
|
61
|
+
expectations:
|
|
62
|
+
required_tools:
|
|
63
|
+
- lookup_order
|
|
64
|
+
- create_return
|
|
65
|
+
allowed_terminal_states:
|
|
66
|
+
- return_created
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### 4. Run simulation
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from understudy import Scene, run, check
|
|
73
|
+
|
|
74
|
+
scene = Scene.from_file("scenes/return_backpack.yaml")
|
|
75
|
+
trace = run(app, scene, mocks=mocks)
|
|
76
|
+
|
|
77
|
+
assert trace.called("lookup_order")
|
|
78
|
+
assert trace.called("create_return")
|
|
79
|
+
assert trace.terminal_state == "return_created"
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Or with pytest (define `app` and `mocks` fixtures in conftest.py):
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pytest test_returns.py -v
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## CLI Commands
|
|
89
|
+
|
|
90
|
+
After running simulations, use the CLI to inspect results:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
# List all saved runs
|
|
94
|
+
understudy list
|
|
95
|
+
|
|
96
|
+
# Show aggregate metrics (pass rate, avg turns, tool usage, terminal states)
|
|
97
|
+
understudy summary
|
|
98
|
+
|
|
99
|
+
# Show details for a specific run
|
|
100
|
+
understudy show <run_id>
|
|
101
|
+
|
|
102
|
+
# Generate static HTML report
|
|
103
|
+
understudy report --output report.html
|
|
104
|
+
|
|
105
|
+
# Start interactive report browser
|
|
106
|
+
understudy serve --port 8080
|
|
107
|
+
|
|
108
|
+
# Delete runs
|
|
109
|
+
understudy delete <run_id>
|
|
110
|
+
understudy clear
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## LLM Judges
|
|
114
|
+
|
|
115
|
+
For qualities that can't be checked deterministically:
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from understudy.judges import Judge
|
|
119
|
+
|
|
120
|
+
empathy_judge = Judge(
|
|
121
|
+
rubric="The agent acknowledged frustration and was empathetic while enforcing policy.",
|
|
122
|
+
samples=5,
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
result = empathy_judge.evaluate(trace)
|
|
126
|
+
assert result.score == 1
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
Built-in rubrics:
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
from understudy.judges import (
|
|
133
|
+
TOOL_USAGE_CORRECTNESS,
|
|
134
|
+
POLICY_COMPLIANCE,
|
|
135
|
+
TONE_EMPATHY,
|
|
136
|
+
ADVERSARIAL_ROBUSTNESS,
|
|
137
|
+
TASK_COMPLETION,
|
|
138
|
+
)
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## Report Contents
|
|
142
|
+
|
|
143
|
+
The `understudy summary` command shows:
|
|
144
|
+
- **Pass rate** - percentage of scenes that passed all expectations
|
|
145
|
+
- **Avg turns** - average conversation length
|
|
146
|
+
- **Tool usage** - distribution of tool calls across runs
|
|
147
|
+
- **Terminal states** - breakdown of how conversations ended
|
|
148
|
+
- **Agents** - which agents were invoked
|
|
149
|
+
|
|
150
|
+
The HTML report (`understudy report`) includes:
|
|
151
|
+
- All metrics above
|
|
152
|
+
- Full conversation transcripts
|
|
153
|
+
- Tool call details with arguments
|
|
154
|
+
- Expectation check results
|
|
155
|
+
- Judge evaluation results (when used)
|
|
156
|
+
|
|
157
|
+
## Documentation
|
|
158
|
+
|
|
159
|
+
See the [full documentation](https://gojiplus.github.io/understudy) for:
|
|
160
|
+
- [Installation guide](https://gojiplus.github.io/understudy/installation.html)
|
|
161
|
+
- [Writing scenes](https://gojiplus.github.io/understudy/tutorial/scenes.html)
|
|
162
|
+
- [ADK integration](https://gojiplus.github.io/understudy/adk-integration.html)
|
|
163
|
+
- [HTTP client for deployed agents](https://gojiplus.github.io/understudy/tutorial/http.html)
|
|
164
|
+
- [API reference](https://gojiplus.github.io/understudy/api/index.html)
|
|
165
|
+
|
|
166
|
+
## License
|
|
167
|
+
|
|
168
|
+
MIT
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["uv_build>=0.7"]
|
|
3
|
+
build-backend = "uv_build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "understudy"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Simulation and trace-based evaluation for agentic systems"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.12"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Gaurav Sood" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["agent", "evaluation", "simulation", "testing", "llm"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Topic :: Software Development :: Testing",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"pydantic>=2.0",
|
|
27
|
+
"pyyaml>=6.0",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
adk = ["google-adk>=1.0"]
|
|
32
|
+
judges = ["litellm>=1.50"]
|
|
33
|
+
http = ["httpx>=0.27"]
|
|
34
|
+
reports = ["jinja2>=3.0", "click>=8.0"]
|
|
35
|
+
docs = [
|
|
36
|
+
"sphinx>=7.0",
|
|
37
|
+
"furo>=2024.0",
|
|
38
|
+
"myst-parser>=3.0",
|
|
39
|
+
"sphinx-autodoc-typehints>=2.0",
|
|
40
|
+
]
|
|
41
|
+
all = ["understudy[adk,judges,http,reports]"]
|
|
42
|
+
dev = [
|
|
43
|
+
"understudy[all,docs]",
|
|
44
|
+
"pytest>=8.0",
|
|
45
|
+
"pytest-asyncio>=0.23",
|
|
46
|
+
"ruff>=0.4",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[project.scripts]
|
|
50
|
+
understudy = "understudy.cli:main"
|
|
51
|
+
|
|
52
|
+
[project.urls]
|
|
53
|
+
Homepage = "https://github.com/gojiplus/understudy"
|
|
54
|
+
Documentation = "https://gojiplus.github.io/understudy"
|
|
55
|
+
Repository = "https://github.com/gojiplus/understudy"
|
|
56
|
+
Issues = "https://github.com/gojiplus/understudy/issues"
|
|
57
|
+
|
|
58
|
+
[tool.ruff]
|
|
59
|
+
line-length = 100
|
|
60
|
+
target-version = "py312"
|
|
61
|
+
|
|
62
|
+
[tool.ruff.lint]
|
|
63
|
+
select = ["E", "F", "I", "UP", "B", "SIM"]
|
|
64
|
+
|
|
65
|
+
[tool.pytest.ini_options]
|
|
66
|
+
testpaths = ["tests"]
|
|
67
|
+
asyncio_mode = "auto"
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""understudy: simulation and trace-based evaluation for agentic systems.
|
|
2
|
+
|
|
3
|
+
The simulated user is an understudy standing in for a real user.
|
|
4
|
+
You write scenes, run rehearsals, and check the performance —
|
|
5
|
+
not by reading the script, but by inspecting what actually happened.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .check import CheckItem, CheckResult, check
|
|
9
|
+
from .judges import Judge, JudgeResult
|
|
10
|
+
from .mocks import MockToolkit, ToolError
|
|
11
|
+
from .models import Expectations, Persona, PersonaPreset, Scene
|
|
12
|
+
from .prompts import (
|
|
13
|
+
ADVERSARIAL_ROBUSTNESS,
|
|
14
|
+
FACTUAL_GROUNDING,
|
|
15
|
+
INSTRUCTION_FOLLOWING,
|
|
16
|
+
POLICY_COMPLIANCE,
|
|
17
|
+
TASK_COMPLETION,
|
|
18
|
+
TONE_EMPATHY,
|
|
19
|
+
TOOL_USAGE_CORRECTNESS,
|
|
20
|
+
)
|
|
21
|
+
from .runner import AgentApp, AgentResponse, run
|
|
22
|
+
from .storage import RunStorage
|
|
23
|
+
from .suite import SceneResult, Suite, SuiteResults
|
|
24
|
+
from .trace import AgentTransfer, ToolCall, Trace, Turn
|
|
25
|
+
|
|
26
|
+
__version__ = "0.1.0"
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
# models
|
|
30
|
+
"Scene",
|
|
31
|
+
"Persona",
|
|
32
|
+
"PersonaPreset",
|
|
33
|
+
"Expectations",
|
|
34
|
+
# trace
|
|
35
|
+
"Trace",
|
|
36
|
+
"Turn",
|
|
37
|
+
"ToolCall",
|
|
38
|
+
"AgentTransfer",
|
|
39
|
+
# runner
|
|
40
|
+
"run",
|
|
41
|
+
"AgentApp",
|
|
42
|
+
"AgentResponse",
|
|
43
|
+
# check
|
|
44
|
+
"check",
|
|
45
|
+
"CheckResult",
|
|
46
|
+
"CheckItem",
|
|
47
|
+
# suite
|
|
48
|
+
"Suite",
|
|
49
|
+
"SuiteResults",
|
|
50
|
+
"SceneResult",
|
|
51
|
+
# storage
|
|
52
|
+
"RunStorage",
|
|
53
|
+
# judges
|
|
54
|
+
"Judge",
|
|
55
|
+
"JudgeResult",
|
|
56
|
+
# mocks
|
|
57
|
+
"MockToolkit",
|
|
58
|
+
"ToolError",
|
|
59
|
+
# rubrics
|
|
60
|
+
"TOOL_USAGE_CORRECTNESS",
|
|
61
|
+
"POLICY_COMPLIANCE",
|
|
62
|
+
"TONE_EMPATHY",
|
|
63
|
+
"ADVERSARIAL_ROBUSTNESS",
|
|
64
|
+
"TASK_COMPLETION",
|
|
65
|
+
"FACTUAL_GROUNDING",
|
|
66
|
+
"INSTRUCTION_FOLLOWING",
|
|
67
|
+
]
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""ADK adapter: wraps Google ADK agents for use with understudy."""
|
|
2
|
+
|
|
3
|
+
from datetime import UTC, datetime
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from ..mocks import MockToolkit
|
|
7
|
+
from ..runner import AgentApp, AgentResponse
|
|
8
|
+
from ..trace import AgentTransfer, ToolCall
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _create_mock_callback(mocks: MockToolkit | None):
|
|
12
|
+
"""Create a before_tool_callback that returns mock responses.
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
mocks: MockToolkit instance or None.
|
|
16
|
+
|
|
17
|
+
Returns:
|
|
18
|
+
A callback function compatible with google-adk's before_tool_callback.
|
|
19
|
+
Returns dict to bypass real tool execution (mock response).
|
|
20
|
+
Returns None to allow normal execution.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
def callback(tool, args: dict[str, Any], tool_context) -> dict | None:
|
|
24
|
+
if mocks is None:
|
|
25
|
+
return None
|
|
26
|
+
tool_name = getattr(tool, "name", None) or getattr(tool, "__name__", str(tool))
|
|
27
|
+
if mocks.get_handler(tool_name):
|
|
28
|
+
try:
|
|
29
|
+
result = mocks.call(tool_name, **args)
|
|
30
|
+
return result
|
|
31
|
+
except Exception as e:
|
|
32
|
+
return {"error": str(e)}
|
|
33
|
+
return None
|
|
34
|
+
|
|
35
|
+
return callback
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ADKApp(AgentApp):
    """Wraps a Google ADK Agent for use with understudy.

    Usage:
        from google.adk import Agent
        from understudy.adk import ADKApp

        agent = Agent(model="gemini-2.5-flash", name="my_agent", ...)
        app = ADKApp(agent=agent)
        trace = run(app, scene)
    """

    def __init__(self, agent: Any, session_id: str | None = None):
        """
        Args:
            agent: A google.adk.Agent instance.
            session_id: Optional session ID. If None, a random one is generated.
        """
        self.agent = agent
        self.session_id = session_id
        self._runner = None  # google.adk Runner, created lazily in start()
        self._session = None  # ADK session object, created in start()
        self._mocks: MockToolkit | None = None
        self._current_agent: str | None = None
        self._agent_transfers: list[AgentTransfer] = []

    def _record_transfer(self, from_agent: str, to_agent: str) -> None:
        """Record a hand-off between sub-agents, stamped with current UTC time."""
        self._agent_transfers.append(
            AgentTransfer(
                from_agent=from_agent,
                to_agent=to_agent,
                timestamp=datetime.now(UTC),
            )
        )

    def start(self, mocks: MockToolkit | None = None) -> None:
        """Initialize the ADK runner and an in-memory session.

        Args:
            mocks: Optional MockToolkit. When provided, the agent's
                ``before_tool_callback`` is replaced so mocked tools are
                intercepted instead of executing for real.

        Raises:
            ImportError: If the google-adk package is not installed.
        """
        try:
            from google.adk import Runner
            from google.adk.sessions import InMemorySessionService
        except ImportError as e:
            raise ImportError(
                "google-adk package required. Install with: pip install understudy[adk]"
            ) from e
        import uuid

        self._mocks = mocks
        self._current_agent = getattr(self.agent, "name", None)
        self._agent_transfers = []
        self._session_id = self.session_id or str(uuid.uuid4())

        session_service = InMemorySessionService()
        if mocks:
            # NOTE(review): mutates the caller's agent in place and is not
            # restored by stop(); acceptable for throwaway test agents.
            self.agent.before_tool_callback = _create_mock_callback(mocks)

        self._runner = Runner(
            agent=self.agent,
            app_name="understudy_test",
            session_service=session_service,
        )
        self._session = session_service.create_session_sync(
            app_name="understudy_test",
            user_id="understudy_user",
            session_id=self._session_id,
        )

    def send(self, message: str) -> AgentResponse:
        """Send a user message to the ADK agent and capture the response.

        Iterates the runner's event stream, collecting agent text, tool
        calls and their results, agent-to-agent transfers, and an optional
        terminal state marker (convention: agent emits
        ``"TERMINAL_STATE: <state>"`` in its text).

        Args:
            message: The user's message text.

        Returns:
            AgentResponse with concatenated agent text, captured tool calls,
            and the terminal state (if any was emitted). The responding
            agent's name and the accumulated transfers are attached as
            ``agent_name`` / ``agent_transfers`` attributes.

        Raises:
            ImportError: If the google-genai types are unavailable.
        """
        try:
            from google.genai import types
        except ImportError as e:
            raise ImportError(
                "google-adk package required. Install with: pip install understudy[adk]"
            ) from e

        user_content = types.Content(
            role="user",
            parts=[types.Part(text=message)],
        )

        tool_calls: list[ToolCall] = []
        agent_text_parts: list[str] = []
        terminal_state: str | None = None
        current_agent_name = self._current_agent

        for event in self._runner.run(
            user_id="understudy_user",
            session_id=self._session.id,
            new_message=user_content,
        ):
            # Track agent attribution from event.author.
            if hasattr(event, "author") and event.author:
                event_agent = event.author
                if event_agent != current_agent_name and current_agent_name:
                    self._record_transfer(current_agent_name, event_agent)
                current_agent_name = event_agent

            # Detect explicit transfer_to_agent actions.
            if (
                hasattr(event, "actions")
                and event.actions
                and hasattr(event.actions, "transfer_to_agent")
                and event.actions.transfer_to_agent
            ):
                target_agent = event.actions.transfer_to_agent
                if current_agent_name and target_agent != current_agent_name:
                    self._record_transfer(current_agent_name, target_agent)
                current_agent_name = target_agent

            # Capture tool calls using get_function_calls().
            for fc in event.get_function_calls():
                tool_calls.append(
                    ToolCall(
                        tool_name=fc.name,
                        arguments=dict(fc.args) if fc.args else {},
                        agent_name=current_agent_name,
                    )
                )

            # Pair each function response with the first unanswered call of
            # the same name (assumes FIFO ordering of same-named calls).
            for fr in event.get_function_responses():
                for call in tool_calls:
                    if call.tool_name == fr.name and call.result is None:
                        call.result = fr.response
                        break

            # Capture text responses from content parts.
            if hasattr(event, "content") and event.content and hasattr(event.content, "parts"):
                for part in event.content.parts:
                    text = getattr(part, "text", None)
                    if not text:
                        continue
                    agent_text_parts.append(text)

                    # Check for terminal state markers.
                    if "TERMINAL_STATE:" in text:
                        tail = text.split("TERMINAL_STATE:")[-1].strip()
                        # BUG FIX: the original did ``tail.split()[0]``
                        # unguarded, raising IndexError when the text ended
                        # with a bare "TERMINAL_STATE:" and no state token.
                        if tail:
                            terminal_state = tail.split()[0]

        self._current_agent = current_agent_name

        response = AgentResponse(
            content=" ".join(agent_text_parts),
            tool_calls=tool_calls,
            terminal_state=terminal_state,
        )
        response.agent_name = current_agent_name
        response.agent_transfers = list(self._agent_transfers)
        return response

    def stop(self) -> None:
        """Clean up the ADK session and drop runner/session/mocks references."""
        self._runner = None
        self._session = None
        self._mocks = None
|