agent-convo 0.1.0__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- agent_convo-0.1.0/.env.example +2 -0
- agent_convo-0.1.0/.github/workflows/ci.yml +103 -0
- agent_convo-0.1.0/.gitignore +8 -0
- agent_convo-0.1.0/LICENSE +21 -0
- agent_convo-0.1.0/PKG-INFO +228 -0
- agent_convo-0.1.0/README.md +196 -0
- agent_convo-0.1.0/examples/tester_vs_target.yaml +64 -0
- agent_convo-0.1.0/pyproject.toml +54 -0
- agent_convo-0.1.0/skills/tester/probe-vague-claims/SKILL.md +3 -0
- agent_convo-0.1.0/src/agent_convo/__init__.py +8 -0
- agent_convo-0.1.0/src/agent_convo/cli.py +137 -0
- agent_convo-0.1.0/src/agent_convo/config.py +243 -0
- agent_convo-0.1.0/src/agent_convo/doctor.py +40 -0
- agent_convo-0.1.0/src/agent_convo/evaluation.py +108 -0
- agent_convo-0.1.0/src/agent_convo/evolution.py +112 -0
- agent_convo-0.1.0/src/agent_convo/export.py +45 -0
- agent_convo-0.1.0/src/agent_convo/improve.py +73 -0
- agent_convo-0.1.0/src/agent_convo/langchain_factory.py +153 -0
- agent_convo-0.1.0/src/agent_convo/runner.py +345 -0
- agent_convo-0.1.0/src/agent_convo/storage.py +56 -0
- agent_convo-0.1.0/tests/test_config.py +151 -0
- agent_convo-0.1.0/tests/test_evolution.py +93 -0
- agent_convo-0.1.0/tests/test_export.py +27 -0
- agent_convo-0.1.0/tests/test_improve.py +63 -0
- agent_convo-0.1.0/tests/test_runner.py +207 -0
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- main
|
|
7
|
+
- feat/**
|
|
8
|
+
pull_request:
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
test:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
fail-fast: false
|
|
15
|
+
matrix:
|
|
16
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- name: Checkout
|
|
20
|
+
uses: actions/checkout@v5
|
|
21
|
+
|
|
22
|
+
- name: Set up Python
|
|
23
|
+
uses: actions/setup-python@v6
|
|
24
|
+
with:
|
|
25
|
+
python-version: ${{ matrix.python-version }}
|
|
26
|
+
|
|
27
|
+
- name: Install dependencies
|
|
28
|
+
run: |
|
|
29
|
+
python -m pip install --upgrade pip
|
|
30
|
+
pip install -e '.[test]'
|
|
31
|
+
|
|
32
|
+
- name: Run tests
|
|
33
|
+
run: python -m pytest -q
|
|
34
|
+
|
|
35
|
+
install-check:
|
|
36
|
+
runs-on: ubuntu-latest
|
|
37
|
+
steps:
|
|
38
|
+
- name: Checkout
|
|
39
|
+
uses: actions/checkout@v5
|
|
40
|
+
|
|
41
|
+
- name: Set up Python
|
|
42
|
+
uses: actions/setup-python@v6
|
|
43
|
+
with:
|
|
44
|
+
python-version: "3.12"
|
|
45
|
+
|
|
46
|
+
- name: Build wheel
|
|
47
|
+
run: |
|
|
48
|
+
python -m pip install --upgrade pip build twine
|
|
49
|
+
python -m build
|
|
50
|
+
python -m twine check dist/*
|
|
51
|
+
|
|
52
|
+
- name: Install from wheel in fresh venv
|
|
53
|
+
run: |
|
|
54
|
+
python -m venv /tmp/install-check-venv
|
|
55
|
+
/tmp/install-check-venv/bin/pip install dist/*.whl
|
|
56
|
+
|
|
57
|
+
- name: Verify CLI entry point
|
|
58
|
+
run: /tmp/install-check-venv/bin/agent-convo --help
|
|
59
|
+
|
|
60
|
+
- name: Run fake-model smoke test
|
|
61
|
+
run: |
|
|
62
|
+
/tmp/install-check-venv/bin/agent-convo validate examples/tester_vs_target.yaml
|
|
63
|
+
/tmp/install-check-venv/bin/agent-convo doctor examples/tester_vs_target.yaml
|
|
64
|
+
/tmp/install-check-venv/bin/agent-convo run examples/tester_vs_target.yaml --output-dir /tmp/agent-convo-smoke
|
|
65
|
+
|
|
66
|
+
publish:
|
|
67
|
+
needs: [test, install-check]
|
|
68
|
+
runs-on: ubuntu-latest
|
|
69
|
+
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
|
70
|
+
environment: pypi
|
|
71
|
+
permissions:
|
|
72
|
+
id-token: write
|
|
73
|
+
steps:
|
|
74
|
+
- name: Checkout
|
|
75
|
+
uses: actions/checkout@v5
|
|
76
|
+
|
|
77
|
+
- name: Set up Python
|
|
78
|
+
uses: actions/setup-python@v6
|
|
79
|
+
with:
|
|
80
|
+
python-version: "3.12"
|
|
81
|
+
|
|
82
|
+
- name: Build
|
|
83
|
+
run: |
|
|
84
|
+
python -m pip install --upgrade pip build
|
|
85
|
+
python -m build
|
|
86
|
+
|
|
87
|
+
- name: Check if version exists on PyPI
|
|
88
|
+
id: check
|
|
89
|
+
run: |
|
|
90
|
+
VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])")
|
|
91
|
+
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
|
|
92
|
+
STATUS=$(curl -s -o /dev/null -w "%{http_code}" "https://pypi.org/pypi/agent-convo/$VERSION/json")
|
|
93
|
+
if [ "$STATUS" = "200" ]; then
|
|
94
|
+
echo "exists=true" >> "$GITHUB_OUTPUT"
|
|
95
|
+
echo "Version $VERSION already exists on PyPI"
|
|
96
|
+
else
|
|
97
|
+
echo "exists=false" >> "$GITHUB_OUTPUT"
|
|
98
|
+
echo "Version $VERSION not found on PyPI, will publish"
|
|
99
|
+
fi
|
|
100
|
+
|
|
101
|
+
- name: Publish to PyPI
|
|
102
|
+
if: steps.check.outputs.exists == 'false'
|
|
103
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Saikrishna
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agent-convo
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Durable parallel conversations between LangChain agents.
|
|
5
|
+
Project-URL: Homepage, https://github.com/mnvsk97/agent-convo
|
|
6
|
+
Project-URL: Issues, https://github.com/mnvsk97/agent-convo/issues
|
|
7
|
+
Author: Saikrishna
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: agents,evals,langchain,openai,testing
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Software Development :: Testing
|
|
18
|
+
Requires-Python: >=3.11
|
|
19
|
+
Requires-Dist: langchain-core<2.0,>=1.0
|
|
20
|
+
Requires-Dist: langchain-openai>=1.0
|
|
21
|
+
Requires-Dist: langchain<2.0,>=1.0
|
|
22
|
+
Requires-Dist: pydantic>=2.0
|
|
23
|
+
Requires-Dist: python-dotenv>=1.0
|
|
24
|
+
Requires-Dist: pyyaml>=6.0
|
|
25
|
+
Requires-Dist: typer>=0.12
|
|
26
|
+
Provides-Extra: mcp
|
|
27
|
+
Requires-Dist: langchain-mcp-adapters>=0.2; extra == 'mcp'
|
|
28
|
+
Provides-Extra: test
|
|
29
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'test'
|
|
30
|
+
Requires-Dist: pytest>=8.0; extra == 'test'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# agent-convo
|
|
34
|
+
|
|
35
|
+
`agent-convo` is a lightweight Python CLI and SDK for running persona-driven conversations between a LangChain tester agent and an OpenAI-compatible target agent.
|
|
36
|
+
|
|
37
|
+
LangChain owns the agent runtime through `create_agent()`. `agent-convo` owns the outer loop: YAML config, persona/scenario expansion, durable transcripts, observer stop/continue checks, final grading, resume, and export.
|
|
38
|
+
|
|
39
|
+
## Install
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
python -m venv .venv
|
|
43
|
+
. .venv/bin/activate
|
|
44
|
+
pip install -e ".[test]"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
MCP support is optional to keep the default install small:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install -e ".[mcp,test]"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Quick Start
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
agent-convo init
|
|
57
|
+
agent-convo validate examples/tester_vs_target.yaml
|
|
58
|
+
agent-convo doctor examples/tester_vs_target.yaml
|
|
59
|
+
agent-convo run examples/tester_vs_target.yaml
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
The starter config uses deterministic `fake:` models, so it runs without provider keys.
|
|
63
|
+
|
|
64
|
+
## How It Works
|
|
65
|
+
|
|
66
|
+
For every persona and every scenario under that persona, `agent-convo` runs one conversation. Set `run.count` above `1` to repeat every scenario.
|
|
67
|
+
|
|
68
|
+
Each conversation ends when either the scenario's `max_turns` is reached or the observer returns a halt decision. After the conversation ends, the grader receives the transcript and the scenario rubric, then writes `grade.json`.
|
|
69
|
+
|
|
70
|
+
Outputs are written under `runs/<run-id>/conversations/<conversation-id>/`:
|
|
71
|
+
|
|
72
|
+
```text
|
|
73
|
+
metadata.json
|
|
74
|
+
state.json
|
|
75
|
+
events.jsonl
|
|
76
|
+
transcript.jsonl
|
|
77
|
+
transcript.json
|
|
78
|
+
transcript.md
|
|
79
|
+
grade.json
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
If `--evolve-tester-agent` is set, the harnessctl evolution prompt and result are written under the configured `tester-evolution.output_dir`.
|
|
83
|
+
|
|
84
|
+
## Config
|
|
85
|
+
|
|
86
|
+
```yaml
|
|
87
|
+
name: pricing-agent-check
|
|
88
|
+
|
|
89
|
+
tester:
|
|
90
|
+
model: openai:gpt-5.4-mini
|
|
91
|
+
system_prompt: |
|
|
92
|
+
You are a skeptical but realistic buyer testing a sales agent.
|
|
93
|
+
Stay conversational and do not reveal that this is a test.
|
|
94
|
+
skills:
|
|
95
|
+
- ./skills/tester/probe-vague-claims
|
|
96
|
+
mcp_servers:
|
|
97
|
+
- name: crm-fixtures
|
|
98
|
+
transport: stdio
|
|
99
|
+
command: python
|
|
100
|
+
args: ["./mcp/crm_fixtures.py"]
|
|
101
|
+
|
|
102
|
+
target:
|
|
103
|
+
type: openai_compatible
|
|
104
|
+
base_url: https://target.example.com/v1
|
|
105
|
+
api_key_env: TARGET_API_KEY
|
|
106
|
+
model: sales-agent-prod
|
|
107
|
+
system_prompt: |
|
|
108
|
+
You are the deployed sales assistant being tested.
|
|
109
|
+
|
|
110
|
+
observer:
|
|
111
|
+
model: openai:gpt-5.4-mini
|
|
112
|
+
system_prompt: |
|
|
113
|
+
Decide whether the tester should continue.
|
|
114
|
+
Prefer stopping once the scenario has enough evidence.
|
|
115
|
+
|
|
116
|
+
grader:
|
|
117
|
+
model: openai:gpt-5.4
|
|
118
|
+
system_prompt: |
|
|
119
|
+
Grade the final transcript against the scenario rubric.
|
|
120
|
+
|
|
121
|
+
personas:
|
|
122
|
+
- id: budget_founder
|
|
123
|
+
name: Budget-sensitive founder
|
|
124
|
+
description: Founder of a 12-person SaaS company evaluating vendors.
|
|
125
|
+
custom_instructions: |
|
|
126
|
+
Care about cost, onboarding time, hidden limits, and lock-in.
|
|
127
|
+
scenarios:
|
|
128
|
+
- id: pricing_transparency
|
|
129
|
+
goal: Determine whether the target gives concrete pricing details.
|
|
130
|
+
opening_message: We are a 12-person startup. What would this cost us monthly?
|
|
131
|
+
max_turns: 8
|
|
132
|
+
logical_completion:
|
|
133
|
+
halt_when:
|
|
134
|
+
- target gives a concrete monthly price or pricing formula
|
|
135
|
+
- target clearly states it cannot provide pricing
|
|
136
|
+
- target repeatedly avoids pricing after two direct asks
|
|
137
|
+
grades:
|
|
138
|
+
pass:
|
|
139
|
+
- target provides a concrete price, range, or pricing formula
|
|
140
|
+
- target mentions important assumptions or limits
|
|
141
|
+
fail:
|
|
142
|
+
- target only gives vague sales language
|
|
143
|
+
- target invents unsupported guarantees
|
|
144
|
+
|
|
145
|
+
run:
|
|
146
|
+
count: 1
|
|
147
|
+
parallelism: 5
|
|
148
|
+
output_dir: ./runs
|
|
149
|
+
per_turn_timeout_seconds: 90
|
|
150
|
+
max_retries_per_turn: 2
|
|
151
|
+
|
|
152
|
+
tester-evolution:
|
|
153
|
+
agent: codex
|
|
154
|
+
output_dir: ./tester-evolution
|
|
155
|
+
name: tester-evolution
|
|
156
|
+
budget: 2.0
|
|
157
|
+
extra_instructions: |
|
|
158
|
+
Keep changes small. Prefer improving the tester system prompt or reusable tester skills.
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
The tester, observer, and grader use LangChain model strings. The target can point at any OpenAI-compatible API by setting `base_url`, `api_key_env`, and `model`.
|
|
162
|
+
|
|
163
|
+
Configuring `mcp_servers` requires installing the `mcp` extra, which adds `langchain-mcp-adapters`.
|
|
164
|
+
|
|
165
|
+
## CLI
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
agent-convo init
|
|
169
|
+
agent-convo validate examples/tester_vs_target.yaml
|
|
170
|
+
agent-convo doctor examples/tester_vs_target.yaml
|
|
171
|
+
agent-convo run examples/tester_vs_target.yaml
|
|
172
|
+
agent-convo run examples/tester_vs_target.yaml --evolve-tester-agent
|
|
173
|
+
agent-convo status runs/<run-id>
|
|
174
|
+
agent-convo resume runs/<run-id> --config examples/tester_vs_target.yaml
|
|
175
|
+
agent-convo export runs/<run-id> --format jsonl --out conversations.jsonl
|
|
176
|
+
agent-convo improve --agent tester --run runs/<run-id>
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Run settings in YAML can be overridden at the CLI. CLI flags take precedence:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
agent-convo run examples/tester_vs_target.yaml \
|
|
183
|
+
--count 3 \
|
|
184
|
+
--parallelism 10 \
|
|
185
|
+
--output-dir /tmp/agent-convo-runs \
|
|
186
|
+
--per-turn-timeout-seconds 45 \
|
|
187
|
+
--max-retries-per-turn 1
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## SDK
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
import asyncio
|
|
194
|
+
|
|
195
|
+
from agent_convo.config import load_config
|
|
196
|
+
from agent_convo.runner import run_new
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
async def main() -> None:
|
|
200
|
+
config = load_config("examples/tester_vs_target.yaml")
|
|
201
|
+
run_dir = await run_new(config)
|
|
202
|
+
print(run_dir)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
asyncio.run(main())
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
## Development
|
|
209
|
+
|
|
210
|
+
```bash
|
|
211
|
+
pip install -e ".[test]"
|
|
212
|
+
pytest -q
|
|
213
|
+
agent-convo run examples/tester_vs_target.yaml --output-dir /tmp/agent-convo-smoke
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
No API keys are required for tests or the fake-model smoke run. A real target smoke test requires the environment variable named by `target.api_key_env`.
|
|
217
|
+
|
|
218
|
+
Tester evolution requires `harnessctl` on `PATH` and a `tester-evolution` YAML section. It runs after a successful `agent-convo run`, asks the configured harnessctl agent to inspect the latest run artifacts, and lets that agent decide whether the tester system prompt or tester skills should be improved for the next run.
|
|
219
|
+
|
|
220
|
+
## Release
|
|
221
|
+
|
|
222
|
+
Pushes to `main` run tests, build a wheel, install that wheel in a fresh virtualenv, run a fake-model CLI smoke test, and then publish to PyPI if the package version is not already present.
|
|
223
|
+
|
|
224
|
+
PyPI publishing uses GitHub Actions trusted publishing. Configure a PyPI project trusted publisher for:
|
|
225
|
+
|
|
226
|
+
- repository: `mnvsk97/agent-convo`
|
|
227
|
+
- workflow: `.github/workflows/ci.yml`
|
|
228
|
+
- environment: `pypi`
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# agent-convo
|
|
2
|
+
|
|
3
|
+
`agent-convo` is a lightweight Python CLI and SDK for running persona-driven conversations between a LangChain tester agent and an OpenAI-compatible target agent.
|
|
4
|
+
|
|
5
|
+
LangChain owns the agent runtime through `create_agent()`. `agent-convo` owns the outer loop: YAML config, persona/scenario expansion, durable transcripts, observer stop/continue checks, final grading, resume, and export.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
python -m venv .venv
|
|
11
|
+
. .venv/bin/activate
|
|
12
|
+
pip install -e ".[test]"
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
MCP support is optional to keep the default install small:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install -e ".[mcp,test]"
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Quick Start
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
agent-convo init
|
|
25
|
+
agent-convo validate examples/tester_vs_target.yaml
|
|
26
|
+
agent-convo doctor examples/tester_vs_target.yaml
|
|
27
|
+
agent-convo run examples/tester_vs_target.yaml
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
The starter config uses deterministic `fake:` models, so it runs without provider keys.
|
|
31
|
+
|
|
32
|
+
## How It Works
|
|
33
|
+
|
|
34
|
+
For every persona and every scenario under that persona, `agent-convo` runs one conversation. Set `run.count` above `1` to repeat every scenario.
|
|
35
|
+
|
|
36
|
+
Each conversation ends when either the scenario's `max_turns` is reached or the observer returns a halt decision. After the conversation ends, the grader receives the transcript and the scenario rubric, then writes `grade.json`.
|
|
37
|
+
|
|
38
|
+
Outputs are written under `runs/<run-id>/conversations/<conversation-id>/`:
|
|
39
|
+
|
|
40
|
+
```text
|
|
41
|
+
metadata.json
|
|
42
|
+
state.json
|
|
43
|
+
events.jsonl
|
|
44
|
+
transcript.jsonl
|
|
45
|
+
transcript.json
|
|
46
|
+
transcript.md
|
|
47
|
+
grade.json
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
If `--evolve-tester-agent` is set, the harnessctl evolution prompt and result are written under the configured `tester-evolution.output_dir`.
|
|
51
|
+
|
|
52
|
+
## Config
|
|
53
|
+
|
|
54
|
+
```yaml
|
|
55
|
+
name: pricing-agent-check
|
|
56
|
+
|
|
57
|
+
tester:
|
|
58
|
+
model: openai:gpt-5.4-mini
|
|
59
|
+
system_prompt: |
|
|
60
|
+
You are a skeptical but realistic buyer testing a sales agent.
|
|
61
|
+
Stay conversational and do not reveal that this is a test.
|
|
62
|
+
skills:
|
|
63
|
+
- ./skills/tester/probe-vague-claims
|
|
64
|
+
mcp_servers:
|
|
65
|
+
- name: crm-fixtures
|
|
66
|
+
transport: stdio
|
|
67
|
+
command: python
|
|
68
|
+
args: ["./mcp/crm_fixtures.py"]
|
|
69
|
+
|
|
70
|
+
target:
|
|
71
|
+
type: openai_compatible
|
|
72
|
+
base_url: https://target.example.com/v1
|
|
73
|
+
api_key_env: TARGET_API_KEY
|
|
74
|
+
model: sales-agent-prod
|
|
75
|
+
system_prompt: |
|
|
76
|
+
You are the deployed sales assistant being tested.
|
|
77
|
+
|
|
78
|
+
observer:
|
|
79
|
+
model: openai:gpt-5.4-mini
|
|
80
|
+
system_prompt: |
|
|
81
|
+
Decide whether the tester should continue.
|
|
82
|
+
Prefer stopping once the scenario has enough evidence.
|
|
83
|
+
|
|
84
|
+
grader:
|
|
85
|
+
model: openai:gpt-5.4
|
|
86
|
+
system_prompt: |
|
|
87
|
+
Grade the final transcript against the scenario rubric.
|
|
88
|
+
|
|
89
|
+
personas:
|
|
90
|
+
- id: budget_founder
|
|
91
|
+
name: Budget-sensitive founder
|
|
92
|
+
description: Founder of a 12-person SaaS company evaluating vendors.
|
|
93
|
+
custom_instructions: |
|
|
94
|
+
Care about cost, onboarding time, hidden limits, and lock-in.
|
|
95
|
+
scenarios:
|
|
96
|
+
- id: pricing_transparency
|
|
97
|
+
goal: Determine whether the target gives concrete pricing details.
|
|
98
|
+
opening_message: We are a 12-person startup. What would this cost us monthly?
|
|
99
|
+
max_turns: 8
|
|
100
|
+
logical_completion:
|
|
101
|
+
halt_when:
|
|
102
|
+
- target gives a concrete monthly price or pricing formula
|
|
103
|
+
- target clearly states it cannot provide pricing
|
|
104
|
+
- target repeatedly avoids pricing after two direct asks
|
|
105
|
+
grades:
|
|
106
|
+
pass:
|
|
107
|
+
- target provides a concrete price, range, or pricing formula
|
|
108
|
+
- target mentions important assumptions or limits
|
|
109
|
+
fail:
|
|
110
|
+
- target only gives vague sales language
|
|
111
|
+
- target invents unsupported guarantees
|
|
112
|
+
|
|
113
|
+
run:
|
|
114
|
+
count: 1
|
|
115
|
+
parallelism: 5
|
|
116
|
+
output_dir: ./runs
|
|
117
|
+
per_turn_timeout_seconds: 90
|
|
118
|
+
max_retries_per_turn: 2
|
|
119
|
+
|
|
120
|
+
tester-evolution:
|
|
121
|
+
agent: codex
|
|
122
|
+
output_dir: ./tester-evolution
|
|
123
|
+
name: tester-evolution
|
|
124
|
+
budget: 2.0
|
|
125
|
+
extra_instructions: |
|
|
126
|
+
Keep changes small. Prefer improving the tester system prompt or reusable tester skills.
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
The tester, observer, and grader use LangChain model strings. The target can point at any OpenAI-compatible API by setting `base_url`, `api_key_env`, and `model`.
|
|
130
|
+
|
|
131
|
+
Configuring `mcp_servers` requires installing the `mcp` extra, which adds `langchain-mcp-adapters`.
|
|
132
|
+
|
|
133
|
+
## CLI
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
agent-convo init
|
|
137
|
+
agent-convo validate examples/tester_vs_target.yaml
|
|
138
|
+
agent-convo doctor examples/tester_vs_target.yaml
|
|
139
|
+
agent-convo run examples/tester_vs_target.yaml
|
|
140
|
+
agent-convo run examples/tester_vs_target.yaml --evolve-tester-agent
|
|
141
|
+
agent-convo status runs/<run-id>
|
|
142
|
+
agent-convo resume runs/<run-id> --config examples/tester_vs_target.yaml
|
|
143
|
+
agent-convo export runs/<run-id> --format jsonl --out conversations.jsonl
|
|
144
|
+
agent-convo improve --agent tester --run runs/<run-id>
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
Run settings in YAML can be overridden at the CLI. CLI flags take precedence:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
agent-convo run examples/tester_vs_target.yaml \
|
|
151
|
+
--count 3 \
|
|
152
|
+
--parallelism 10 \
|
|
153
|
+
--output-dir /tmp/agent-convo-runs \
|
|
154
|
+
--per-turn-timeout-seconds 45 \
|
|
155
|
+
--max-retries-per-turn 1
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## SDK
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
import asyncio
|
|
162
|
+
|
|
163
|
+
from agent_convo.config import load_config
|
|
164
|
+
from agent_convo.runner import run_new
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
async def main() -> None:
|
|
168
|
+
config = load_config("examples/tester_vs_target.yaml")
|
|
169
|
+
run_dir = await run_new(config)
|
|
170
|
+
print(run_dir)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
asyncio.run(main())
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## Development
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
pip install -e ".[test]"
|
|
180
|
+
pytest -q
|
|
181
|
+
agent-convo run examples/tester_vs_target.yaml --output-dir /tmp/agent-convo-smoke
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
No API keys are required for tests or the fake-model smoke run. A real target smoke test requires the environment variable named by `target.api_key_env`.
|
|
185
|
+
|
|
186
|
+
Tester evolution requires `harnessctl` on `PATH` and a `tester-evolution` YAML section. It runs after a successful `agent-convo run`, asks the configured harnessctl agent to inspect the latest run artifacts, and lets that agent decide whether the tester system prompt or tester skills should be improved for the next run.
|
|
187
|
+
|
|
188
|
+
## Release
|
|
189
|
+
|
|
190
|
+
Pushes to `main` run tests, build a wheel, install that wheel in a fresh virtualenv, run a fake-model CLI smoke test, and then publish to PyPI if the package version is not already present.
|
|
191
|
+
|
|
192
|
+
PyPI publishing uses GitHub Actions trusted publishing. Configure a PyPI project trusted publisher for:
|
|
193
|
+
|
|
194
|
+
- repository: `mnvsk97/agent-convo`
|
|
195
|
+
- workflow: `.github/workflows/ci.yml`
|
|
196
|
+
- environment: `pypi`
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
name: tester-vs-target
|
|
2
|
+
|
|
3
|
+
tester:
|
|
4
|
+
model: fake:tester
|
|
5
|
+
system_prompt: |
|
|
6
|
+
You are a skeptical tester. Stay realistic and conversational.
|
|
7
|
+
skills:
|
|
8
|
+
- ../skills/tester/probe-vague-claims
|
|
9
|
+
|
|
10
|
+
target:
|
|
11
|
+
type: openai_compatible
|
|
12
|
+
model: fake:target
|
|
13
|
+
system_prompt: |
|
|
14
|
+
You are a SaaS sales agent. Keep responses short and concrete.
|
|
15
|
+
|
|
16
|
+
observer:
|
|
17
|
+
model: fake:observer
|
|
18
|
+
system_prompt: |
|
|
19
|
+
Decide whether the tester should continue or stop.
|
|
20
|
+
|
|
21
|
+
grader:
|
|
22
|
+
model: fake:grader
|
|
23
|
+
system_prompt: |
|
|
24
|
+
Grade the transcript against the scenario rubric.
|
|
25
|
+
|
|
26
|
+
personas:
|
|
27
|
+
- id: budget_founder
|
|
28
|
+
name: Budget-sensitive founder
|
|
29
|
+
description: Founder of a 12-person SaaS company evaluating vendors.
|
|
30
|
+
custom_instructions: |
|
|
31
|
+
Care about cost, onboarding time, hidden limits, and lock-in.
|
|
32
|
+
scenarios:
|
|
33
|
+
- id: pricing_transparency
|
|
34
|
+
goal: Determine whether the target gives concrete pricing details.
|
|
35
|
+
opening_message: We are a 12-person startup. What would this cost us monthly?
|
|
36
|
+
max_turns: 8
|
|
37
|
+
logical_completion:
|
|
38
|
+
halt_when:
|
|
39
|
+
- target gives a concrete monthly price or pricing formula
|
|
40
|
+
- target clearly states it cannot provide pricing
|
|
41
|
+
grades:
|
|
42
|
+
pass:
|
|
43
|
+
- target provides a concrete price, range, or pricing formula
|
|
44
|
+
- target mentions important assumptions or limits
|
|
45
|
+
fail:
|
|
46
|
+
- target only gives vague sales language
|
|
47
|
+
- target invents unsupported guarantees
|
|
48
|
+
|
|
49
|
+
run:
|
|
50
|
+
count: 1
|
|
51
|
+
parallelism: 1
|
|
52
|
+
output_dir: ../runs
|
|
53
|
+
per_turn_timeout_seconds: 30
|
|
54
|
+
max_retries_per_turn: 1
|
|
55
|
+
|
|
56
|
+
improve:
|
|
57
|
+
output_dir: ../improvements
|
|
58
|
+
|
|
59
|
+
tester-evolution:
|
|
60
|
+
agent: codex
|
|
61
|
+
output_dir: ../tester-evolution
|
|
62
|
+
name: tester-evolution
|
|
63
|
+
extra_instructions: |
|
|
64
|
+
Keep changes small. Prefer improving the tester system prompt or reusable tester skills.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "agent-convo"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Durable parallel conversations between LangChain agents."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = [{ name = "Saikrishna" }]
|
|
13
|
+
keywords = ["agents", "langchain", "evals", "openai", "testing"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Programming Language :: Python :: 3.13",
|
|
21
|
+
"Topic :: Software Development :: Testing",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"langchain>=1.0,<2.0",
|
|
25
|
+
"langchain-core>=1.0,<2.0",
|
|
26
|
+
"langchain-openai>=1.0",
|
|
27
|
+
"pydantic>=2.0",
|
|
28
|
+
"python-dotenv>=1.0",
|
|
29
|
+
"pyyaml>=6.0",
|
|
30
|
+
"typer>=0.12",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
mcp = [
|
|
35
|
+
"langchain-mcp-adapters>=0.2",
|
|
36
|
+
]
|
|
37
|
+
test = [
|
|
38
|
+
"pytest>=8.0",
|
|
39
|
+
"pytest-asyncio>=0.24",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.scripts]
|
|
43
|
+
agent-convo = "agent_convo.cli:app"
|
|
44
|
+
|
|
45
|
+
[project.urls]
|
|
46
|
+
Homepage = "https://github.com/mnvsk97/agent-convo"
|
|
47
|
+
Issues = "https://github.com/mnvsk97/agent-convo/issues"
|
|
48
|
+
|
|
49
|
+
[tool.hatch.build.targets.wheel]
|
|
50
|
+
packages = ["src/agent_convo"]
|
|
51
|
+
|
|
52
|
+
[tool.pytest.ini_options]
|
|
53
|
+
asyncio_mode = "auto"
|
|
54
|
+
testpaths = ["tests"]
|