agent-eval-rpc 0.21.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_eval_rpc-0.21.0/.gitignore +8 -0
- agent_eval_rpc-0.21.0/PKG-INFO +198 -0
- agent_eval_rpc-0.21.0/README.md +170 -0
- agent_eval_rpc-0.21.0/pyproject.toml +54 -0
- agent_eval_rpc-0.21.0/src/agent_eval_rpc/__init__.py +67 -0
- agent_eval_rpc-0.21.0/src/agent_eval_rpc/client.py +215 -0
- agent_eval_rpc-0.21.0/src/agent_eval_rpc/errors.py +52 -0
- agent_eval_rpc-0.21.0/src/agent_eval_rpc/models.py +124 -0
- agent_eval_rpc-0.21.0/tests/test_models.py +67 -0
- agent_eval_rpc-0.21.0/tests/test_subprocess.py +87 -0
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agent-eval-rpc
|
|
3
|
+
Version: 0.21.0
|
|
4
|
+
Summary: Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client.
|
|
5
|
+
Project-URL: Homepage, https://github.com/tangle-network/agent-eval
|
|
6
|
+
Project-URL: Issues, https://github.com/tangle-network/agent-eval/issues
|
|
7
|
+
Project-URL: Documentation, https://github.com/tangle-network/agent-eval/blob/main/clients/python/README.md
|
|
8
|
+
Author: Tangle Network
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: agent,evaluation,judge,llm,rubric
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: httpx>=0.27
|
|
22
|
+
Requires-Dist: pydantic>=2.6
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
25
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# agent-eval-rpc — Python client
|
|
30
|
+
|
|
31
|
+
Python client for [`@tangle-network/agent-eval`](https://github.com/tangle-network/agent-eval) — a content/code judging framework written in TypeScript. This package is a **thin transport adapter**: every judgement runs in the Node runtime, marshalled over HTTP or stdio RPC. Two languages, one implementation. No drift.
|
|
32
|
+
|
|
33
|
+
## What you get
|
|
34
|
+
|
|
35
|
+
A function-call interface to score any string against a rubric:
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from agent_eval_rpc import Client
|
|
39
|
+
|
|
40
|
+
client = Client() # auto-detects HTTP server, falls back to subprocess
|
|
41
|
+
result = client.judge(
|
|
42
|
+
content="We just launched zero-copy IO between agents and their workdir",
|
|
43
|
+
rubric_name="anti-slop",
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
print(result.composite) # 0.0..1.0 — single number to gate on
|
|
47
|
+
print(result.dimensions) # {"buyer_quality": 0.7, "voice": 0.8, "signal": 0.9}
|
|
48
|
+
print(result.failure_modes) # [] or ["ai-cadence", "marketing-tone", ...]
|
|
49
|
+
print(result.wins) # ["specific-component", "earned-detail", ...]
|
|
50
|
+
print(result.rationale) # "The post names a real architectural detail..."
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
That's the entire surface for content judging.
|
|
54
|
+
|
|
55
|
+
## Install
|
|
56
|
+
|
|
57
|
+
```sh
|
|
58
|
+
cd clients/python
|
|
59
|
+
pip install -e .
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
To use it, **one of**:
|
|
63
|
+
|
|
64
|
+
- `npm install -g @tangle-network/agent-eval` — gives you the `agent-eval` binary, used by the subprocess transport (works offline, slower per call due to Node startup ~500ms).
|
|
65
|
+
- Run a server: `agent-eval serve --port 5005` — gives you HTTP transport (~10ms per call once up).
|
|
66
|
+
|
|
67
|
+
The Python client picks whichever is available. Force one with `Client(transport="http")` or `Client(transport="subprocess")`.
|
|
68
|
+
|
|
69
|
+
## Why the architecture works this way
|
|
70
|
+
|
|
71
|
+
The TypeScript package is the source of truth for evaluation logic. We don't reimplement rubrics, scoring, or judges in Python — we marshal JSON to the canonical runtime over a versioned wire protocol (defined as Zod schemas, exported as OpenAPI, mirrored in this package as pydantic models).
|
|
72
|
+
|
|
73
|
+
Adding a new method to the API means: define a Zod schema in `src/wire/schemas.ts`, write the handler in `src/wire/handlers.ts`, and the Python client picks it up on the next regeneration. **There is no separate Python implementation to maintain.**
|
|
74
|
+
|
|
75
|
+
This is the same pattern as the Anthropic SDK, Stripe SDK, and gRPC: one canonical implementation, language-specific transport clients.
|
|
76
|
+
|
|
77
|
+
## API
|
|
78
|
+
|
|
79
|
+
### `Client`
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
Client(
|
|
83
|
+
base_url: str | None = None, # AGENT_EVAL_URL or http://127.0.0.1:5005
|
|
84
|
+
cli_path: str | None = None, # AGENT_EVAL_CLI or 'agent-eval'
|
|
85
|
+
transport: "auto" | "http" | "subprocess" = "auto",
|
|
86
|
+
timeout_s: float = 120.0,
|
|
87
|
+
)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### `client.judge(...)`
|
|
91
|
+
|
|
92
|
+
Score a piece of content against a rubric.
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
def judge(
|
|
96
|
+
*,
|
|
97
|
+
content: str, # the text being judged
|
|
98
|
+
rubric_name: str | None = None, # OR
|
|
99
|
+
rubric: Rubric | dict | None = None, # an inline rubric definition
|
|
100
|
+
context: dict | None = None, # free-form metadata for the judge
|
|
101
|
+
model: str | None = None, # override the judge LLM
|
|
102
|
+
) -> JudgeResult
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
**Either** `rubric_name` (use a built-in like `"anti-slop"`) **or** `rubric` (an inline definition with your own dimensions/prompt). Not both.
|
|
106
|
+
|
|
107
|
+
**Returns** `JudgeResult`:
|
|
108
|
+
- `composite: float` — weighted score in 0..1. The single number to gate on.
|
|
109
|
+
- `dimensions: dict[str, float]` — per-axis scores (e.g. `{"buyer_quality": 0.7}`).
|
|
110
|
+
- `failure_modes: list[str]` — ids of negative patterns detected.
|
|
111
|
+
- `wins: list[str]` — ids of positive patterns detected.
|
|
112
|
+
- `rationale: str` — plain-English explanation.
|
|
113
|
+
- `rubric_version: str` — stable hash of the rubric used. Compare scores only when this matches.
|
|
114
|
+
- `model: str` — LLM that produced the judgement.
|
|
115
|
+
- `duration_ms: int` — wall-clock latency.
|
|
116
|
+
|
|
117
|
+
### `client.list_rubrics()`
|
|
118
|
+
|
|
119
|
+
Return every rubric the server has registered, with their dimensions and stable `rubric_version`.
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
rubrics = client.list_rubrics()
|
|
123
|
+
for r in rubrics.rubrics:
|
|
124
|
+
print(r.name, r.description, r.rubric_version)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### `client.version()`
|
|
128
|
+
|
|
129
|
+
Return server + wire-protocol version. Match your `pip install` version to `version`; check `wire_version` for compatibility.
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
v = client.version()
|
|
133
|
+
assert v.version.startswith("0.21")
|
|
134
|
+
assert v.wire_version == "1.0.0"
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Defining a custom rubric
|
|
138
|
+
|
|
139
|
+
Built-in `anti-slop` is tuned for technical-buyer audiences. For different scoring, pass a `Rubric` inline:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from agent_eval_rpc import Client, Rubric, RubricDimension, FailureMode
|
|
143
|
+
|
|
144
|
+
rubric = Rubric(
|
|
145
|
+
name="my-rubric",
|
|
146
|
+
description="Does this commit message explain WHY, not just what?",
|
|
147
|
+
systemPrompt="You score commit messages. Score 0..1 on whether the WHY is clear...",
|
|
148
|
+
dimensions=[
|
|
149
|
+
RubricDimension(id="explains_why", description="Does the message say *why*?", weight=1.0),
|
|
150
|
+
],
|
|
151
|
+
failureModes=[
|
|
152
|
+
FailureMode(id="what-not-why", description="States the change but not the reason"),
|
|
153
|
+
],
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
result = client.judge(content="bumped the version", rubric=rubric)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## Errors
|
|
160
|
+
|
|
161
|
+
| Exception | When |
|
|
162
|
+
|---|---|
|
|
163
|
+
| `ValidationError` | Server (or pydantic) rejected the request as malformed. Fix your inputs. |
|
|
164
|
+
| `RubricNotFoundError` | Unknown `rubric_name`. Call `list_rubrics()` to see what's registered. |
|
|
165
|
+
| `TransportError` | HTTP unreachable or subprocess failed. Retry or check the server. |
|
|
166
|
+
| `AgentEvalError` | Base class — catches everything above. |
|
|
167
|
+
|
|
168
|
+
All errors carry `.code` and `.details` (the structured payload from the server).
|
|
169
|
+
|
|
170
|
+
## Versioning
|
|
171
|
+
|
|
172
|
+
This package is **version-locked** to the npm package. `agent-eval-rpc==0.21.0` ↔ `@tangle-network/agent-eval@0.21.0`. CI verifies the npm package, Python package, runtime `__version__`, and release tag all agree before publish. If one registry publish fails after the other succeeds, retry the failed publish from the same tag or supersede with the next patch release.
|
|
173
|
+
|
|
174
|
+
`wire_version` is separate. It bumps only on breaking schema changes. Package versions can differ across releases as long as `wire_version` is the same.
|
|
175
|
+
|
|
176
|
+
## Development
|
|
177
|
+
|
|
178
|
+
```sh
|
|
179
|
+
# install in editable mode
|
|
180
|
+
pip install -e ".[dev]"
|
|
181
|
+
|
|
182
|
+
# unit tests (no Node required)
|
|
183
|
+
pytest tests/test_models.py
|
|
184
|
+
|
|
185
|
+
# integration tests against the bundled CLI
|
|
186
|
+
cd ../.. && pnpm build # build the agent-eval CLI in repo root
|
|
187
|
+
cd clients/python && pytest # runs subprocess tests against dist/cli.js
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## Adding a new method
|
|
191
|
+
|
|
192
|
+
When the TS side adds a new endpoint (say `evaluateScenario`):
|
|
193
|
+
1. Update `src/wire/schemas.ts` with `EvaluateScenarioRequestSchema` and `EvaluateScenarioResponseSchema`.
|
|
194
|
+
2. Add a handler in `src/wire/handlers.ts`, route in `src/wire/server.ts`, and case in `src/wire/rpc.ts`.
|
|
195
|
+
3. In this client, add the matching pydantic model in `models.py` and method on `Client`. The pattern is mechanical — copy the shape from `judge`.
|
|
196
|
+
4. Test in both languages. Bump versions together.
|
|
197
|
+
|
|
198
|
+
A future iteration moves step 3 to `datamodel-code-generator -i openapi.json` so it's mechanical-and-automatic instead of mechanical-by-hand. Until the surface grows past ~10 endpoints, hand-written models are more readable.
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# agent-eval-rpc — Python client
|
|
2
|
+
|
|
3
|
+
Python client for [`@tangle-network/agent-eval`](https://github.com/tangle-network/agent-eval) — a content/code judging framework written in TypeScript. This package is a **thin transport adapter**: every judgement runs in the Node runtime, marshalled over HTTP or stdio RPC. Two languages, one implementation. No drift.
|
|
4
|
+
|
|
5
|
+
## What you get
|
|
6
|
+
|
|
7
|
+
A function-call interface to score any string against a rubric:
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from agent_eval_rpc import Client
|
|
11
|
+
|
|
12
|
+
client = Client() # auto-detects HTTP server, falls back to subprocess
|
|
13
|
+
result = client.judge(
|
|
14
|
+
content="We just launched zero-copy IO between agents and their workdir",
|
|
15
|
+
rubric_name="anti-slop",
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
print(result.composite) # 0.0..1.0 — single number to gate on
|
|
19
|
+
print(result.dimensions) # {"buyer_quality": 0.7, "voice": 0.8, "signal": 0.9}
|
|
20
|
+
print(result.failure_modes) # [] or ["ai-cadence", "marketing-tone", ...]
|
|
21
|
+
print(result.wins) # ["specific-component", "earned-detail", ...]
|
|
22
|
+
print(result.rationale) # "The post names a real architectural detail..."
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
That's the entire surface for content judging.
|
|
26
|
+
|
|
27
|
+
## Install
|
|
28
|
+
|
|
29
|
+
```sh
|
|
30
|
+
cd clients/python
|
|
31
|
+
pip install -e .
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
To use it, **one of**:
|
|
35
|
+
|
|
36
|
+
- `npm install -g @tangle-network/agent-eval` — gives you the `agent-eval` binary, used by the subprocess transport (works offline, slower per call due to Node startup ~500ms).
|
|
37
|
+
- Run a server: `agent-eval serve --port 5005` — gives you HTTP transport (~10ms per call once up).
|
|
38
|
+
|
|
39
|
+
The Python client picks whichever is available. Force one with `Client(transport="http")` or `Client(transport="subprocess")`.
|
|
40
|
+
|
|
41
|
+
## Why the architecture works this way
|
|
42
|
+
|
|
43
|
+
The TypeScript package is the source of truth for evaluation logic. We don't reimplement rubrics, scoring, or judges in Python — we marshal JSON to the canonical runtime over a versioned wire protocol (defined as Zod schemas, exported as OpenAPI, mirrored in this package as pydantic models).
|
|
44
|
+
|
|
45
|
+
Adding a new method to the API means: define a Zod schema in `src/wire/schemas.ts`, write the handler in `src/wire/handlers.ts`, and the Python client picks it up on the next regeneration. **There is no separate Python implementation to maintain.**
|
|
46
|
+
|
|
47
|
+
This is the same pattern as the Anthropic SDK, Stripe SDK, and gRPC: one canonical implementation, language-specific transport clients.
|
|
48
|
+
|
|
49
|
+
## API
|
|
50
|
+
|
|
51
|
+
### `Client`
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
Client(
|
|
55
|
+
base_url: str | None = None, # AGENT_EVAL_URL or http://127.0.0.1:5005
|
|
56
|
+
cli_path: str | None = None, # AGENT_EVAL_CLI or 'agent-eval'
|
|
57
|
+
transport: "auto" | "http" | "subprocess" = "auto",
|
|
58
|
+
timeout_s: float = 120.0,
|
|
59
|
+
)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### `client.judge(...)`
|
|
63
|
+
|
|
64
|
+
Score a piece of content against a rubric.
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
def judge(
|
|
68
|
+
*,
|
|
69
|
+
content: str, # the text being judged
|
|
70
|
+
rubric_name: str | None = None, # OR
|
|
71
|
+
rubric: Rubric | dict | None = None, # an inline rubric definition
|
|
72
|
+
context: dict | None = None, # free-form metadata for the judge
|
|
73
|
+
model: str | None = None, # override the judge LLM
|
|
74
|
+
) -> JudgeResult
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
**Either** `rubric_name` (use a built-in like `"anti-slop"`) **or** `rubric` (an inline definition with your own dimensions/prompt). Not both.
|
|
78
|
+
|
|
79
|
+
**Returns** `JudgeResult`:
|
|
80
|
+
- `composite: float` — weighted score in 0..1. The single number to gate on.
|
|
81
|
+
- `dimensions: dict[str, float]` — per-axis scores (e.g. `{"buyer_quality": 0.7}`).
|
|
82
|
+
- `failure_modes: list[str]` — ids of negative patterns detected.
|
|
83
|
+
- `wins: list[str]` — ids of positive patterns detected.
|
|
84
|
+
- `rationale: str` — plain-English explanation.
|
|
85
|
+
- `rubric_version: str` — stable hash of the rubric used. Compare scores only when this matches.
|
|
86
|
+
- `model: str` — LLM that produced the judgement.
|
|
87
|
+
- `duration_ms: int` — wall-clock latency.
|
|
88
|
+
|
|
89
|
+
### `client.list_rubrics()`
|
|
90
|
+
|
|
91
|
+
Return every rubric the server has registered, with their dimensions and stable `rubric_version`.
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
rubrics = client.list_rubrics()
|
|
95
|
+
for r in rubrics.rubrics:
|
|
96
|
+
print(r.name, r.description, r.rubric_version)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### `client.version()`
|
|
100
|
+
|
|
101
|
+
Return server + wire-protocol version. Match your `pip install` version to `version`; check `wire_version` for compatibility.
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
v = client.version()
|
|
105
|
+
assert v.version.startswith("0.21")
|
|
106
|
+
assert v.wire_version == "1.0.0"
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Defining a custom rubric
|
|
110
|
+
|
|
111
|
+
Built-in `anti-slop` is tuned for technical-buyer audiences. For different scoring, pass a `Rubric` inline:
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from agent_eval_rpc import Client, Rubric, RubricDimension, FailureMode
|
|
115
|
+
|
|
116
|
+
rubric = Rubric(
|
|
117
|
+
name="my-rubric",
|
|
118
|
+
description="Does this commit message explain WHY, not just what?",
|
|
119
|
+
systemPrompt="You score commit messages. Score 0..1 on whether the WHY is clear...",
|
|
120
|
+
dimensions=[
|
|
121
|
+
RubricDimension(id="explains_why", description="Does the message say *why*?", weight=1.0),
|
|
122
|
+
],
|
|
123
|
+
failureModes=[
|
|
124
|
+
FailureMode(id="what-not-why", description="States the change but not the reason"),
|
|
125
|
+
],
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
result = client.judge(content="bumped the version", rubric=rubric)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Errors
|
|
132
|
+
|
|
133
|
+
| Exception | When |
|
|
134
|
+
|---|---|
|
|
135
|
+
| `ValidationError` | Server (or pydantic) rejected the request as malformed. Fix your inputs. |
|
|
136
|
+
| `RubricNotFoundError` | Unknown `rubric_name`. Call `list_rubrics()` to see what's registered. |
|
|
137
|
+
| `TransportError` | HTTP unreachable or subprocess failed. Retry or check the server. |
|
|
138
|
+
| `AgentEvalError` | Base class — catches everything above. |
|
|
139
|
+
|
|
140
|
+
All errors carry `.code` and `.details` (the structured payload from the server).
|
|
141
|
+
|
|
142
|
+
## Versioning
|
|
143
|
+
|
|
144
|
+
This package is **version-locked** to the npm package. `agent-eval-rpc==0.21.0` ↔ `@tangle-network/agent-eval@0.21.0`. CI verifies the npm package, Python package, runtime `__version__`, and release tag all agree before publish. If one registry publish fails after the other succeeds, retry the failed publish from the same tag or supersede with the next patch release.
|
|
145
|
+
|
|
146
|
+
`wire_version` is separate. It bumps only on breaking schema changes. Package versions can differ across releases as long as `wire_version` is the same.
|
|
147
|
+
|
|
148
|
+
## Development
|
|
149
|
+
|
|
150
|
+
```sh
|
|
151
|
+
# install in editable mode
|
|
152
|
+
pip install -e ".[dev]"
|
|
153
|
+
|
|
154
|
+
# unit tests (no Node required)
|
|
155
|
+
pytest tests/test_models.py
|
|
156
|
+
|
|
157
|
+
# integration tests against the bundled CLI
|
|
158
|
+
cd ../.. && pnpm build # build the agent-eval CLI in repo root
|
|
159
|
+
cd clients/python && pytest # runs subprocess tests against dist/cli.js
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## Adding a new method
|
|
163
|
+
|
|
164
|
+
When the TS side adds a new endpoint (say `evaluateScenario`):
|
|
165
|
+
1. Update `src/wire/schemas.ts` with `EvaluateScenarioRequestSchema` and `EvaluateScenarioResponseSchema`.
|
|
166
|
+
2. Add a handler in `src/wire/handlers.ts`, route in `src/wire/server.ts`, and case in `src/wire/rpc.ts`.
|
|
167
|
+
3. In this client, add the matching pydantic model in `models.py` and method on `Client`. The pattern is mechanical — copy the shape from `judge`.
|
|
168
|
+
4. Test in both languages. Bump versions together.
|
|
169
|
+
|
|
170
|
+
A future iteration moves step 3 to `datamodel-code-generator -i openapi.json` so it's mechanical-and-automatic instead of mechanical-by-hand. Until the surface grows past ~10 endpoints, hand-written models are more readable.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.21"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "agent-eval-rpc"
|
|
7
|
+
version = "0.21.0"
|
|
8
|
+
description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Tangle Network" }]
|
|
13
|
+
keywords = ["evaluation", "llm", "rubric", "agent", "judge"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Topic :: Software Development :: Quality Assurance",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"httpx>=0.27",
|
|
27
|
+
"pydantic>=2.6",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
dev = [
|
|
32
|
+
"pytest>=8.0",
|
|
33
|
+
"pytest-asyncio>=0.23",
|
|
34
|
+
"ruff>=0.6",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Homepage = "https://github.com/tangle-network/agent-eval"
|
|
39
|
+
Issues = "https://github.com/tangle-network/agent-eval/issues"
|
|
40
|
+
Documentation = "https://github.com/tangle-network/agent-eval/blob/main/clients/python/README.md"
|
|
41
|
+
|
|
42
|
+
[tool.hatch.build.targets.wheel]
|
|
43
|
+
packages = ["src/agent_eval_rpc"]
|
|
44
|
+
|
|
45
|
+
[tool.ruff]
|
|
46
|
+
line-length = 100
|
|
47
|
+
target-version = "py310"
|
|
48
|
+
|
|
49
|
+
[tool.ruff.lint]
|
|
50
|
+
select = ["E", "F", "W", "I", "B", "UP"]
|
|
51
|
+
|
|
52
|
+
[tool.pytest.ini_options]
|
|
53
|
+
testpaths = ["tests"]
|
|
54
|
+
addopts = "-ra -q"
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""agent-eval-rpc — Python RPC client for @tangle-network/agent-eval.

The TypeScript package is the source of truth for evaluation logic. This
client is a thin transport adapter — every judgement runs in the Node
runtime, marshalled over HTTP or stdio RPC. Two languages, one
implementation.

The package distributes as ``agent-eval-rpc`` on PyPI and imports as
``agent_eval_rpc`` to make the wire-client nature explicit; the rubric
logic lives upstream in ``@tangle-network/agent-eval`` on npm.

Quickstart
----------

    from agent_eval_rpc import Client

    client = Client()  # auto-detects HTTP server, falls back to subprocess
    result = client.judge(content="our scaffold supports zero-copy IO", rubric_name="anti-slop")
    print(result.composite, result.failure_modes)

Or with an inline rubric definition instead of a built-in rubric name:

    result = client.judge(content="…", rubric={"name": "custom", ...})

See README.md for the full guide.
"""

from importlib.metadata import PackageNotFoundError, version

from .client import Client
from .errors import (
    AgentEvalError,
    RubricNotFoundError,
    TransportError,
    ValidationError,
)
from .models import (
    FailureMode,
    JudgeRequest,
    JudgeResult,
    ListRubricsResponse,
    Rubric,
    RubricDimension,
    RubricInfo,
    VersionResponse,
)

try:
    # Installed distribution: read the version from package metadata so it
    # can never drift from what was actually published.
    __version__ = version("agent-eval-rpc")
except PackageNotFoundError:
    # Running from a source tree without installed dist metadata — fall back
    # to the version pinned in pyproject.toml.
    __version__ = "0.21.0"

__all__ = [
    "Client",
    "AgentEvalError",
    "TransportError",
    "RubricNotFoundError",
    "ValidationError",
    "JudgeRequest",
    "JudgeResult",
    "Rubric",
    "RubricDimension",
    "FailureMode",
    "RubricInfo",
    "ListRubricsResponse",
    "VersionResponse",
]
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""Client — the public entry point.
|
|
2
|
+
|
|
3
|
+
Two transports, one API:
|
|
4
|
+
|
|
5
|
+
- HTTP (default if reachable): talks to a running `agent-eval serve`.
|
|
6
|
+
Best for live agent paths and high-frequency calls.
|
|
7
|
+
- Subprocess (fallback / explicit): shells out to `agent-eval rpc <method>`.
|
|
8
|
+
Best for batch / cron — no service to manage.
|
|
9
|
+
|
|
10
|
+
Auto-detection: if `base_url` reaches a running server in `auto_probe_timeout`
|
|
11
|
+
seconds, HTTP wins. Otherwise the client falls back to subprocess. Force one
|
|
12
|
+
transport with `transport="http"` or `transport="subprocess"`.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import json
|
|
18
|
+
import os
|
|
19
|
+
import shutil
|
|
20
|
+
import subprocess
|
|
21
|
+
from typing import Any, Literal
|
|
22
|
+
|
|
23
|
+
import httpx
|
|
24
|
+
|
|
25
|
+
from .errors import AgentEvalError, TransportError, from_error_body
|
|
26
|
+
from .models import (
|
|
27
|
+
JudgeRequest,
|
|
28
|
+
JudgeResult,
|
|
29
|
+
ListRubricsResponse,
|
|
30
|
+
Rubric,
|
|
31
|
+
VersionResponse,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# Transport selector accepted by Client(); "auto" probes HTTP first and
# falls back to the subprocess CLI.
Transport = Literal["http", "subprocess", "auto"]

# Default server address — matches `agent-eval serve --port 5005` from the README.
DEFAULT_BASE_URL = "http://127.0.0.1:5005"
# Binary name used by the subprocess transport when AGENT_EVAL_CLI is unset.
DEFAULT_CLI = "agent-eval"
# Per-call timeout; judges are allowed up to ~60s server-side, so 120s sits
# comfortably above that budget.
DEFAULT_TIMEOUT_S = 120.0
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class Client:
    """Synchronous client for agent-eval.

    Parameters
    ----------
    base_url:
        Where to find the HTTP server. Defaults to AGENT_EVAL_URL env var
        or http://127.0.0.1:5005.
    cli_path:
        Name or absolute path of the `agent-eval` binary used by the
        subprocess transport. Defaults to AGENT_EVAL_CLI or 'agent-eval'.
    transport:
        'auto' (default), 'http', or 'subprocess'.
    timeout_s:
        Per-call timeout, default 120 seconds. Judges are allowed up to
        ~60s server-side, so 120s is comfortably above that.
    auto_probe_timeout:
        How long to wait for the HTTP /healthz check during auto-detect.
    """

    def __init__(
        self,
        base_url: str | None = None,
        *,
        cli_path: str | None = None,
        transport: Transport = "auto",
        timeout_s: float = DEFAULT_TIMEOUT_S,
        auto_probe_timeout: float = 1.0,
    ) -> None:
        self.base_url = (base_url or os.environ.get("AGENT_EVAL_URL") or DEFAULT_BASE_URL).rstrip("/")
        self.cli_path = cli_path or os.environ.get("AGENT_EVAL_CLI") or DEFAULT_CLI
        self.timeout_s = timeout_s
        self._transport = self._resolve_transport(transport, auto_probe_timeout)

    # ── Public methods ──────────────────────────────────────────────

    def judge(
        self,
        *,
        content: str,
        rubric_name: str | None = None,
        rubric: Rubric | dict[str, Any] | None = None,
        context: dict[str, Any] | None = None,
        model: str | None = None,
    ) -> JudgeResult:
        """Score `content` against a rubric and return a typed result."""
        # Validate locally so the user sees a Python-side error before the
        # transport even fires. The server validates again as defense in depth.
        rubric_value: Rubric | None
        if isinstance(rubric, dict):
            rubric_value = Rubric.model_validate(rubric)
        else:
            rubric_value = rubric
        request = JudgeRequest(
            rubric_name=rubric_name,
            rubric=rubric_value,
            content=content,
            context=context,
            model=model,
        )
        body = self._call("judge", request.model_dump(by_alias=True, exclude_none=True))
        return JudgeResult.model_validate(body)

    def list_rubrics(self) -> ListRubricsResponse:
        """Return every rubric the server has registered."""
        body = self._call("listRubrics", {})
        return ListRubricsResponse.model_validate(body)

    def version(self) -> VersionResponse:
        """Return server + wire-protocol version information."""
        body = self._call("version", {})
        return VersionResponse.model_validate(body)

    @property
    def transport(self) -> Literal["http", "subprocess"]:
        """The transport actually selected after auto-detection."""
        return self._transport

    # ── Transport dispatch ──────────────────────────────────────────

    def _resolve_transport(
        self, requested: Transport, probe_timeout: float
    ) -> Literal["http", "subprocess"]:
        """Pick the concrete transport, probing /healthz in 'auto' mode."""
        if requested == "http":
            return "http"
        if requested == "subprocess":
            return "subprocess"
        # auto: probe HTTP first
        try:
            with httpx.Client(timeout=probe_timeout) as c:
                r = c.get(f"{self.base_url}/healthz")
                if r.status_code == 200:
                    return "http"
        except (httpx.HTTPError, OSError):
            pass
        if shutil.which(self.cli_path) is None:
            raise TransportError(
                f"No HTTP server at {self.base_url} and no `{self.cli_path}` binary on PATH. "
                "Either run `agent-eval serve` or `npm i -g @tangle-network/agent-eval`."
            )
        return "subprocess"

    def _call(self, method: str, params: dict[str, Any]) -> Any:
        """Dispatch an RPC call over the selected transport."""
        if self._transport == "http":
            return self._http_call(method, params)
        return self._subprocess_call(method, params)

    def _http_call(self, method: str, params: dict[str, Any]) -> Any:
        """Invoke `method` over HTTP; raises the mapped server error on 4xx/5xx."""
        path = _http_path_for(method)
        try:
            with httpx.Client(timeout=self.timeout_s, base_url=self.base_url) as c:
                if path.method == "GET":
                    r = c.get(path.url)
                else:
                    r = c.post(path.url, json=params)
        except httpx.HTTPError as e:
            raise TransportError(f"HTTP transport failed: {e}") from e
        if r.status_code >= 400:
            # Decode the body first, then raise the mapped exception OUTSIDE
            # the try — otherwise a ValueError-derived mapped error would be
            # masked as a TransportError. json.JSONDecodeError is a ValueError
            # subclass, so catching ValueError alone covers both.
            try:
                payload = r.json()
            except ValueError as e:
                raise TransportError(f"HTTP {r.status_code}: {r.text[:500]}") from e
            raise from_error_body(r.status_code, payload)
        try:
            return r.json()
        except ValueError as e:
            raise TransportError(f"Server returned non-JSON body: {e}") from e

    def _subprocess_call(self, method: str, params: dict[str, Any]) -> Any:
        """Invoke `method` via `agent-eval rpc`, parsing the JSON envelope on stdout."""
        try:
            proc = subprocess.run(
                [self.cli_path, "rpc", method],
                input=json.dumps(params),
                capture_output=True,
                text=True,
                timeout=self.timeout_s,
                check=False,
            )
        except (FileNotFoundError, subprocess.TimeoutExpired) as e:
            raise TransportError(f"Subprocess transport failed: {e}") from e
        # Strip first so whitespace-only stdout is treated as "no output"
        # instead of crashing on splitlines()[-1].
        stdout = proc.stdout.strip() if proc.stdout else ""
        if not stdout:
            raise TransportError(
                f"agent-eval rpc {method} produced no output. stderr: {proc.stderr[:500]}"
            )
        try:
            # The envelope is the final stdout line; taking [-1] tolerates any
            # incidental logging printed before it.
            envelope = json.loads(stdout.splitlines()[-1])
        except json.JSONDecodeError as e:
            raise TransportError(f"agent-eval rpc returned non-JSON: {proc.stdout[:500]}") from e
        if not isinstance(envelope, dict):
            # A bare JSON scalar/array is not a valid envelope; checking here
            # avoids a TypeError from `"error" in envelope` below.
            raise TransportError(f"Malformed RPC envelope: {envelope}")
        if "error" in envelope:
            # Map to the right exception class — same as HTTP path.
            raise from_error_body(proc.returncode or 500, envelope)
        if "result" not in envelope:
            raise TransportError(f"Malformed RPC envelope: {envelope}")
        return envelope["result"]
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
# ── Method → HTTP path mapping ──────────────────────────────────────
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class _HttpPath:
|
|
197
|
+
__slots__ = ("method", "url")
|
|
198
|
+
|
|
199
|
+
def __init__(self, method: str, url: str) -> None:
|
|
200
|
+
self.method = method
|
|
201
|
+
self.url = url
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
_PATHS = {
|
|
205
|
+
"judge": _HttpPath("POST", "/v1/judge"),
|
|
206
|
+
"listRubrics": _HttpPath("GET", "/v1/rubrics"),
|
|
207
|
+
"version": _HttpPath("GET", "/v1/version"),
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _http_path_for(method: str) -> _HttpPath:
|
|
212
|
+
try:
|
|
213
|
+
return _PATHS[method]
|
|
214
|
+
except KeyError as e:
|
|
215
|
+
raise AgentEvalError(f"Unknown method: {method}") from e
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Exception hierarchy.
|
|
2
|
+
|
|
3
|
+
All errors raised by this client subclass `AgentEvalError`. Catch the
|
|
4
|
+
specific ones (`RubricNotFoundError`, `ValidationError`) for cases that
|
|
5
|
+
are fixable in caller code; let `TransportError` bubble or retry it.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AgentEvalError(Exception):
|
|
12
|
+
"""Base class for every error raised by this client."""
|
|
13
|
+
|
|
14
|
+
def __init__(self, message: str, *, code: str | None = None, details: object = None) -> None:
|
|
15
|
+
super().__init__(message)
|
|
16
|
+
self.code = code
|
|
17
|
+
self.details = details
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class TransportError(AgentEvalError):
    """The wire itself failed — not the server saying "no".

    Covers connection/HTTP failures, subprocess launch or timeout failures,
    and unparseable response bodies. Server-side rejections (4xx with a
    typed body) map to the other subclasses instead.
    """
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ValidationError(AgentEvalError):
    """Raised when the server deems the request malformed (HTTP 400, code='validation_error')."""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class RubricNotFoundError(AgentEvalError):
    """Raised when no rubric with the requested name is registered (HTTP 404, code='rubric_not_found')."""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def from_error_body(status: int, body: object) -> AgentEvalError:
    """Map a server error envelope to the right exception class.

    Args:
        status: HTTP status (or subprocess exit code) used for context in the
            generic fallback message.
        body: parsed response body. Expected shape is
            ``{"error": {"code": ..., "message": ..., "details": ...}}``;
            anything else degrades gracefully to a generic ``AgentEvalError``.
    """
    code = None
    message = "Unknown error"
    details = None
    if isinstance(body, dict):
        err = body.get("error")
        if isinstance(err, dict):
            code = err.get("code")
            # `or` rather than a .get() default so an explicit null/empty
            # message in the envelope still falls back to "Unknown error"
            # instead of producing str(None).
            message = err.get("message") or message
            details = err.get("details")
    if code == "rubric_not_found":
        return RubricNotFoundError(message, code=code, details=details)
    if code == "validation_error":
        return ValidationError(message, code=code, details=details)
    return AgentEvalError(f"HTTP {status}: {message}", code=code, details=details)
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Data models that mirror the wire-protocol Zod schemas.
|
|
2
|
+
|
|
3
|
+
These pydantic models are kept in sync by hand for now — the surface is
|
|
4
|
+
small (six classes). When the wire surface grows past ~10 endpoints,
|
|
5
|
+
swap this file for `datamodel-code-generator -i openapi.json -o models.py`.
|
|
6
|
+
|
|
7
|
+
Every field name and type matches `src/wire/schemas.ts` exactly. If you
|
|
8
|
+
change one without changing the other, the dual-publish CI will fail.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class _StrictModel(BaseModel):
    """Reject unknown fields — drift between TS and Python should fail loudly."""

    # extra="forbid" makes pydantic raise on any field not declared on the
    # subclass, so a TS-side schema addition surfaces as a hard error here.
    model_config = ConfigDict(extra="forbid")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class RubricDimension(_StrictModel):
    """A scoring axis within a rubric.

    Composite scores combine each dimension by `weight`. The `min`/`max`
    bounds are used to normalize raw scores into 0..1 before weighting.
    """

    id: str = Field(..., description="Stable id like 'buyer_quality'.")
    description: str = Field(..., description="One-line plain-English meaning.")
    weight: float = Field(1.0, ge=0, description="Relative weight in composite. 0 disables.")
    # NOTE(review): nothing enforces max > min here — presumably the TS Zod
    # schema doesn't either; confirm before adding a validator, since
    # acceptance must stay in lockstep with the wire schema.
    min: float = 0.0
    max: float = 1.0
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class FailureMode(_StrictModel):
    """A negative pattern the judge looks for. Detected ones appear in result.failure_modes."""

    # Stable identifier; JudgeResult.failure_modes lists ids of detected modes.
    id: str
    description: str
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class Rubric(_StrictModel):
    """A complete rubric — what's being scored and how.

    Pass this inline to `Client.judge(rubric=...)` or register a built-in
    rubric server-side and use `Client.judge(rubric_name=...)`.
    """

    name: str
    description: str
    # Wire format is camelCase (systemPrompt); Python callers use snake_case.
    system_prompt: str = Field(..., alias="systemPrompt")
    dimensions: list[RubricDimension]
    failure_modes: list[FailureMode] = Field(default_factory=list, alias="failureModes")
    # `wins` reuses the FailureMode shape (id + description) for positive
    # patterns — presumably intentional mirroring of schemas.ts; confirm there.
    wins: list[FailureMode] = Field(default_factory=list)

    # populate_by_name=True lets construction use either the snake_case field
    # names or the camelCase wire aliases; extra="forbid" restated alongside it.
    model_config = ConfigDict(extra="forbid", populate_by_name=True)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class JudgeRequest(_StrictModel):
    """Input to /v1/judge. Provide either rubric_name or rubric (not both)."""

    # Name of a server-registered rubric — mutually exclusive with `rubric`.
    rubric_name: str | None = Field(None, alias="rubricName")
    # Inline rubric definition — mutually exclusive with `rubric_name`.
    rubric: Rubric | None = None
    # min_length=1 rejects empty content client-side, before any transport fires.
    content: str = Field(..., min_length=1, description="The text being judged.")
    context: dict[str, Any] | None = Field(
        None,
        description="Free-form metadata surfaced to the judging LLM.",
    )
    model: str | None = Field(None, description="Override the judge model.")

    model_config = ConfigDict(extra="forbid", populate_by_name=True)

    @model_validator(mode="after")
    def _exactly_one_rubric(self) -> JudgeRequest:
        # XOR via equality: the check is true both when neither is set and
        # when both are — exactly the two invalid combinations.
        if (self.rubric_name is None) == (self.rubric is None):
            raise ValueError("Provide exactly one of `rubric_name` or `rubric`.")
        return self
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class JudgeResult(_StrictModel):
    """Output of /v1/judge. The `composite` is the 0..1 score to gate on."""

    composite: float = Field(..., ge=0, le=1)
    # Per-dimension scores — presumably keyed by RubricDimension.id; confirm
    # against schemas.ts.
    dimensions: dict[str, float]
    # Ids of failure modes / wins the judge detected in the content.
    failure_modes: list[str] = Field(default_factory=list, alias="failureModes")
    wins: list[str] = Field(default_factory=list)
    rationale: str
    rubric_version: str = Field(..., alias="rubricVersion")
    # The judge model actually used (after any request-level override).
    model: str
    duration_ms: int = Field(..., alias="durationMs")

    model_config = ConfigDict(extra="forbid", populate_by_name=True)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class RubricInfo(_StrictModel):
    """One entry in /v1/rubrics."""

    name: str
    description: str
    # Loosely typed (dicts, not RubricDimension) — NOTE(review): presumably
    # the listing endpoint returns a reduced shape; confirm against the server.
    dimensions: list[dict[str, Any]]
    failure_modes: list[str] = Field(default_factory=list, alias="failureModes")
    rubric_version: str = Field(..., alias="rubricVersion")

    model_config = ConfigDict(extra="forbid", populate_by_name=True)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class ListRubricsResponse(_StrictModel):
    """Response from /v1/rubrics."""

    # One RubricInfo per rubric the server exposes.
    rubrics: list[RubricInfo]
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class VersionResponse(_StrictModel):
    """Response from /v1/version. Match `version` to your installed pip package."""

    package: str
    version: str
    # Wire-protocol version string — presumably versioned independently of
    # the package; confirm against the server's version endpoint docs.
    wire_version: str = Field(..., alias="wireVersion")
    # Names of the RPC methods this server supports (e.g. "judge").
    api_surface: list[str] = Field(..., alias="apiSurface")

    model_config = ConfigDict(extra="forbid", populate_by_name=True)
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Schema mirror tests — defend against TS/Python drift.
|
|
2
|
+
|
|
3
|
+
Each test names the regression it would catch. The invariant: anything
|
|
4
|
+
the TypeScript JudgeRequest accepts/rejects, the Python JudgeRequest
|
|
5
|
+
must accept/reject the same way.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import pytest
|
|
10
|
+
|
|
11
|
+
from agent_eval_rpc.models import JudgeRequest, Rubric, RubricDimension
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Smallest rubric the schema accepts — one dimension, constructed via the
# camelCase wire alias (systemPrompt) to mirror the wire shape.
MIN_RUBRIC = Rubric(
    name="r",
    description="d",
    systemPrompt="p",
    dimensions=[RubricDimension(id="a", description="b")],
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_judge_request_accepts_rubric_name_alone() -> None:
    """A registered-rubric reference by itself is a valid request."""
    req = JudgeRequest(rubric_name="anti-slop", content="hello")
    assert req.rubric is None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_judge_request_accepts_inline_rubric_alone() -> None:
    """An inline rubric by itself is a valid request."""
    req = JudgeRequest(rubric=MIN_RUBRIC, content="hello")
    assert req.rubric_name is None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_judge_request_rejects_both_rubric_name_and_rubric() -> None:
    """Regression: ambiguous selection — server must not have to choose."""
    with pytest.raises(ValueError, match="exactly one"):
        JudgeRequest(
            rubric_name="anti-slop",
            rubric=MIN_RUBRIC,
            content="hello",
        )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_judge_request_rejects_neither_rubric_name_nor_rubric() -> None:
    """Regression: silently dispatching to default rubric hides bugs."""
    request_kwargs = {"content": "hello"}
    with pytest.raises(ValueError, match="exactly one"):
        JudgeRequest(**request_kwargs)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_judge_request_rejects_empty_content() -> None:
    """Regression: empty content scored high because LLMs are agreeable."""
    with pytest.raises(ValueError):
        JudgeRequest(content="", rubric_name="anti-slop")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_rubric_dimension_defaults() -> None:
    """Unspecified weight/min/max fall back to 1.0 / 0.0 / 1.0."""
    dim = RubricDimension(id="x", description="y")
    assert (dim.weight, dim.min, dim.max) == (1.0, 0.0, 1.0)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_rubric_round_trip_preserves_camelCase_aliases() -> None:
    """Wire format uses systemPrompt/failureModes; Python uses snake_case.
    Round-trip via .model_dump(by_alias=True) must preserve the wire shape."""
    rubric = Rubric(
        name="r",
        description="d",
        systemPrompt="p",
        dimensions=[RubricDimension(id="a", description="b")],
    )
    payload = rubric.model_dump(by_alias=True)
    for wire_key in ("systemPrompt", "failureModes"):
        assert wire_key in payload
    Rubric.model_validate(payload)  # accepts its own output
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Integration tests against the real `agent-eval rpc` binary.
|
|
2
|
+
|
|
3
|
+
These run end-to-end against the bundled CLI in this repo's `dist/`.
|
|
4
|
+
We exercise every method that doesn't need a live LLM:
|
|
5
|
+
- version
|
|
6
|
+
- listRubrics
|
|
7
|
+
- judge with no rubric (validation error path)
|
|
8
|
+
- judge with bad rubric_name (rubric_not_found path)
|
|
9
|
+
|
|
10
|
+
Live judge calls (which DO hit an LLM) live in test_live_judge.py and
|
|
11
|
+
are gated by the AGENT_EVAL_LIVE=1 env var.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import shutil
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
import pytest
|
|
19
|
+
|
|
20
|
+
from agent_eval_rpc import Client, RubricNotFoundError, ValidationError
|
|
21
|
+
|
|
22
|
+
# tests/ → python/ → clients/ → repo root, hence parents[3].
REPO_ROOT = Path(__file__).resolve().parents[3]
CLI_DIST = REPO_ROOT / "dist" / "cli.js"

# Skip the whole module when the bundled CLI hasn't been built yet.
pytestmark = pytest.mark.skipif(
    not CLI_DIST.exists(),
    reason="run `pnpm build` in agent-eval root before these tests",
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _client() -> Client:
    """Subprocess client that invokes the bundled CLI directly via node."""

    class _NodeWrappedClient(Client):
        # Replaces the stock subprocess transport: the dist bundle is a .js
        # file, so it must be launched through `node` rather than executed
        # directly as the configured cli_path.
        def _subprocess_call(self, method: str, params):  # type: ignore[override]
            import json
            import subprocess

            proc = subprocess.run(
                ["node", str(CLI_DIST), "rpc", method],
                input=json.dumps(params),
                capture_output=True,
                text=True,
                timeout=self.timeout_s,
                check=False,
            )
            if not proc.stdout:
                raise RuntimeError(f"no stdout. stderr: {proc.stderr}")
            # Last stdout line carries the RPC envelope; earlier lines may be logs.
            envelope = json.loads(proc.stdout.strip().splitlines()[-1])
            if "error" in envelope:
                # Same typed-error mapping the production client uses.
                from agent_eval_rpc.errors import from_error_body
                raise from_error_body(proc.returncode or 500, envelope)
            return envelope["result"]

    c = _NodeWrappedClient(transport="subprocess")
    # Override the resolved transport to bypass shutil.which check
    c._transport = "subprocess"  # type: ignore[assignment]
    return c
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def test_version_via_subprocess() -> None:
    """The version RPC answers with the expected package identity."""
    info = _client().version()
    assert info.package == "@tangle-network/agent-eval"
    assert info.version
    assert "judge" in info.api_surface
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def test_list_rubrics_includes_anti_slop() -> None:
    """The built-in 'anti-slop' rubric is always listed."""
    listing = _client().list_rubrics()
    assert "anti-slop" in {entry.name for entry in listing.rubrics}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def test_judge_unknown_rubric_name_raises_RubricNotFoundError() -> None:
    """Regression: server returns 404; client must raise the typed error, not bubble TransportError."""
    client = _client()
    with pytest.raises(RubricNotFoundError):
        client.judge(content="hello world", rubric_name="no-such-rubric-xyz")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def test_judge_empty_content_raises_ValidationError() -> None:
    """Regression: pydantic should catch this before subprocess fires."""
    client = _client()
    with pytest.raises((ValidationError, ValueError)):
        client.judge(content="", rubric_name="anti-slop")
|