agent-eval-rpc-0.21.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
+ .venv/
+ __pycache__/
+ *.egg-info/
+ .pytest_cache/
+ .ruff_cache/
+ build/
+ dist/
+ *.pyc
@@ -0,0 +1,198 @@
+ Metadata-Version: 2.4
+ Name: agent-eval-rpc
+ Version: 0.21.0
+ Summary: Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client.
+ Project-URL: Homepage, https://github.com/tangle-network/agent-eval
+ Project-URL: Issues, https://github.com/tangle-network/agent-eval/issues
+ Project-URL: Documentation, https://github.com/tangle-network/agent-eval/blob/main/clients/python/README.md
+ Author: Tangle Network
+ License: MIT
+ Keywords: agent,evaluation,judge,llm,rubric
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Topic :: Software Development :: Quality Assurance
+ Requires-Python: >=3.10
+ Requires-Dist: httpx>=0.27
+ Requires-Dist: pydantic>=2.6
+ Provides-Extra: dev
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
+ Requires-Dist: pytest>=8.0; extra == 'dev'
+ Requires-Dist: ruff>=0.6; extra == 'dev'
+ Description-Content-Type: text/markdown
+
+ # agent-eval-rpc — Python client
+
+ Python client for [`@tangle-network/agent-eval`](https://github.com/tangle-network/agent-eval) — a content/code judging framework written in TypeScript. This package is a **thin transport adapter**: every judgement runs in the Node runtime, marshalled over HTTP or stdio RPC. Two languages, one implementation. No drift.
+
+ ## What you get
+
+ A function-call interface to score any string against a rubric:
+
+ ```python
+ from agent_eval_rpc import Client
+
+ client = Client()  # auto-detects HTTP server, falls back to subprocess
+ result = client.judge(
+     content="We just launched zero-copy IO between agents and their workdir",
+     rubric_name="anti-slop",
+ )
+
+ print(result.composite)      # 0.0..1.0 — single number to gate on
+ print(result.dimensions)     # {"buyer_quality": 0.7, "voice": 0.8, "signal": 0.9}
+ print(result.failure_modes)  # [] or ["ai-cadence", "marketing-tone", ...]
+ print(result.wins)           # ["specific-component", "earned-detail", ...]
+ print(result.rationale)      # "The post names a real architectural detail..."
+ ```
+
+ That's the entire surface for content judging.
+
+ ## Install
+
+ ```sh
+ cd clients/python
+ pip install -e .
+ ```
+
+ To use it, you also need **one of**:
+
+ - `npm install -g @tangle-network/agent-eval` — gives you the `agent-eval` binary, used by the subprocess transport (works offline; slower per call due to ~500 ms of Node startup).
+ - A running server: `agent-eval serve --port 5005` — gives you HTTP transport (~10 ms per call once it's up).
+
+ The Python client picks whichever is available. Force one with `Client(transport="http")` or `Client(transport="subprocess")`.
+
+ ## Why the architecture works this way
+
+ The TypeScript package is the source of truth for evaluation logic. We don't reimplement rubrics, scoring, or judges in Python — we marshal JSON to the canonical runtime over a versioned wire protocol (defined as Zod schemas, exported as OpenAPI, mirrored in this package as pydantic models).
+
+ Adding a new method to the API means: define a Zod schema in `src/wire/schemas.ts`, write the handler in `src/wire/handlers.ts`, and mirror the schema in this package's pydantic models (see *Adding a new method* below). **There is no separate Python implementation of the eval logic to maintain.**
+
+ This is the same pattern as the Anthropic SDK, Stripe SDK, and gRPC: one canonical implementation, language-specific transport clients.
+
+ ## API
+
+ ### `Client`
+
+ ```python
+ Client(
+     base_url: str | None = None,  # AGENT_EVAL_URL or http://127.0.0.1:5005
+     cli_path: str | None = None,  # AGENT_EVAL_CLI or 'agent-eval'
+     transport: Literal["auto", "http", "subprocess"] = "auto",
+     timeout_s: float = 120.0,
+ )
+ ```
+
+ ### `client.judge(...)`
+
+ Score a piece of content against a rubric.
+
+ ```python
+ def judge(
+     *,
+     content: str,                         # the text being judged
+     rubric_name: str | None = None,       # OR
+     rubric: Rubric | dict | None = None,  # an inline rubric definition
+     context: dict | None = None,          # free-form metadata for the judge
+     model: str | None = None,             # override the judge LLM
+ ) -> JudgeResult
+ ```
+
+ **Either** `rubric_name` (use a built-in like `"anti-slop"`) **or** `rubric` (an inline definition with your own dimensions/prompt). Not both.
+
+ **Returns** `JudgeResult`:
+ - `composite: float` — weighted score in 0..1. The single number to gate on.
+ - `dimensions: dict[str, float]` — per-axis scores (e.g. `{"buyer_quality": 0.7}`).
+ - `failure_modes: list[str]` — ids of negative patterns detected.
+ - `wins: list[str]` — ids of positive patterns detected.
+ - `rationale: str` — plain-English explanation.
+ - `rubric_version: str` — stable hash of the rubric used. Compare scores only when this matches.
+ - `model: str` — LLM that produced the judgement.
+ - `duration_ms: int` — wall-clock latency.
+
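+ For example, a minimal sketch of a pass/fail gate built on these fields —
+ `draft_text`, the 0.7 threshold, and the failure-mode id are illustrative:
+
+ ```python
+ result = client.judge(content=draft_text, rubric_name="anti-slop")
+ if result.composite < 0.7 or "marketing-tone" in result.failure_modes:
+     raise SystemExit(f"Rejected ({result.composite:.2f}): {result.rationale}")
+ ```
+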
+ ### `client.list_rubrics()`
+
+ Return every rubric the server has registered, with their dimensions and stable `rubric_version`.
+
+ ```python
+ rubrics = client.list_rubrics()
+ for r in rubrics.rubrics:
+     print(r.name, r.description, r.rubric_version)
+ ```
+
+ ### `client.version()`
+
+ Return server + wire-protocol version. Match your `pip install` version to `version`; check `wire_version` for compatibility.
+
+ ```python
+ v = client.version()
+ assert v.version.startswith("0.21")
+ assert v.wire_version == "1.0.0"
+ ```
+
+ ## Defining a custom rubric
+
+ The built-in `anti-slop` rubric is tuned for technical-buyer audiences. For different scoring, pass a `Rubric` inline:
+
+ ```python
+ from agent_eval_rpc import Client, Rubric, RubricDimension, FailureMode
+
+ rubric = Rubric(
+     name="my-rubric",
+     description="Does this commit message explain WHY, not just what?",
+     systemPrompt="You score commit messages. Score 0..1 on whether the WHY is clear...",
+     dimensions=[
+         RubricDimension(id="explains_why", description="Does the message say *why*?", weight=1.0),
+     ],
+     failureModes=[
+         FailureMode(id="what-not-why", description="States the change but not the reason"),
+     ],
+ )
+
+ result = client.judge(content="bumped the version", rubric=rubric)
+ ```
+
+ ## Errors
+
+ | Exception | When |
+ |---|---|
+ | `ValidationError` | Server (or pydantic) rejected the request as malformed. Fix your inputs. |
+ | `RubricNotFoundError` | Unknown `rubric_name`. Call `list_rubrics()` to see what's registered. |
+ | `TransportError` | HTTP unreachable or subprocess failed. Retry or check the server. |
+ | `AgentEvalError` | Base class — catches everything above. |
+
+ All errors carry `.code` and `.details` (the structured payload from the server).
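+
+ A sketch of the intended catch pattern (the rubric name is illustrative):
+
+ ```python
+ from agent_eval_rpc import Client, RubricNotFoundError, TransportError
+
+ client = Client()
+ try:
+     result = client.judge(content="draft post", rubric_name="anti-slop")
+ except RubricNotFoundError as e:
+     print("unknown rubric:", e.code, e.details)  # fixable in caller code
+ except TransportError:
+     raise  # or retry — the server may just be down
+ ```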
+
+ ## Versioning
+
+ This package is **version-locked** to the npm package: `agent-eval-rpc==0.21.0` ↔ `@tangle-network/agent-eval@0.21.0`. CI verifies that the npm package, Python package, runtime `__version__`, and release tag all agree before publish. If one registry publish fails after the other succeeds, retry the failed publish from the same tag or supersede with the next patch release.
+
+ `wire_version` is separate. It bumps only on breaking schema changes, so client and server package versions may differ across releases as long as `wire_version` matches.
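+
+ A sketch of a startup guard under that rule, assuming semver semantics for
+ `wire_version` (only the major component signals a breaking change):
+
+ ```python
+ v = client.version()
+ if v.wire_version.split(".")[0] != "1":
+     raise RuntimeError(f"incompatible wire protocol: {v.wire_version}")
+ ```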
+
+ ## Development
+
+ ```sh
+ # install in editable mode
+ pip install -e ".[dev]"
+
+ # unit tests (no Node required)
+ pytest tests/test_models.py
+
+ # integration tests against the bundled CLI
+ cd ../.. && pnpm build       # build the agent-eval CLI in the repo root
+ cd clients/python && pytest  # runs subprocess tests against dist/cli.js
+ ```
+
+ ## Adding a new method
+
+ When the TS side adds a new endpoint (say `evaluateScenario`):
+ 1. Update `src/wire/schemas.ts` with `EvaluateScenarioRequestSchema` and `EvaluateScenarioResponseSchema`.
+ 2. Add a handler in `src/wire/handlers.ts`, a route in `src/wire/server.ts`, and a case in `src/wire/rpc.ts`.
+ 3. In this client, add the matching pydantic model in `models.py` and a method on `Client`. The pattern is mechanical — copy the shape from `judge` (see the sketch below).
+ 4. Test in both languages. Bump versions together.
+
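+ A sketch of step 3 for the hypothetical `evaluateScenario` — every field name
+ and the URL path below are assumptions; mirror whatever `src/wire/schemas.ts`
+ actually defines:
+
+ ```python
+ # models.py — hypothetical mirrors of the new request/response schemas
+ class EvaluateScenarioRequest(_StrictModel):
+     scenario: str
+     model: str | None = None
+
+ class EvaluateScenarioResult(_StrictModel):
+     passed: bool
+     rationale: str
+
+ # client.py — add inside class Client; same dispatch shape as judge().
+ # Also register a _PATHS entry, e.g. "evaluateScenario": _HttpPath("POST", "/v1/evaluate-scenario").
+ def evaluate_scenario(self, *, scenario: str, model: str | None = None) -> EvaluateScenarioResult:
+     request = EvaluateScenarioRequest(scenario=scenario, model=model)
+     body = self._call("evaluateScenario", request.model_dump(exclude_none=True))
+     return EvaluateScenarioResult.model_validate(body)
+ ```
+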
+ A future iteration moves step 3 to `datamodel-code-generator -i openapi.json` so it's mechanical-and-automatic instead of mechanical-by-hand. Until the surface grows past ~10 endpoints, hand-written models are more readable.
@@ -0,0 +1,54 @@
+ [build-system]
+ requires = ["hatchling>=1.21"]
+ build-backend = "hatchling.build"
+
+ [project]
+ name = "agent-eval-rpc"
+ version = "0.21.0"
+ description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
+ readme = "README.md"
+ requires-python = ">=3.10"
+ license = { text = "MIT" }
+ authors = [{ name = "Tangle Network" }]
+ keywords = ["evaluation", "llm", "rubric", "agent", "judge"]
+ classifiers = [
+     "Development Status :: 4 - Beta",
+     "Intended Audience :: Developers",
+     "License :: OSI Approved :: MIT License",
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Programming Language :: Python :: 3.13",
+     "Topic :: Software Development :: Quality Assurance",
+ ]
+ dependencies = [
+     "httpx>=0.27",
+     "pydantic>=2.6",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "pytest>=8.0",
+     "pytest-asyncio>=0.23",
+     "ruff>=0.6",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/tangle-network/agent-eval"
+ Issues = "https://github.com/tangle-network/agent-eval/issues"
+ Documentation = "https://github.com/tangle-network/agent-eval/blob/main/clients/python/README.md"
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/agent_eval_rpc"]
+
+ [tool.ruff]
+ line-length = 100
+ target-version = "py310"
+
+ [tool.ruff.lint]
+ select = ["E", "F", "W", "I", "B", "UP"]
+
+ [tool.pytest.ini_options]
+ testpaths = ["tests"]
+ addopts = "-ra -q"
@@ -0,0 +1,67 @@
+ """agent-eval-rpc — Python RPC client for @tangle-network/agent-eval.
+
+ The TypeScript package is the source of truth for evaluation logic. This
+ client is a thin transport adapter — every judgement runs in the Node
+ runtime, marshalled over HTTP or stdio RPC. Two languages, one
+ implementation.
+
+ The package distributes as ``agent-eval-rpc`` on PyPI and imports as
+ ``agent_eval_rpc`` to make the wire-client nature explicit; the rubric
+ logic lives upstream in ``@tangle-network/agent-eval`` on npm.
+
+ Quickstart
+ ----------
+
+     from agent_eval_rpc import Client
+
+     client = Client()  # auto-detects HTTP server, falls back to subprocess
+     result = client.judge(content="our scaffold supports zero-copy IO", rubric_name="anti-slop")
+     print(result.composite, result.failure_modes)
+
+ Or with an inline rubric definition (works over either transport):
+
+     result = client.judge(content="…", rubric={"name": "custom", ...})
+
+ See README.md for the full guide.
+ """
+
+ from importlib.metadata import PackageNotFoundError, version
+
+ from .client import Client
+ from .errors import (
+     AgentEvalError,
+     RubricNotFoundError,
+     TransportError,
+     ValidationError,
+ )
+ from .models import (
+     FailureMode,
+     JudgeRequest,
+     JudgeResult,
+     ListRubricsResponse,
+     Rubric,
+     RubricDimension,
+     RubricInfo,
+     VersionResponse,
+ )
+
+ try:
+     __version__ = version("agent-eval-rpc")
+ except PackageNotFoundError:
+     __version__ = "0.21.0"
+
+ __all__ = [
+     "Client",
+     "AgentEvalError",
+     "TransportError",
+     "RubricNotFoundError",
+     "ValidationError",
+     "JudgeRequest",
+     "JudgeResult",
+     "Rubric",
+     "RubricDimension",
+     "FailureMode",
+     "RubricInfo",
+     "ListRubricsResponse",
+     "VersionResponse",
+ ]
@@ -0,0 +1,215 @@
+ """Client — the public entry point.
+
+ Two transports, one API:
+
+ - HTTP (default if reachable): talks to a running `agent-eval serve`.
+   Best for live agent paths and high-frequency calls.
+ - Subprocess (fallback / explicit): shells out to `agent-eval rpc <method>`.
+   Best for batch / cron — no service to manage.
+
+ Auto-detection: if `base_url` reaches a running server within
+ `auto_probe_timeout` seconds, HTTP wins. Otherwise the client falls back
+ to subprocess. Force one transport with `transport="http"` or
+ `transport="subprocess"`.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import shutil
+ import subprocess
+ from typing import Any, Literal
+
+ import httpx
+
+ from .errors import AgentEvalError, TransportError, from_error_body
+ from .models import (
+     JudgeRequest,
+     JudgeResult,
+     ListRubricsResponse,
+     Rubric,
+     VersionResponse,
+ )
+
+ Transport = Literal["http", "subprocess", "auto"]
+
+ DEFAULT_BASE_URL = "http://127.0.0.1:5005"
+ DEFAULT_CLI = "agent-eval"
+ DEFAULT_TIMEOUT_S = 120.0
+
+
+ class Client:
+     """Synchronous client for agent-eval.
+
+     Parameters
+     ----------
+     base_url:
+         Where to find the HTTP server. Defaults to the AGENT_EVAL_URL env
+         var or http://127.0.0.1:5005.
+     cli_path:
+         Name or absolute path of the `agent-eval` binary used by the
+         subprocess transport. Defaults to AGENT_EVAL_CLI or 'agent-eval'.
+     transport:
+         'auto' (default), 'http', or 'subprocess'.
+     timeout_s:
+         Per-call timeout, default 120 seconds. Judges are allowed up to
+         ~60s server-side, so 120s is comfortably above that.
+     auto_probe_timeout:
+         How long to wait for the HTTP /healthz check during auto-detect.
+     """
+
+     def __init__(
+         self,
+         base_url: str | None = None,
+         *,
+         cli_path: str | None = None,
+         transport: Transport = "auto",
+         timeout_s: float = DEFAULT_TIMEOUT_S,
+         auto_probe_timeout: float = 1.0,
+     ) -> None:
+         self.base_url = (base_url or os.environ.get("AGENT_EVAL_URL") or DEFAULT_BASE_URL).rstrip("/")
+         self.cli_path = cli_path or os.environ.get("AGENT_EVAL_CLI") or DEFAULT_CLI
+         self.timeout_s = timeout_s
+         self._transport = self._resolve_transport(transport, auto_probe_timeout)
+
+     # ── Public methods ──────────────────────────────────────────────
+
+     def judge(
+         self,
+         *,
+         content: str,
+         rubric_name: str | None = None,
+         rubric: Rubric | dict[str, Any] | None = None,
+         context: dict[str, Any] | None = None,
+         model: str | None = None,
+     ) -> JudgeResult:
+         """Score `content` against a rubric and return a typed result."""
+         # Validate locally so the user sees a Python-side error before the
+         # transport even fires. The server validates again as defense in depth.
+         rubric_value: Rubric | None
+         if isinstance(rubric, dict):
+             rubric_value = Rubric.model_validate(rubric)
+         else:
+             rubric_value = rubric
+         request = JudgeRequest(
+             rubric_name=rubric_name,
+             rubric=rubric_value,
+             content=content,
+             context=context,
+             model=model,
+         )
+         body = self._call("judge", request.model_dump(by_alias=True, exclude_none=True))
+         return JudgeResult.model_validate(body)
+
+     def list_rubrics(self) -> ListRubricsResponse:
+         body = self._call("listRubrics", {})
+         return ListRubricsResponse.model_validate(body)
+
+     def version(self) -> VersionResponse:
+         body = self._call("version", {})
+         return VersionResponse.model_validate(body)
+
+     @property
+     def transport(self) -> Literal["http", "subprocess"]:
+         return self._transport
+
+     # ── Transport dispatch ──────────────────────────────────────────
+
+     def _resolve_transport(
+         self, requested: Transport, probe_timeout: float
+     ) -> Literal["http", "subprocess"]:
+         if requested == "http":
+             return "http"
+         if requested == "subprocess":
+             return "subprocess"
+         # auto: probe HTTP first
+         try:
+             with httpx.Client(timeout=probe_timeout) as c:
+                 r = c.get(f"{self.base_url}/healthz")
+                 if r.status_code == 200:
+                     return "http"
+         except (httpx.HTTPError, OSError):
+             pass
+         if shutil.which(self.cli_path) is None:
+             raise TransportError(
+                 f"No HTTP server at {self.base_url} and no `{self.cli_path}` binary on PATH. "
+                 "Either run `agent-eval serve` or `npm i -g @tangle-network/agent-eval`."
+             )
+         return "subprocess"
+
+     def _call(self, method: str, params: dict[str, Any]) -> Any:
+         if self._transport == "http":
+             return self._http_call(method, params)
+         return self._subprocess_call(method, params)
+
+     def _http_call(self, method: str, params: dict[str, Any]) -> Any:
+         path = _http_path_for(method)
+         try:
+             with httpx.Client(timeout=self.timeout_s, base_url=self.base_url) as c:
+                 if path.method == "GET":
+                     r = c.get(path.url)
+                 else:
+                     r = c.post(path.url, json=params)
+         except httpx.HTTPError as e:
+             raise TransportError(f"HTTP transport failed: {e}") from e
+         if r.status_code >= 400:
+             try:
+                 error_body = r.json()
+             except ValueError as e:  # json.JSONDecodeError subclasses ValueError
+                 raise TransportError(f"HTTP {r.status_code}: {r.text[:500]}") from e
+             raise from_error_body(r.status_code, error_body)
+         try:
+             return r.json()
+         except json.JSONDecodeError as e:
+             raise TransportError(f"Server returned non-JSON body: {e}") from e
+
+     def _subprocess_call(self, method: str, params: dict[str, Any]) -> Any:
+         try:
+             proc = subprocess.run(
+                 [self.cli_path, "rpc", method],
+                 input=json.dumps(params),
+                 capture_output=True,
+                 text=True,
+                 timeout=self.timeout_s,
+                 check=False,
+             )
+         except (FileNotFoundError, subprocess.TimeoutExpired) as e:
+             raise TransportError(f"Subprocess transport failed: {e}") from e
+         if not proc.stdout:
+             raise TransportError(
+                 f"agent-eval rpc {method} produced no output. stderr: {proc.stderr[:500]}"
+             )
+         try:
+             envelope = json.loads(proc.stdout.strip().splitlines()[-1])
+         except json.JSONDecodeError as e:
+             raise TransportError(f"agent-eval rpc returned non-JSON: {proc.stdout[:500]}") from e
+         if "error" in envelope:
+             # Map to the right exception class — same as the HTTP path.
+             raise from_error_body(proc.returncode or 500, envelope)
+         if "result" not in envelope:
+             raise TransportError(f"Malformed RPC envelope: {envelope}")
+         return envelope["result"]
+
+
+ # ── Method → HTTP path mapping ──────────────────────────────────────
+
+
+ class _HttpPath:
+     __slots__ = ("method", "url")
+
+     def __init__(self, method: str, url: str) -> None:
+         self.method = method
+         self.url = url
+
+
+ _PATHS = {
+     "judge": _HttpPath("POST", "/v1/judge"),
+     "listRubrics": _HttpPath("GET", "/v1/rubrics"),
+     "version": _HttpPath("GET", "/v1/version"),
+ }
+
+
+ def _http_path_for(method: str) -> _HttpPath:
+     try:
+         return _PATHS[method]
+     except KeyError as e:
+         raise AgentEvalError(f"Unknown method: {method}") from e
@@ -0,0 +1,52 @@
+ """Exception hierarchy.
+
+ All errors raised by this client subclass `AgentEvalError`. Catch the
+ specific ones (`RubricNotFoundError`, `ValidationError`) for cases that
+ are fixable in caller code; let `TransportError` bubble or retry it.
+ """
+
+ from __future__ import annotations
+
+
+ class AgentEvalError(Exception):
+     """Base class for every error raised by this client."""
+
+     def __init__(self, message: str, *, code: str | None = None, details: object = None) -> None:
+         super().__init__(message)
+         self.code = code
+         self.details = details
+
+
+ class TransportError(AgentEvalError):
+     """The HTTP request or subprocess invocation failed at the transport layer.
+
+     Distinct from server-side errors (which arrive as 4xx with a typed
+     body — those map to other subclasses). TransportError = the request
+     couldn't be made or the response couldn't be parsed.
+     """
+
+
+ class ValidationError(AgentEvalError):
+     """Server rejected the request as malformed (HTTP 400 with code='validation_error')."""
+
+
+ class RubricNotFoundError(AgentEvalError):
+     """Server has no rubric by that name (HTTP 404 with code='rubric_not_found')."""
+
+
+ def from_error_body(status: int, body: object) -> AgentEvalError:
+     """Map a server error envelope to the right exception class."""
+     code = None
+     message = "Unknown error"
+     details = None
+     if isinstance(body, dict):
+         err = body.get("error")
+         if isinstance(err, dict):
+             code = err.get("code")
+             message = err.get("message", message)
+             details = err.get("details")
+     if code == "rubric_not_found":
+         return RubricNotFoundError(message, code=code, details=details)
+     if code == "validation_error":
+         return ValidationError(message, code=code, details=details)
+     return AgentEvalError(f"HTTP {status}: {message}", code=code, details=details)
@@ -0,0 +1,124 @@
+ """Data models that mirror the wire-protocol Zod schemas.
+
+ These pydantic models are kept in sync by hand for now — the surface is
+ small (six classes). When the wire surface grows past ~10 endpoints,
+ swap this file for `datamodel-code-generator -i openapi.json -o models.py`.
+
+ Every field name and type matches `src/wire/schemas.ts` exactly. If you
+ change one without changing the other, the dual-publish CI will fail.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
+
+
+ class _StrictModel(BaseModel):
+     """Reject unknown fields — drift between TS and Python should fail loudly."""
+
+     model_config = ConfigDict(extra="forbid")
+
+
+ class RubricDimension(_StrictModel):
+     """A scoring axis within a rubric.
+
+     Composite scores combine each dimension by `weight`. The `min`/`max`
+     bounds are used to normalize raw scores into 0..1 before weighting.
+     """
+
+     id: str = Field(..., description="Stable id like 'buyer_quality'.")
+     description: str = Field(..., description="One-line plain-English meaning.")
+     weight: float = Field(1.0, ge=0, description="Relative weight in composite. 0 disables.")
+     min: float = 0.0
+     max: float = 1.0
+
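+
+ # Illustrative only — composite scoring happens in the Node runtime, not in
+ # this package. Assuming weighted normalized averaging (an assumption, not
+ # confirmed here), a dimension's contribution would be:
+ #
+ #     normalized = (raw - dim.min) / (dim.max - dim.min)
+ #     composite  = sum(dim.weight * normalized) / sum(dim.weight)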
+
+ class FailureMode(_StrictModel):
+     """A negative pattern the judge looks for. Detected ones appear in result.failure_modes."""
+
+     id: str
+     description: str
+
+
+ class Rubric(_StrictModel):
+     """A complete rubric — what's being scored and how.
+
+     Pass this inline to `Client.judge(rubric=...)` or register a built-in
+     rubric server-side and use `Client.judge(rubric_name=...)`.
+     """
+
+     name: str
+     description: str
+     system_prompt: str = Field(..., alias="systemPrompt")
+     dimensions: list[RubricDimension]
+     failure_modes: list[FailureMode] = Field(default_factory=list, alias="failureModes")
+     wins: list[FailureMode] = Field(default_factory=list)
+
+     model_config = ConfigDict(extra="forbid", populate_by_name=True)
+
+
+ class JudgeRequest(_StrictModel):
+     """Input to /v1/judge. Provide either rubric_name or rubric (not both)."""
+
+     rubric_name: str | None = Field(None, alias="rubricName")
+     rubric: Rubric | None = None
+     content: str = Field(..., min_length=1, description="The text being judged.")
+     context: dict[str, Any] | None = Field(
+         None,
+         description="Free-form metadata surfaced to the judging LLM.",
+     )
+     model: str | None = Field(None, description="Override the judge model.")
+
+     model_config = ConfigDict(extra="forbid", populate_by_name=True)
+
+     @model_validator(mode="after")
+     def _exactly_one_rubric(self) -> JudgeRequest:
+         if (self.rubric_name is None) == (self.rubric is None):
+             raise ValueError("Provide exactly one of `rubric_name` or `rubric`.")
+         return self
+
+
+ class JudgeResult(_StrictModel):
+     """Output of /v1/judge. The `composite` is the 0..1 score to gate on."""
+
+     composite: float = Field(..., ge=0, le=1)
+     dimensions: dict[str, float]
+     failure_modes: list[str] = Field(default_factory=list, alias="failureModes")
+     wins: list[str] = Field(default_factory=list)
+     rationale: str
+     rubric_version: str = Field(..., alias="rubricVersion")
+     model: str
+     duration_ms: int = Field(..., alias="durationMs")
+
+     model_config = ConfigDict(extra="forbid", populate_by_name=True)
+
+
+ class RubricInfo(_StrictModel):
+     """One entry in /v1/rubrics."""
+
+     name: str
+     description: str
+     dimensions: list[dict[str, Any]]
+     failure_modes: list[str] = Field(default_factory=list, alias="failureModes")
+     rubric_version: str = Field(..., alias="rubricVersion")
+
+     model_config = ConfigDict(extra="forbid", populate_by_name=True)
+
+
+ class ListRubricsResponse(_StrictModel):
+     """Response from /v1/rubrics."""
+
+     rubrics: list[RubricInfo]
+
+
+ class VersionResponse(_StrictModel):
+     """Response from /v1/version. Match `version` to your installed pip package."""
+
+     package: str
+     version: str
+     wire_version: str = Field(..., alias="wireVersion")
+     api_surface: list[str] = Field(..., alias="apiSurface")
+
+     model_config = ConfigDict(extra="forbid", populate_by_name=True)
@@ -0,0 +1,67 @@
+ """Schema mirror tests — defend against TS/Python drift.
+
+ Each test names the regression it would catch. The invariant: anything
+ the TypeScript JudgeRequest accepts/rejects, the Python JudgeRequest
+ must accept/reject the same way.
+ """
+ from __future__ import annotations
+
+ import pytest
+
+ from agent_eval_rpc.models import JudgeRequest, Rubric, RubricDimension
+
+
+ MIN_RUBRIC = Rubric(
+     name="r",
+     description="d",
+     systemPrompt="p",
+     dimensions=[RubricDimension(id="a", description="b")],
+ )
+
+
+ def test_judge_request_accepts_rubric_name_alone() -> None:
+     JudgeRequest(rubric_name="anti-slop", content="hello")
+
+
+ def test_judge_request_accepts_inline_rubric_alone() -> None:
+     JudgeRequest(rubric=MIN_RUBRIC, content="hello")
+
+
+ def test_judge_request_rejects_both_rubric_name_and_rubric() -> None:
+     """Regression: ambiguous selection — server must not have to choose."""
+     with pytest.raises(ValueError, match="exactly one"):
+         JudgeRequest(rubric_name="anti-slop", rubric=MIN_RUBRIC, content="hello")
+
+
+ def test_judge_request_rejects_neither_rubric_name_nor_rubric() -> None:
+     """Regression: silently dispatching to a default rubric hides bugs."""
+     with pytest.raises(ValueError, match="exactly one"):
+         JudgeRequest(content="hello")
+
+
+ def test_judge_request_rejects_empty_content() -> None:
+     """Regression: empty content scored high because LLMs are agreeable."""
+     with pytest.raises(ValueError):
+         JudgeRequest(rubric_name="anti-slop", content="")
+
+
+ def test_rubric_dimension_defaults() -> None:
+     d = RubricDimension(id="x", description="y")
+     assert d.weight == 1.0
+     assert d.min == 0.0
+     assert d.max == 1.0
+
+
+ def test_rubric_round_trip_preserves_camelCase_aliases() -> None:
+     """Wire format uses systemPrompt/failureModes; Python uses snake_case.
+     Round-trip via .model_dump(by_alias=True) must preserve the wire shape."""
+     r = Rubric(
+         name="r",
+         description="d",
+         systemPrompt="p",
+         dimensions=[RubricDimension(id="a", description="b")],
+     )
+     payload = r.model_dump(by_alias=True)
+     assert "systemPrompt" in payload
+     assert "failureModes" in payload
+     Rubric.model_validate(payload)  # accepts its own output
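+
+
+ # A companion drift guard (a suggested sketch, not part of the upstream suite):
+ # _StrictModel sets extra="forbid", so a misspelled wire field — "weigth" here
+ # is deliberate — must raise instead of passing silently.
+ def test_strict_model_rejects_unknown_fields() -> None:
+     with pytest.raises(ValueError):  # pydantic.ValidationError subclasses ValueError
+         RubricDimension.model_validate({"id": "a", "description": "b", "weigth": 2.0})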
@@ -0,0 +1,87 @@
+ """Integration tests against the real `agent-eval rpc` binary.
+
+ These run end-to-end against the bundled CLI in this repo's `dist/`.
+ We exercise every method that doesn't need a live LLM:
+ - version
+ - listRubrics
+ - judge with empty content (validation error path)
+ - judge with bad rubric_name (rubric_not_found path)
+
+ Live judge calls (which DO hit an LLM) live in test_live_judge.py and
+ are gated by the AGENT_EVAL_LIVE=1 env var.
+ """
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ import pytest
+
+ from agent_eval_rpc import Client, RubricNotFoundError, ValidationError
+
+ REPO_ROOT = Path(__file__).resolve().parents[3]
+ CLI_DIST = REPO_ROOT / "dist" / "cli.js"
+
+ pytestmark = pytest.mark.skipif(
+     not CLI_DIST.exists(),
+     reason="run `pnpm build` in the agent-eval root before these tests",
+ )
+
+
+ def _client() -> Client:
+     """Subprocess client that invokes the bundled CLI directly via node."""
+
+     class _NodeWrappedClient(Client):
+         def _subprocess_call(self, method: str, params):  # type: ignore[override]
+             import json
+             import subprocess
+
+             proc = subprocess.run(
+                 ["node", str(CLI_DIST), "rpc", method],
+                 input=json.dumps(params),
+                 capture_output=True,
+                 text=True,
+                 timeout=self.timeout_s,
+                 check=False,
+             )
+             if not proc.stdout:
+                 raise RuntimeError(f"no stdout. stderr: {proc.stderr}")
+             envelope = json.loads(proc.stdout.strip().splitlines()[-1])
+             if "error" in envelope:
+                 from agent_eval_rpc.errors import from_error_body
+
+                 raise from_error_body(proc.returncode or 500, envelope)
+             return envelope["result"]
+
+     c = _NodeWrappedClient(transport="subprocess")
+     # transport="subprocess" already skips the PATH probe in _resolve_transport;
+     # set the resolved transport explicitly anyway so the intent is obvious.
+     c._transport = "subprocess"  # type: ignore[assignment]
+     return c
+
+
+ def test_version_via_subprocess() -> None:
+     c = _client()
+     v = c.version()
+     assert v.package == "@tangle-network/agent-eval"
+     assert v.version
+     assert "judge" in v.api_surface
+
+
+ def test_list_rubrics_includes_anti_slop() -> None:
+     c = _client()
+     rubrics = c.list_rubrics()
+     names = [r.name for r in rubrics.rubrics]
+     assert "anti-slop" in names
+
+
+ def test_judge_unknown_rubric_name_raises_RubricNotFoundError() -> None:
+     """Regression: server returns 404; client must raise the typed error, not a bare TransportError."""
+     c = _client()
+     with pytest.raises(RubricNotFoundError):
+         c.judge(content="hello world", rubric_name="no-such-rubric-xyz")
+
+
+ def test_judge_empty_content_raises_ValidationError() -> None:
+     """Regression: pydantic should catch this before the subprocess fires."""
+     c = _client()
+     with pytest.raises((ValidationError, ValueError)):
+         c.judge(content="", rubric_name="anti-slop")