avp-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avp/__init__.py +31 -0
- avp/commission.py +236 -0
- avp/content.py +273 -0
- avp/data/__init__.py +0 -0
- avp/data/prices.json +21945 -0
- avp/descriptor.py +204 -0
- avp/envelope.py +108 -0
- avp/gen_ai.py +160 -0
- avp/history.py +86 -0
- avp/pricing.py +138 -0
- avp/sink.py +62 -0
- avp/trajectory.py +530 -0
- avp_cli/__init__.py +82 -0
- avp_cli/agent.py +566 -0
- avp_cli/agent_install.py +331 -0
- avp_cli/agent_manifest.py +73 -0
- avp_cli/agents.py +258 -0
- avp_cli/brand.py +46 -0
- avp_cli/broker.py +227 -0
- avp_cli/catalog/__init__.py +128 -0
- avp_cli/catalog/capitals.json +67 -0
- avp_cli/catalog/custom.json +35 -0
- avp_cli/catalog/parsebench.json +44 -0
- avp_cli/cli.py +1858 -0
- avp_cli/commission.py +144 -0
- avp_cli/config.py +250 -0
- avp_cli/console.py +51 -0
- avp_cli/environment.py +218 -0
- avp_cli/eval/__init__.py +0 -0
- avp_cli/eval/dataset.py +37 -0
- avp_cli/eval/engine.py +426 -0
- avp_cli/eval/report.py +178 -0
- avp_cli/eval/scoring.py +260 -0
- avp_cli/eval/setup.py +69 -0
- avp_cli/images.py +119 -0
- avp_cli/library.py +95 -0
- avp_cli/live.py +185 -0
- avp_cli/observability.py +128 -0
- avp_cli/onboarding.py +80 -0
- avp_cli/osb.py +347 -0
- avp_cli/paths.py +47 -0
- avp_cli/run_manifest.py +113 -0
- avp_cli/state.py +195 -0
- avp_cli/vault.py +116 -0
- avp_cli/viz.py +303 -0
- avp_cli-0.1.0.dist-info/METADATA +359 -0
- avp_cli-0.1.0.dist-info/RECORD +49 -0
- avp_cli-0.1.0.dist-info/WHEEL +4 -0
- avp_cli-0.1.0.dist-info/entry_points.txt +2 -0
avp/trajectory.py
ADDED
|
@@ -0,0 +1,530 @@
|
|
|
1
|
+
"""avp.trajectory — Pydantic types for the AVP Trajectory Spec.
|
|
2
|
+
|
|
3
|
+
Defines the agent-emitted event stream: CloudEvents envelopes, typed
|
|
4
|
+
`data` payloads (one per event type), the `Event` discriminated union,
|
|
5
|
+
and the `parse_event` / `event_to_wire` helpers. This module mirrors
|
|
6
|
+
the [Trajectory spec](../../../../spec/v0.1/trajectory.md).
|
|
7
|
+
|
|
8
|
+
Consumers wanting only the event stream can:
|
|
9
|
+
|
|
10
|
+
from avp.trajectory import (
|
|
11
|
+
AgentStartedEvent,
|
|
12
|
+
AssistantMessageEvent,
|
|
13
|
+
parse_event,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
…without dragging in Commission / Descriptor types they don't use.
|
|
17
|
+
|
|
18
|
+
The wire format is built on:
|
|
19
|
+
|
|
20
|
+
- **CloudEvents 1.0** for the event envelope (`specversion`, `id`,
|
|
21
|
+
`source`, `type`, `subject`, `time`, `datacontenttype`, `data`).
|
|
22
|
+
- **OpenTelemetry span identification** (`trace_id`, `span_id`,
|
|
23
|
+
`parent_span_id`) so downstream consumers reconstruct the run's span
|
|
24
|
+
tree.
|
|
25
|
+
|
|
26
|
+
All AVP-defined `data` attributes (token / cost / model / tool /
|
|
27
|
+
subagent / refusal / step / content) live under the single `avp.*` namespace.
|
|
28
|
+
See `spec/v0.1/trajectory.md` for the normative attribute reference.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
from __future__ import annotations
|
|
32
|
+
|
|
33
|
+
from enum import StrEnum
|
|
34
|
+
from typing import Annotated, Any, Literal
|
|
35
|
+
|
|
36
|
+
from pydantic import BaseModel, Field, TypeAdapter, ValidationError
|
|
37
|
+
|
|
38
|
+
from avp.commission import Commission
|
|
39
|
+
from avp.content import AVPContentBlock, ToolResultBlock
|
|
40
|
+
from avp.descriptor import (
|
|
41
|
+
AgentDescriptor,
|
|
42
|
+
McpServerDecl,
|
|
43
|
+
SkillDecl,
|
|
44
|
+
SubagentDecl,
|
|
45
|
+
ToolDecl,
|
|
46
|
+
)
|
|
47
|
+
from avp.envelope import (
|
|
48
|
+
_OPEN,
|
|
49
|
+
SOURCE_AGENT,
|
|
50
|
+
ZERO_SPAN_ID,
|
|
51
|
+
_CloudEventBase,
|
|
52
|
+
_SpanData,
|
|
53
|
+
new_event_id,
|
|
54
|
+
new_span_id,
|
|
55
|
+
new_trace_id,
|
|
56
|
+
now_iso,
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
# Reverse-DNS event types per CloudEvents convention. All AVP-defined types
|
|
60
|
+
# live under the `avp.` namespace.
|
|
61
|
+
# Trajectory anchor and agent lifecycle.
T_RUN_REQUESTED = "avp.run_requested"
T_AGENT_DESCRIBED = "avp.agent_described"
T_AGENT_STARTED = "avp.agent_started"
T_AGENT_STOPPED = "avp.agent_stopped"

# Per-turn model output and tool traffic.
T_ASSISTANT_MESSAGE = "avp.assistant_message"
T_TOOL_INVOKED = "avp.tool_invoked"
T_TOOL_RETURNED = "avp.tool_returned"

# Failures.
T_ERROR_OCCURRED = "avp.error_occurred"

# Subagent delegation frames.
T_SUBAGENT_INVOKED = "avp.subagent_invoked"
T_SUBAGENT_RETURNED = "avp.subagent_returned"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class StopReason(StrEnum):
|
|
74
|
+
"""Why a run terminated. v0.1 keeps the enum tight: model said done,
|
|
75
|
+
model declined, agent crashed, or operator interrupted. Cap-driven
|
|
76
|
+
stop reasons (turn / token / cost / duration limits) are not part of
|
|
77
|
+
v0.1; agents that need bounded execution wire it externally
|
|
78
|
+
(subprocess timeouts, supervisor SIGKILL)."""
|
|
79
|
+
|
|
80
|
+
converged = "converged"
|
|
81
|
+
error = "error"
|
|
82
|
+
interrupted = "interrupted"
|
|
83
|
+
refused = "refused"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class ErrorCode(StrEnum):
|
|
87
|
+
rate_limit = "rate_limit"
|
|
88
|
+
context_limit = "context_limit"
|
|
89
|
+
auth_error = "auth_error"
|
|
90
|
+
agent_crash = "agent_crash"
|
|
91
|
+
unsupported_model = "unsupported_model"
|
|
92
|
+
unsupported_provider = "unsupported_provider"
|
|
93
|
+
commission_collision = "commission_collision"
|
|
94
|
+
mcp_connect_failed = "mcp_connect_failed"
|
|
95
|
+
unknown = "unknown"
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class Usage(BaseModel):
    """Token accounting for one turn, carried on `assistant_message.avp.usage`.

    Counting rules:

    - `input_tokens` is the turn's total input, cache-read tokens INCLUDED.
    - `cache_read_input_tokens` and `cache_creation_input_tokens` are
      informational breakdowns already counted inside `input_tokens`;
      consumers MUST NOT add them again when summing.
    - `reasoning_output_tokens` is the part of `output_tokens` the provider
      attributes to internal reasoning (o-series reasoning tokens, Anthropic
      extended-thinking output).

    `extra="allow"` so provider-specific token categories the spec does not
    enumerate (vision tokens, audio output tokens, ...) round-trip through
    `avp.usage` verbatim without requiring spec churn.
    """

    model_config = _OPEN

    # Required totals for the turn.
    input_tokens: int = Field(ge=0)
    output_tokens: int = Field(ge=0)

    # Optional breakdowns; None when not supplied.
    cache_read_input_tokens: int | None = Field(default=None, ge=0)
    cache_creation_input_tokens: int | None = Field(default=None, ge=0)
    reasoning_output_tokens: int | None = Field(default=None, ge=0)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class SubagentUsage(BaseModel):
    """Totals-only carrier for the in-process subagent rollup.

    Appears ONLY on `subagent_returned.data["avp.subagent.usage"]`, for the
    case where the parent agent's SDK does not expose the child's per-turn
    events (e.g. Claude Agent SDK's Task tool). `extra="allow"` so
    SDK-specific fields (total_tokens, tool_uses, duration_ms) round-trip
    verbatim.
    """

    model_config = _OPEN

    # All four totals are required and must be non-negative.
    cost_usd: float = Field(ge=0)
    tokens_input: int = Field(ge=0)
    tokens_output: int = Field(ge=0)
    turns: int = Field(ge=0)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# ── Data payloads (per-event-type) ────────────────────────────────────────────
|
|
139
|
+
#
|
|
140
|
+
# Every AVP event's `data` field carries an OTel span triple plus the
|
|
141
|
+
# event-type-specific attributes. Field names with dots (the OTel/MCP/JSON-RPC
|
|
142
|
+
# wire form) are declared via Pydantic aliases; Python attribute names use
|
|
143
|
+
# underscores. `model_dump(by_alias=True)` produces the wire form on emit.
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class RunRequestedData(_SpanData):
    """`data` payload for `avp.run_requested`, the trajectory's anchor event.

    When a Commission is being relayed, the full snapshot travels under
    `avp.commission` together with `avp.supervisor.*` attribution, which
    keeps the trajectory self-contained for audit. On the library-invocation
    path (no Commission) these fields are simply absent: per spec §2.1,
    absence (not `"unknown"`) is the canonical signal.
    """

    # Supervisor attribution; the name, when present, must be non-empty.
    supervisor_name: str | None = Field(
        alias="avp.supervisor.name",
        default=None,
        min_length=1,
    )
    supervisor_version: str | None = Field(
        alias="avp.supervisor.version",
        default=None,
    )

    # Full Commission snapshot, when one is in use.
    commission: Commission | None = Field(
        alias="avp.commission",
        default=None,
    )
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class AgentDescribedData(_SpanData):
    """`data` payload for `avp.agent_described`.

    Carries the agent's published Descriptor and is emitted between
    `run_requested` and `agent_started`. `avp.descriptor` SHOULD agree with
    what `<agent> describe` prints to stdout for the same agent build.
    Pre-flight describe MAY leave out MCP-surfaced tool entries (those with
    `avp.mcp_server_id` set) and per-server `mcp_servers[].status`, since
    both require a startup dial.
    """

    # Required: the descriptor itself.
    descriptor: AgentDescriptor = Field(alias="avp.descriptor")
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
class AgentStartedData(_SpanData):
    """`data` payload for `avp.agent_started`. Every field is optional."""

    # Provider / model selection.
    provider_name: str | None = Field(alias="avp.provider.name", default=None)
    operation_name: Literal["invoke_agent", "chat"] | None = Field(
        alias="avp.operation.name",
        default=None,
    )
    request_model: str | None = Field(alias="avp.request.model", default=None)

    # Prompts.
    prompt: str | None = Field(alias="avp.prompt", default=None)
    system_prompt: str | None = Field(alias="avp.system_prompt", default=None)

    # Declared capabilities.
    tools: list[ToolDecl] | None = Field(alias="avp.tools", default=None)
    mcp_servers: list[McpServerDecl] | None = Field(alias="avp.mcp_servers", default=None)
    skills: list[SkillDecl] | None = Field(alias="avp.skills", default=None)
    subagents: list[SubagentDecl] | None = Field(alias="avp.subagents", default=None)

    # Correlation identifiers and free-form tags.
    thread_id: str | None = Field(alias="avp.thread_id", default=None)
    session_id: str | None = Field(alias="avp.session_id", default=None)
    tags: list[str] | None = Field(alias="avp.tags", default=None)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
class AgentStoppedData(_SpanData):
    """`data` payload for `avp.agent_stopped`, the trajectory's terminator.

    Holds `avp.reason` (why the run ended) plus an optional `avp.output`
    payload. Cumulative totals are NOT published on this event: per-turn
    deltas live on each `assistant_message`, and consumers reduce the stream
    to compute totals.
    """

    # Required stop reason.
    reason: StopReason = Field(alias="avp.reason")
    # Optional output; the type is left open.
    output: Any | None = Field(alias="avp.output", default=None)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
class AssistantMessageData(_SpanData):
    """`data` payload for `avp.assistant_message`.

    `avp.content` (a `list[AVPContentBlock]`) holds everything the model
    produced this turn, alongside per-turn token / cost deltas. A provider
    message array is rebuilt by reading `avp.content` turn by turn and
    pairing it with the `avp.tool_result` blocks from the intervening
    `tool_returned` events, which form the user-role tool-result messages.

    Refusals: when the provider declined the turn, the refusal text shows up
    as a `RefusalBlock` (or a `TextBlock` for providers that don't typify it)
    inside `avp.content`; the upstream finish-reason string surfaces on
    `avp.response.finish_reasons`; and the provider's safety category (when
    given, free-form because every provider names them differently) surfaces
    on `avp.refusal.category`.
    """

    # Required turn index, duration, and content.
    step: int = Field(alias="avp.step", ge=0)
    duration_ms: int = Field(alias="avp.duration_ms", ge=0)
    content: list[AVPContentBlock] = Field(alias="avp.content")

    # Optional provider / model / response metadata.
    provider_name: str | None = Field(alias="avp.provider.name", default=None)
    request_model: str | None = Field(alias="avp.request.model", default=None)
    response_model: str | None = Field(alias="avp.response.model", default=None)
    response_finish_reasons: list[str] | None = Field(
        alias="avp.response.finish_reasons",
        default=None,
    )
    response_time_to_first_chunk: float | None = Field(
        alias="avp.response.time_to_first_chunk",
        default=None,
        ge=0,
    )

    # Required per-turn usage and cost; the cost source is optional.
    usage: Usage = Field(alias="avp.usage")
    cost_usd: float = Field(alias="avp.cost_usd", ge=0)
    cost_source: Literal["computed", "reported", "unknown"] | None = Field(
        alias="avp.cost.source",
        default=None,
    )

    # Free-form provider safety category for refused turns.
    refusal_category: str | None = Field(alias="avp.refusal.category", default=None)
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
# `data` payload for `avp.tool_invoked` events (see ToolInvokedEvent).
class ToolInvokedData(_SpanData):
    step: int = Field(alias="avp.step", ge=0)
    # Call id must be non-empty; ToolReturnedData declares the same alias.
    tool_call_id: str = Field(alias="avp.tool.call_id", min_length=1)
    tool_name: str = Field(alias="avp.tool.name")
    tool_input: dict[str, Any] = Field(alias="avp.tool.input")
    # Optional dispatch target: "mcp_server" or "local".
    tool_dispatch_target: Literal["mcp_server", "local"] | None = Field(
        alias="avp.tool.dispatch_target",
        default=None,
    )
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
class ToolReturnedData(_SpanData):
    """The tool result handed back to the model.

    `avp.tool_result` is a `content.ToolResultBlock` with `tool_use_id`,
    `content` (a string or nested text/image/document blocks) and
    `is_error`. A rejection sets `is_error=True` and puts the reason in
    `content[0].text`. When a conversation is reconstructed, this block
    becomes one entry of the next user-role message's content array.
    """

    step: int = Field(alias="avp.step", ge=0)
    # Call id must be non-empty; ToolInvokedData declares the same alias.
    tool_call_id: str = Field(alias="avp.tool.call_id", min_length=1)
    tool_name: str = Field(alias="avp.tool.name")
    duration_ms: int = Field(alias="avp.duration_ms", ge=0)
    tool_result: ToolResultBlock = Field(alias="avp.tool_result")
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
class SubagentInvokedData(_SpanData):
    """The parent agent hands work to a declared subagent.

    This event's `span_id` IS the subagent's frame span. Events from the
    subagent's sub-loop set `parent_span_id` to that frame (or chain through
    its descendants), so the trajectory rebuilds as a nested tree. The
    declared subagent name surfaces on `avp.subagent.name`; because the event
    type already signals an `invoke_agent`-style operation, no separate
    operation-name field is carried on the wire.

    `avp.subagent.run_id` is reserved for future use, for subagents that run
    as a separate commissioned trajectory. Absent for in-process subagents.
    """

    step: int = Field(alias="avp.step", ge=0)
    subagent_name: str = Field(alias="avp.subagent.name")
    subagent_description: str | None = Field(
        alias="avp.subagent.description",
        default=None,
    )
    # Invocation id must be non-empty; SubagentReturnedData declares the same alias.
    subagent_invocation_id: str = Field(
        alias="avp.subagent.invocation_id",
        min_length=1,
    )
    subagent_input: dict[str, Any] = Field(alias="avp.subagent.input")
    # Reserved; when present it must be non-empty.
    subagent_run_id: str | None = Field(
        alias="avp.subagent.run_id",
        default=None,
        min_length=1,
    )
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
class SubagentReturnedData(_SpanData):
    """Closes a subagent frame.

    `span_id` equals that of the matching `subagent_invoked` event so
    consumers can pair the two.

    `avp.subagent.reason` is a `StopReason`. On the error path
    `reason = error` and `avp.subagent.result.text` holds the error string;
    the paired `tool_returned` mirrors this with `is_error = true` and the
    same `Error: ...` content.

    `avp.subagent.usage` is OPTIONAL and meant only for the in-process
    fallback: a parent whose SDK black-boxes the child loop (no per-turn
    AssistantMessages exposed to the parent) reports the child's totals here,
    as the only signal the supervisor receives of the child's spend. Agents
    that emit the child's per-turn events into the parent's trajectory with
    proper span parentage (`parent_span_id` = this event's `span_id`) MUST
    omit it; the supervisor reconstructs from the raw stream. Managed
    subagents (separate `run_id`, separate trajectory) MUST also omit it;
    the supervisor reads the child's trajectory.
    """

    step: int = Field(alias="avp.step", ge=0)
    subagent_name: str = Field(alias="avp.subagent.name")
    # Invocation id must be non-empty; SubagentInvokedData declares the same alias.
    subagent_invocation_id: str = Field(
        alias="avp.subagent.invocation_id",
        min_length=1,
    )
    duration_ms: int = Field(alias="avp.duration_ms", ge=0)

    # Result: required text plus an optional structured form.
    subagent_result_text: str = Field(alias="avp.subagent.result.text")
    subagent_result_structured: Any | None = Field(
        alias="avp.subagent.result.structured",
        default=None,
    )

    subagent_reason: StopReason = Field(alias="avp.subagent.reason")
    # In-process fallback totals only; see the docstring for when to omit.
    subagent_usage: SubagentUsage | None = Field(
        alias="avp.subagent.usage",
        default=None,
    )
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
# `data` payload for `avp.error_occurred` events (see ErrorOccurredEvent).
class ErrorOccurredData(_SpanData):
    # Code drawn from the ErrorCode enum.
    error_code: ErrorCode = Field(alias="avp.error.code")
    # Required message string.
    error_message: str = Field(alias="avp.error.message")
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
# ── CloudEvents 1.0 envelope (event types) ────────────────────────────────────
|
|
332
|
+
#
|
|
333
|
+
# Each event is a CloudEvent. `type` discriminates the union. `source` is the
|
|
334
|
+
# producer URI. `subject` carries the run_id. `data` carries the typed payload.
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
class RunRequestedEvent(_CloudEventBase):
    """Opening event of the trajectory.

    The agent is the sole producer on the wire (spec §8 conformance #1), so
    `source` is `avp://agent`. When a Commission is in use, supervisor
    attribution lives inside `data` as `avp.supervisor.*` together with the
    full `avp.commission` snapshot. That is what keeps the trajectory
    self-contained for audit without resort to the envelope's `source` field.
    """

    # `type` is the discriminator the `Event` union selects on.
    type: Literal["avp.run_requested"] = T_RUN_REQUESTED
    source: Literal["avp://agent"] = SOURCE_AGENT
    data: RunRequestedData
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
class AgentDescribedEvent(_CloudEventBase):
    """Second event of the trajectory: the agent's "whoami".

    A self-published manifest of everything triggerable without supervisor
    configuration. It carries the same JSON that `<agent> describe` prints to
    stdout for this agent build.
    """

    # `type` is the discriminator the `Event` union selects on.
    type: Literal["avp.agent_described"] = T_AGENT_DESCRIBED
    source: Literal["avp://agent"] = SOURCE_AGENT
    data: AgentDescribedData
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
# CloudEvent envelope for `avp.agent_started`; `data` is an AgentStartedData.
class AgentStartedEvent(_CloudEventBase):
    type: Literal["avp.agent_started"] = T_AGENT_STARTED  # union discriminator
    source: Literal["avp://agent"] = SOURCE_AGENT
    data: AgentStartedData
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
# CloudEvent envelope for `avp.agent_stopped`; `data` is an AgentStoppedData.
class AgentStoppedEvent(_CloudEventBase):
    type: Literal["avp.agent_stopped"] = T_AGENT_STOPPED  # union discriminator
    source: Literal["avp://agent"] = SOURCE_AGENT
    data: AgentStoppedData
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
# CloudEvent envelope for `avp.assistant_message`; `data` is an AssistantMessageData.
class AssistantMessageEvent(_CloudEventBase):
    type: Literal["avp.assistant_message"] = T_ASSISTANT_MESSAGE  # union discriminator
    source: Literal["avp://agent"] = SOURCE_AGENT
    data: AssistantMessageData
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
# CloudEvent envelope for `avp.tool_invoked`; `data` is a ToolInvokedData.
class ToolInvokedEvent(_CloudEventBase):
    type: Literal["avp.tool_invoked"] = T_TOOL_INVOKED  # union discriminator
    source: Literal["avp://agent"] = SOURCE_AGENT
    data: ToolInvokedData
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
# CloudEvent envelope for `avp.tool_returned`; `data` is a ToolReturnedData.
class ToolReturnedEvent(_CloudEventBase):
    type: Literal["avp.tool_returned"] = T_TOOL_RETURNED  # union discriminator
    source: Literal["avp://agent"] = SOURCE_AGENT
    data: ToolReturnedData
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
# CloudEvent envelope for `avp.subagent_invoked`; `data` is a SubagentInvokedData.
class SubagentInvokedEvent(_CloudEventBase):
    type: Literal["avp.subagent_invoked"] = T_SUBAGENT_INVOKED  # union discriminator
    source: Literal["avp://agent"] = SOURCE_AGENT
    data: SubagentInvokedData
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
# CloudEvent envelope for `avp.subagent_returned`; `data` is a SubagentReturnedData.
class SubagentReturnedEvent(_CloudEventBase):
    type: Literal["avp.subagent_returned"] = T_SUBAGENT_RETURNED  # union discriminator
    source: Literal["avp://agent"] = SOURCE_AGENT
    data: SubagentReturnedData
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
# CloudEvent envelope for `avp.error_occurred`; `data` is an ErrorOccurredData.
class ErrorOccurredEvent(_CloudEventBase):
    type: Literal["avp.error_occurred"] = T_ERROR_OCCURRED  # union discriminator
    source: Literal["avp://agent"] = SOURCE_AGENT
    data: ErrorOccurredData
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
class UnknownEvent(_CloudEventBase):
    """Fallback model for CloudEvents whose `type` is outside the AVP union.

    It validates the CloudEvents 1.0 envelope and the AVP span triple on
    `data`; everything else in `data` is free-form. Consumers MUST pass
    unknown event types through without error (spec §4), so any well-formed
    CloudEvent — forward-compat AVP additions, vendor-namespaced events
    under `acme.*`, etc. — round-trips through this class.

    `id` and `time` are re-declared here without `default_factory`, so a
    missing envelope field is an error rather than a silently fabricated
    value.
    """

    # All four envelope fields are required and must be non-empty.
    id: str = Field(min_length=1)
    time: str = Field(min_length=1)
    source: str = Field(min_length=1)
    type: str = Field(min_length=1)

    # Only the base span payload is typed.
    data: _SpanData
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
# ── Discriminated unions ──────────────────────────────────────────────────────
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
# The ten concrete AVP event classes, in the same order as the `Event` union
# below. UnknownEvent is not included.
_AGENT_EVENT_TYPES = (
    # Anchor and lifecycle.
    RunRequestedEvent,
    AgentDescribedEvent,
    AgentStartedEvent,
    AgentStoppedEvent,
    # Turns and tool traffic.
    AssistantMessageEvent,
    ToolInvokedEvent,
    ToolReturnedEvent,
    # Subagent frames.
    SubagentInvokedEvent,
    SubagentReturnedEvent,
    # Failures.
    ErrorOccurredEvent,
)
|
|
445
|
+
|
|
446
|
+
# Tagged union over every AVP-defined event class, discriminated on the
# envelope's `type` field. UnknownEvent is not a member; parse_event
# validates unknown types against it separately.
Event = Annotated[
    (
        RunRequestedEvent
        | AgentDescribedEvent
        | AgentStartedEvent
        | AgentStoppedEvent
        | AssistantMessageEvent
        | ToolInvokedEvent
        | ToolReturnedEvent
        | SubagentInvokedEvent
        | SubagentReturnedEvent
        | ErrorOccurredEvent
    ),
    Field(discriminator="type"),
]
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
# Shared validator for the `Event` union. Built lazily on the first
# parse_event call and reused afterwards, so the union's validation schema is
# not rebuilt for every event.
_EVENT_ADAPTER: TypeAdapter[Any] | None = None


def _event_adapter() -> TypeAdapter[Any]:
    """Return the shared `Event` TypeAdapter, constructing it on first use."""
    global _EVENT_ADAPTER
    if _EVENT_ADAPTER is None:
        _EVENT_ADAPTER = TypeAdapter(Event)
    return _EVENT_ADAPTER


def parse_event(payload: dict[str, Any]) -> Event | UnknownEvent:
    """Parse an agent-emitted event payload.

    Known types validate via the `Event` discriminated-union TypeAdapter.
    Unknown types validate as `UnknownEvent`: envelope + span triple are
    enforced; the rest of `data` is opaque. Per spec/v0.1/README.md §4,
    consumers MUST pass through unknown types without error.

    Args:
        payload: The wire-form event as a dict.

    Returns:
        The matching member of the `Event` union, or an `UnknownEvent` when
        the envelope's `type` is not one of the AVP-defined tags.

    Raises:
        ValidationError: If a known event type fails validation, or if an
            unknown-typed payload does not satisfy `UnknownEvent`.
    """
    try:
        return _event_adapter().validate_python(payload)
    except ValidationError as exc:
        # Fall back only when the TOP-LEVEL union rejected the tag, which
        # Pydantic reports with an empty `loc`. A `union_tag_invalid` raised
        # deeper inside a known event (a nested tagged union, if any payload
        # model uses one) means the known event is malformed and must
        # surface as an error, not be accepted as an UnknownEvent.
        unknown_type = any(
            err.get("type") == "union_tag_invalid" and not err.get("loc")
            for err in exc.errors()
        )
        if unknown_type:
            return UnknownEvent.model_validate(payload)
        raise
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
def event_to_wire(event: BaseModel) -> dict[str, Any]:
    """Serialize an event Pydantic model to its wire-form dict.

    The dump always uses aliases (the dotted names such as
    `avp.response.finish_reasons`), omits fields whose value is `None`, and
    runs in JSON mode, so the result is what consumers see on the wire.
    """
    wire_form = event.model_dump(
        mode="json",
        by_alias=True,
        exclude_none=True,
    )
    return wire_form
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
# Public API of avp.trajectory. Includes the envelope helpers re-exported
# from avp.envelope (SOURCE_AGENT, ZERO_SPAN_ID, new_*_id, now_iso). The
# ErrorCode and StopReason enums are listed because exported payload models
# use them as field types, so star-importers need them too.
__all__ = [
    "SOURCE_AGENT",
    "T_AGENT_DESCRIBED",
    "T_AGENT_STARTED",
    "T_AGENT_STOPPED",
    "T_ASSISTANT_MESSAGE",
    "T_ERROR_OCCURRED",
    "T_RUN_REQUESTED",
    "T_SUBAGENT_INVOKED",
    "T_SUBAGENT_RETURNED",
    "T_TOOL_INVOKED",
    "T_TOOL_RETURNED",
    "ZERO_SPAN_ID",
    "AgentDescribedData",
    "AgentDescribedEvent",
    "AgentStartedData",
    "AgentStartedEvent",
    "AgentStoppedData",
    "AgentStoppedEvent",
    "AssistantMessageData",
    "AssistantMessageEvent",
    "ErrorCode",
    "ErrorOccurredData",
    "ErrorOccurredEvent",
    "Event",
    "RunRequestedData",
    "RunRequestedEvent",
    "StopReason",
    "SubagentInvokedData",
    "SubagentInvokedEvent",
    "SubagentReturnedData",
    "SubagentReturnedEvent",
    "SubagentUsage",
    "ToolInvokedData",
    "ToolInvokedEvent",
    "ToolReturnedData",
    "ToolReturnedEvent",
    "UnknownEvent",
    "Usage",
    "event_to_wire",
    "new_event_id",
    "new_span_id",
    "new_trace_id",
    "now_iso",
    "parse_event",
]
|
avp_cli/__init__.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""avp_cli — the local AVP CLI (`avp`): build, run, and iterate on Commissions.
|
|
2
|
+
|
|
3
|
+
An eval is a JSON config file (no user code); the CLI is the engine. It loads a
|
|
4
|
+
config, composes a Commission per setup, runs each against a real agent (Goose /
|
|
5
|
+
Claude Code) via the agent's `run --commission --out` manifest contract, scores
|
|
6
|
+
each run, and ranks a board by accuracy / pass-rate / cost / turns.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from avp_cli.agent import load_manifest, run_agent
|
|
12
|
+
from avp_cli.agents import ResolvedAgent, known_agents, preflight, resolve_agent
|
|
13
|
+
from avp_cli.config import EvalConfigError, eval_from_dict, load_eval
|
|
14
|
+
from avp_cli.eval.dataset import Dataset, Item
|
|
15
|
+
from avp_cli.eval.engine import (
|
|
16
|
+
Board,
|
|
17
|
+
Eval,
|
|
18
|
+
RunObserver,
|
|
19
|
+
RunResult,
|
|
20
|
+
SetupRow,
|
|
21
|
+
extract_final_output,
|
|
22
|
+
run_eval,
|
|
23
|
+
run_matrix,
|
|
24
|
+
)
|
|
25
|
+
from avp_cli.eval.report import (
|
|
26
|
+
board_table,
|
|
27
|
+
board_to_dict,
|
|
28
|
+
comparison_table,
|
|
29
|
+
dump_json,
|
|
30
|
+
failures,
|
|
31
|
+
)
|
|
32
|
+
from avp_cli.eval.scoring import (
|
|
33
|
+
ExactMatchScorer,
|
|
34
|
+
FidelityScorer,
|
|
35
|
+
FinalOutput,
|
|
36
|
+
LLMJudgeScorer,
|
|
37
|
+
Score,
|
|
38
|
+
Scorer,
|
|
39
|
+
StructuralMatchScorer,
|
|
40
|
+
)
|
|
41
|
+
from avp_cli.eval.setup import Setup
|
|
42
|
+
from avp_cli.observability import Summary, ToolUsage, render, summarize, tool_tally
|
|
43
|
+
|
|
44
|
+
# Names re-exported at the avp_cli package root; every entry is imported
# above. Order is unchanged: CamelCase names first, then snake_case names.
__all__ = [
    # CamelCase names.
    "Board",
    "Dataset",
    "Eval",
    "EvalConfigError",
    "ExactMatchScorer",
    "FidelityScorer",
    "FinalOutput",
    "Item",
    "LLMJudgeScorer",
    "ResolvedAgent",
    "RunObserver",
    "RunResult",
    "Score",
    "Scorer",
    "Setup",
    "SetupRow",
    "StructuralMatchScorer",
    "Summary",
    "ToolUsage",
    # snake_case names.
    "board_table",
    "board_to_dict",
    "comparison_table",
    "dump_json",
    "eval_from_dict",
    "extract_final_output",
    "failures",
    "known_agents",
    "load_eval",
    "load_manifest",
    "preflight",
    "render",
    "resolve_agent",
    "run_agent",
    "run_eval",
    "run_matrix",
    "summarize",
    "tool_tally",
]
|