agentversion 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,258 @@
1
+ """Compatibility classification for AgentVersion.
2
+
3
+ Given a diff between two manifests, classifies what action is needed
4
+ for existing data (keep / repair / replay / drop).
5
+
6
+ See spec/reference.md §3 for the compatibility decision taxonomy and
7
+ spec/compatibility-policy.md for the user-configurable policy schema.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Literal
13
+
14
+ from pydantic import BaseModel, Field
15
+
16
+ from agentversion.constants import SPEC_VERSION
17
+ from agentversion.diff import ManifestDiff
18
+
19
+ # --- Mapping: surface changes → reason codes ---
20
+
21
# Lookup table: contract surface name -> reason codes reported when that
# surface appears in a diff. Key order and per-surface code order are
# significant because `_reason_codes_for` preserves them in its output.
_SURFACE_TO_REASON_CODES: dict[str, list[str]] = {
    "prompt_stack": [
        "prompt_policy_changed",
        "prompt_format_changed",
    ],
    # NOTE(review): model_runtime reuses a prompt-related code; confirm against
    # the spec that this is intended rather than a copy/paste leftover.
    "model_runtime": ["prompt_policy_changed"],
    "tool_registry": [
        "tool_missing",
        "tool_schema_incompatible",
        "tool_semantics_changed",
    ],
    "skill_registry": [
        "skill_missing",
        "skill_content_changed",
    ],
    "workflow": ["workflow_surface_changed"],
    "subagents": ["subagent_interface_changed"],
    "output_contract": ["output_contract_changed"],
    "guardrails": ["guardrail_policy_changed"],
    "context_config": ["context_config_changed"],
    "environment": [
        "region_changed",
        "infra_image_changed",
        "external_service_pin_changed",
        "runtime_version_changed",
    ],
}
38
+
39
+
40
# Actions a policy rule may prescribe for existing data.
PolicyAction = Literal["keep", "repair", "flag", "replay", "drop"]

# Rank of each action, used to combine per-surface verdicts: the action with
# the highest rank is the most conservative and overrides milder ones.
# Ranks follow the tuple order below (keep=0 ... drop=4).
_ACTION_PRIORITY: dict[str, int] = {
    action: rank
    for rank, action in enumerate(("keep", "repair", "flag", "replay", "drop"))
}
50
+
51
+
52
+ # --- Pydantic models ---
53
+
54
+
55
class SurfaceRules(BaseModel):
    """Action to take for one contract surface, keyed by change severity.

    The defaults escalate with severity: minor changes keep the data,
    moderate changes flag it, major changes drop it.
    """

    # Action applied when the change severity is neither "moderate" nor
    # "major" (see `_classify_with_policy`).
    on_minor: PolicyAction = "keep"
    # Action applied for a "moderate" change.
    on_moderate: PolicyAction = "flag"
    # Action applied for a "major" change.
    on_major: PolicyAction = "drop"
61
+
62
+
63
class CompatibilityPolicy(BaseModel):
    """User-configurable mapping from change severity to action per surface.

    Each optional surface attribute holds the ``SurfaceRules`` override for
    that contract surface. ``None`` means the surface is unset and
    ``rules_for`` falls back to the ``SurfaceRules`` defaults.

    See ``schemas/compatibility-policy.schema.json`` for the JSON Schema.
    """

    kind: Literal["compatibility_policy"] = "compatibility_policy"
    version: str = "0.1"
    name: str = "default"
    # NOTE(review): `preset` is not consulted by `rules_for`; presumably presets
    # are expanded into per-surface rules by the loader. Confirm.
    preset: Literal["strict", "default", "permissive", "custom"] | None = "default"

    # Per-surface overrides; None = use SurfaceRules() defaults.
    prompt_stack: SurfaceRules | None = None
    model_runtime: SurfaceRules | None = None
    tool_registry: SurfaceRules | None = None
    skill_registry: SurfaceRules | None = None
    workflow: SurfaceRules | None = None
    subagents: SurfaceRules | None = None
    output_contract: SurfaceRules | None = None
    guardrails: SurfaceRules | None = None
    context_config: SurfaceRules | None = None
    environment: SurfaceRules | None = None

    def rules_for(self, surface: str) -> SurfaceRules:
        """Return rules for a surface, falling back to defaults if unset.

        Args:
            surface: Name of a contract surface, e.g. ``"tool_registry"``.

        Returns:
            The ``SurfaceRules`` configured for ``surface``, or a default
            ``SurfaceRules()`` when the surface is unset or is not a rule
            attribute of this policy.
        """
        configured = getattr(self, surface, None)
        # Accept only real rule objects. A surface name that collides with a
        # non-rule attribute ("name", "version", "kind", "preset", a method, ...)
        # would otherwise hand a str or bound method to callers that go on to
        # read ``on_minor`` / ``on_moderate`` / ``on_major``.
        if isinstance(configured, SurfaceRules):
            return configured
        return SurfaceRules()
89
+
90
+
91
class CompatibilityReport(BaseModel):
    """Outcome of classifying a manifest diff against existing data.

    Carries the recommended decision together with the reason codes and the
    breaking / non-breaking surface names that the classifier collected.
    """

    spec_version: str = SPEC_VERSION
    kind: Literal["compatibility_report"] = "compatibility_report"
    # Identifiers of the two manifests the diff was computed between.
    old_manifest_id: str
    new_manifest_id: str
    # Four-value enum: there is no "flag" here, so policy verdicts of "flag"
    # are mapped to "replay" by `_classify_with_policy`.
    recommended_decision: Literal["keep", "repair", "replay", "drop"]
    reason_codes: list[str] = Field(default_factory=list)
    breaking_surfaces: list[str] = Field(default_factory=list)
    non_breaking_surfaces: list[str] = Field(default_factory=list)
    # Human-readable explanation of the decision; empty by default.
    summary: str = ""
107
+
108
+
109
def _reason_codes_for(diff: ManifestDiff) -> list[str]:
    """Gather the de-duplicated reason codes for all changed surfaces.

    Codes appear in first-seen order: surfaces are visited in the order of
    ``diff.changed_surfaces`` and each surface contributes its codes in the
    order listed in ``_SURFACE_TO_REASON_CODES``. Surfaces missing from that
    table contribute nothing.
    """
    # A dict doubles as an insertion-ordered set; re-adding an existing key
    # leaves its original position untouched.
    ordered_codes: dict[str, None] = {}
    for surface_change in diff.changed_surfaces:
        codes = _SURFACE_TO_REASON_CODES.get(surface_change.surface, [])
        ordered_codes.update(dict.fromkeys(codes))
    return list(ordered_codes)
117
+
118
+
119
def _classify_with_default_rules(diff: ManifestDiff) -> CompatibilityReport:
    """Classify a diff with the built-in rules used when no policy is given.

    Decision table:

    - no breaking surface -> ``keep``
    - ``output_contract`` is the only breaking surface -> ``repair``
    - anything else -> ``replay``

    Changes whose ``change_type`` is neither ``"breaking"`` nor
    ``"non_breaking"`` are listed in neither surface list.
    """
    # Partition the changed surfaces by change type in a single pass.
    breaking_names: list[str] = []
    non_breaking_names: list[str] = []
    for change in diff.changed_surfaces:
        if change.change_type == "breaking":
            breaking_names.append(change.surface)
        elif change.change_type == "non_breaking":
            non_breaking_names.append(change.surface)

    reason_codes = _reason_codes_for(diff)

    decision: Literal["keep", "repair", "replay", "drop"]
    if not breaking_names:
        decision = "keep"
        summary = "Only non-breaking changes — data remains valid."
    elif breaking_names == ["output_contract"]:
        # Output-contract-only breaking change is repairable via schema migration.
        decision = "repair"
        summary = (
            "Output contract changed — existing data may need schema migration "
            "but can be repaired without full replay."
        )
    else:
        decision = "replay"
        summary = (
            f"Breaking changes in {', '.join(breaking_names)} — "
            f"existing data should be replayed against the new agent version."
        )

    return CompatibilityReport(
        old_manifest_id=diff.old_manifest_id,
        new_manifest_id=diff.new_manifest_id,
        recommended_decision=decision,
        reason_codes=reason_codes,
        breaking_surfaces=breaking_names,
        non_breaking_surfaces=non_breaking_names,
        summary=summary,
    )
165
+
166
+
167
def _classify_with_policy(diff: ManifestDiff, policy: CompatibilityPolicy) -> CompatibilityReport:
    """Derive the recommended decision from a user-supplied policy.

    Every changed surface is resolved to an action through
    ``policy.rules_for(surface).on_<severity>``; the action with the highest
    ``_ACTION_PRIORITY`` (drop > replay > flag > repair > keep) wins. When
    several surfaces tie, the first one in diff order is reported.

    The report enum has no ``flag`` value, so a winning ``flag`` is reported
    as ``replay``; the summary text still names the raw ``flag`` verdict.
    """
    breaking_names = [
        change.surface
        for change in diff.changed_surfaces
        if change.change_type == "breaking"
    ]
    non_breaking_names = [
        change.surface
        for change in diff.changed_surfaces
        if change.change_type == "non_breaking"
    ]
    reason_codes = _reason_codes_for(diff)

    # (surface, action) pairs in diff order.
    verdicts: list[tuple[str, str]] = []
    for change in diff.changed_surfaces:
        surface_rules = policy.rules_for(change.surface)
        # Any severity other than "major" / "moderate" uses the minor rule.
        verdict = (
            surface_rules.on_major
            if change.severity == "major"
            else surface_rules.on_moderate
            if change.severity == "moderate"
            else surface_rules.on_minor
        )
        verdicts.append((change.surface, verdict))

    if not verdicts:
        return CompatibilityReport(
            old_manifest_id=diff.old_manifest_id,
            new_manifest_id=diff.new_manifest_id,
            recommended_decision="keep",
            summary="No changes detected — all data remains valid.",
        )

    worst_surface, worst_action = max(
        verdicts, key=lambda pair: _ACTION_PRIORITY[pair[1]]
    )
    was_flag = worst_action == "flag"

    # `flag` collapses to `replay` for the four-value report enum.
    decision: Literal["keep", "repair", "replay", "drop"] = (
        "replay" if was_flag else worst_action  # type: ignore[assignment]
    )

    summary = f"Policy {policy.name!r}: worst surface = {worst_surface} → {worst_action}"
    if was_flag:
        summary += f" (mapped to {decision})"

    return CompatibilityReport(
        old_manifest_id=diff.old_manifest_id,
        new_manifest_id=diff.new_manifest_id,
        recommended_decision=decision,
        reason_codes=reason_codes,
        breaking_surfaces=breaking_names,
        non_breaking_surfaces=non_breaking_names,
        summary=summary,
    )
225
+
226
+
227
def classify_compatibility(
    diff: ManifestDiff,
    policy: CompatibilityPolicy | None = None,
) -> CompatibilityReport:
    """Classify the compatibility impact of a manifest diff.

    Args:
        diff: A computed ``ManifestDiff``.
        policy: Optional user-configurable policy. When given, its per-surface
            severity → action rules decide the outcome. When omitted, the
            built-in fallback is used:

            - No changes → keep
            - Only non-breaking changes → keep
            - Breaking changes in ``output_contract`` only → repair
            - Any other breaking changes → replay

    Returns:
        A ``CompatibilityReport`` with the recommended decision.
    """
    if diff.changed_surfaces:
        if policy is None:
            return _classify_with_default_rules(diff)
        return _classify_with_policy(diff, policy)

    # An empty diff short-circuits to "keep", with or without a policy.
    return CompatibilityReport(
        old_manifest_id=diff.old_manifest_id,
        new_manifest_id=diff.new_manifest_id,
        recommended_decision="keep",
        summary="No changes detected — all data remains valid.",
    )
@@ -0,0 +1,8 @@
1
+ """Shared constants for AgentVersion.
2
+
3
+ ``SPEC_VERSION`` is the single source of truth for the spec version that every
4
+ emitted object declares in its ``spec_version`` field. Per
5
+ spec/versioning-policy.md, v1.0 is the floor — there is no v0.x.
6
+ """
7
+
8
# Spec version stamped into the ``spec_version`` field of emitted objects.
SPEC_VERSION: str = "1.0.0"
@@ -0,0 +1,248 @@
1
+ """Dataset models for AgentVersion.
2
+
3
+ Defines canonical schemas for trace-derived objects: tasks, episodes,
4
+ steps, and dataset snapshots.
5
+
6
+ See spec/reference.md §2.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from datetime import datetime
12
+ from typing import Any, Literal
13
+
14
+ from pydantic import BaseModel, Field
15
+
16
+ from agentversion._shared import Message
17
+ from agentversion.constants import SPEC_VERSION
18
+
19
+ # --- 2.1 Task (§2.1) ---
20
+
21
+
22
class TaskSource(BaseModel):
    """Provenance record describing where a task originated."""

    # Required origin category, e.g. "production", "synthetic", "manual".
    type: str
    # Both optional; default to None when not supplied.
    system: str | None = None
    external_id: str | None = None
28
+
29
+
30
class TaskInput(BaseModel):
    """Task input, expressed as a message list so multi-turn input is supported."""

    # Required list of messages.
    messages: list[Message]
34
+
35
+
36
class Task(BaseModel):
    """One unit of work for an agent.

    See spec/reference.md §2.1.
    """

    spec_version: str = SPEC_VERSION
    kind: Literal["task"] = "task"
    # Required fields: identifier, creation timestamp and input.
    task_id: str
    source: TaskSource | None = None
    created_at: datetime
    input: TaskInput
    # Optional collections; each instance gets its own empty container.
    attachments: list[dict[str, Any]] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)
    tags: list[str] = Field(default_factory=list)
51
+
52
+
53
+ # --- 2.2 Episode (§2.2) ---
54
+
55
+
56
class EpisodeSource(BaseModel):
    """Provenance record describing where an episode originated."""

    # Required origin category, e.g. "production_trace", "replay", "synthetic".
    type: str
    # Both optional; default to None when not supplied.
    system: str | None = None
    external_trace_id: str | None = None
62
+
63
+
64
class EpisodeResult(BaseModel):
    """Outcome produced by executing an episode."""

    # Both fields are optional and default to None.
    final_output: dict[str, Any] | None = None
    success_label: bool | None = None
69
+
70
+
71
class EpisodeLineage(BaseModel):
    """Ancestry information for an episode."""

    parent_episode_id: str | None = None
    # Free-form string, e.g. "original", "replay", "repair".
    derived_from: str | None = None
76
+
77
+
78
class ObservabilityRefs(BaseModel):
    """Pointers into external observability systems."""

    # All references are optional and default to None.
    otel_trace_id: str | None = None
    otel_span_id: str | None = None
    source_url: str | None = None
84
+
85
+
86
class Episode(BaseModel):
    """A single execution attempt of a task.

    See spec/reference.md §2.2.
    """

    spec_version: str = SPEC_VERSION
    kind: Literal["episode"] = "episode"
    # Required identifiers: this episode and the task it belongs to.
    episode_id: str
    task_id: str
    source: EpisodeSource | None = None
    manifest_id: str | None = None
    # Required terminal status; restricted to these five values.
    status: Literal["success", "failure", "error", "timeout", "cancelled"]
    started_at: datetime | None = None
    ended_at: datetime | None = None
    # Identifiers of steps, stored by reference; empty list by default.
    step_ids: list[str] = Field(default_factory=list)
    result: EpisodeResult | None = None
    lineage: EpisodeLineage | None = None
    observability_refs: ObservabilityRefs | None = None
105
+
106
+
107
+ # --- 2.3 Step (§2.3) ---
108
+
109
+
110
class StepActor(BaseModel):
    """The party that performed a step."""

    # Required actor category: "agent", "tool", "user", "system".
    type: str
    name: str | None = None
115
+
116
+
117
class StepInput(BaseModel):
    """What was fed into a step."""

    # Optional message list; None when not supplied.
    messages: list[Message] | None = None
121
+
122
+
123
class ToolCallOutput(BaseModel):
    """A tool call emitted as step output: a tool name plus its arguments."""

    name: str
    # Defaults to a fresh empty dict per instance.
    arguments: dict[str, Any] = Field(default_factory=dict)
128
+
129
+
130
class StepOutput(BaseModel):
    """What a step produced: an optional tool call and/or optional text."""

    # Neither field is required; the model does not enforce that exactly one
    # of them is set.
    tool_call: ToolCallOutput | None = None
    text: str | None = None
135
+
136
+
137
class SchemaRefs(BaseModel):
    """Hashes identifying the tool schemas a step used."""

    # Both hashes are optional and default to None.
    tool_input_schema_hash: str | None = None
    tool_output_schema_hash: str | None = None
142
+
143
+
144
class TokenUsage(BaseModel):
    """Token counts for input and output."""

    # Both counts are optional and default to None.
    input_tokens: int | None = None
    output_tokens: int | None = None
149
+
150
+
151
# Step type names referenced by the ``Step.step_type`` field comment.
STEP_TYPES = [
    # model / tool invocations
    "llm_call",
    "tool_call",
    # orchestration
    "router_decision",
    "subagent_handoff",
    "validator_check",
    # memory and retrieval
    "memory_read",
    "memory_write",
    "retrieval",
    # everything else
    "system_event",
]
162
+
163
+
164
class Step(BaseModel):
    """The atomic unit of activity inside an episode.

    See spec/reference.md §2.3.
    """

    spec_version: str = SPEC_VERSION
    kind: Literal["step"] = "step"
    # Required identifiers and integer index.
    step_id: str
    episode_id: str
    index: int
    # Expected to be one of STEP_TYPES; declared as a plain str, so the type
    # annotation itself does not enforce membership.
    step_type: str
    started_at: datetime | None = None
    ended_at: datetime | None = None
    actor: StepActor | None = None
    input: StepInput | None = None
    output: StepOutput | None = None
    schema_refs: SchemaRefs | None = None
    observability_refs: ObservabilityRefs | None = None
    # Free-form extra data; fresh empty dict per instance.
    metadata: dict[str, Any] = Field(default_factory=dict)
184
+
185
+
186
+ # --- 2.4 Dataset Snapshot (§2.4) ---
187
+
188
+
189
class DataClassification(BaseModel):
    """Compliance labels attached to a dataset snapshot (§3n).

    Supports filtering (e.g. "show me datasets I can ship outside the EU"),
    retention enforcement, and consent tracking.
    """

    pii_state: Literal["raw", "redacted", "synthetic", "none"] = "none"
    # Optional; when set it must be at least 1.
    retention_days: int | None = Field(default=None, ge=1)
    # e.g. ["us-east-1", "eu-west-1"]
    residency: list[str] = Field(default_factory=list)
    redaction_policy_ref: str | None = None
    consent_basis: (
        Literal[
            "consent",
            "contract",
            "legitimate_interest",
            "legal_obligation",
            "vital_interest",
            "public_task",
        ]
        | None
    ) = None
201
+
202
+
203
class SelectionPolicy(BaseModel):
    """Criteria by which items were chosen for a snapshot."""

    source_types: list[str] = Field(default_factory=list)
    # Optional filters; None when not supplied.
    required_episode_status: str | None = None
    required_policy_compliance: bool | None = None
    # Allow-list of PII states; see the field description for the semantics.
    pii_states: list[Literal["raw", "redacted", "synthetic", "none"]] = Field(
        default_factory=list,
        description=(
            "Filter: episodes whose data_classification.pii_state is in this list "
            "are eligible. Empty list = no filter."
        ),
    )
216
+
217
+
218
class ItemRef(BaseModel):
    """Pointer to one task, optionally narrowed to an episode and a step."""

    # Only the task is mandatory; episode and step default to None.
    task_id: str
    episode_id: str | None = None
    step_id: str | None = None
224
+
225
+
226
class SnapshotLineage(BaseModel):
    """Ancestry information for a dataset snapshot."""

    # Both lists default to a fresh empty list per instance.
    source_snapshot_ids: list[str] = Field(default_factory=list)
    built_from_manifest_ids: list[str] = Field(default_factory=list)
231
+
232
+
233
class DatasetSnapshot(BaseModel):
    """A frozen, curated dataset together with its provenance.

    See spec/reference.md §2.4.
    """

    spec_version: str = SPEC_VERSION
    kind: Literal["dataset_snapshot"] = "dataset_snapshot"
    # Required identity and descriptive fields.
    snapshot_id: str
    name: str
    # Free-form dataset category, e.g. "sft", "eval", "preference".
    dataset_type: str
    created_at: datetime
    selection_policy: SelectionPolicy | None = None
    # References to the selected items; empty list by default.
    item_refs: list[ItemRef] = Field(default_factory=list)
    lineage: SnapshotLineage | None = None
    data_classification: DataClassification | None = None