PyPI - agentversion - Versions diffs - 0.1.0__py3-none-any.whl - Mend

agentversion 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

agentversion/__init__.py +31 -0
agentversion/_shared.py +23 -0
agentversion/cli.py +407 -0
agentversion/compatibility.py +258 -0
agentversion/constants.py +8 -0
agentversion/dataset.py +248 -0
agentversion/decision.py +249 -0
agentversion/diff.py +740 -0
agentversion/hasher.py +162 -0
agentversion/ids.py +324 -0
agentversion/manifest.py +405 -0
agentversion/py.typed +0 -0
agentversion/refs.py +128 -0
agentversion/replay.py +166 -0
agentversion/validator.py +346 -0
agentversion-0.1.0.dist-info/METADATA +252 -0
agentversion-0.1.0.dist-info/RECORD +20 -0
agentversion-0.1.0.dist-info/WHEEL +4 -0
agentversion-0.1.0.dist-info/entry_points.txt +2 -0
agentversion-0.1.0.dist-info/licenses/LICENSE +190 -0

agentversion/compatibility.py ADDED Viewed

@@ -0,0 +1,258 @@
+"""Compatibility classification for AgentVersion.
+Given a diff between two manifests, classifies what action is needed
+for existing data (keep / repair / replay / drop).
+See spec/reference.md §3 for the compatibility decision taxonomy and
+spec/compatibility-policy.md for the user-configurable policy schema.
+"""
+from __future__ import annotations
+from typing import Literal
+from pydantic import BaseModel, Field
+from agentversion.constants import SPEC_VERSION
+from agentversion.diff import ManifestDiff
+# --- Mapping: surface changes → reason codes ---
+# Keys are contract surface names, looked up by ``change.surface`` in
+# ``_reason_codes_for``. Values are the reason codes reported for that surface.
+# The lookup is per surface, not per individual change: when a surface appears
+# in a diff, every code listed for it is emitted. A code shared by several
+# surfaces (e.g. "prompt_policy_changed") is reported once, because
+# ``_reason_codes_for`` de-duplicates.
+_SURFACE_TO_REASON_CODES: dict[str, list[str]] = {
+    "prompt_stack": ["prompt_policy_changed", "prompt_format_changed"],
+    "model_runtime": ["prompt_policy_changed"],
+    "tool_registry": ["tool_missing", "tool_schema_incompatible", "tool_semantics_changed"],
+    "skill_registry": ["skill_missing", "skill_content_changed"],
+    "workflow": ["workflow_surface_changed"],
+    "subagents": ["subagent_interface_changed"],
+    "output_contract": ["output_contract_changed"],
+    "guardrails": ["guardrail_policy_changed"],
+    "context_config": ["context_config_changed"],
+    "environment": [
+        "region_changed",
+        "infra_image_changed",
+        "external_service_pin_changed",
+        "runtime_version_changed",
+    ],
+}
+# Actions a policy rule may prescribe for a changed surface. Note that "flag"
+# is a policy-only verdict: ``CompatibilityReport.recommended_decision`` has no
+# such value.
+PolicyAction = Literal["keep", "repair", "flag", "replay", "drop"]
+# Decision priority — used when combining per-surface verdicts.
+# Higher number = more conservative (overrides milder verdicts).
+_ACTION_PRIORITY: dict[str, int] = {
+    "keep": 0,
+    "repair": 1,
+    "flag": 2,
+    "replay": 3,
+    "drop": 4,
+}
+# --- Pydantic models ---
+class SurfaceRules(BaseModel):
+    """Per-severity action rules for a single contract surface.
+    Each field names the ``PolicyAction`` applied when a change of that
+    severity is found on the surface. The defaults escalate with severity:
+    minor → keep, moderate → flag, major → drop.
+    """
+    on_minor: PolicyAction = "keep"
+    on_moderate: PolicyAction = "flag"
+    on_major: PolicyAction = "drop"
+class CompatibilityPolicy(BaseModel):
+    """User-configurable mapping from change severity to action per surface.
+    See ``schemas/compatibility-policy.schema.json`` for the JSON Schema.
+    """
+    kind: Literal["compatibility_policy"] = "compatibility_policy"
+    version: str = "0.1"
+    name: str = "default"
+    preset: Literal["strict", "default", "permissive", "custom"] | None = "default"
+    prompt_stack: SurfaceRules | None = None
+    model_runtime: SurfaceRules | None = None
+    tool_registry: SurfaceRules | None = None
+    skill_registry: SurfaceRules | None = None
+    workflow: SurfaceRules | None = None
+    subagents: SurfaceRules | None = None
+    output_contract: SurfaceRules | None = None
+    guardrails: SurfaceRules | None = None
+    context_config: SurfaceRules | None = None
+    environment: SurfaceRules | None = None
+    def rules_for(self, surface: str) -> SurfaceRules:
+        """Return rules for a surface, falling back to defaults if unset."""
+        v = getattr(self, surface, None)
+        return v if v is not None else SurfaceRules()
+class CompatibilityReport(BaseModel):
+    """Report summarizing the impact of a manifest change on existing data.
+    Based on a diff, provides a recommended decision and the reason codes
+    that led to that recommendation.
+    """
+    spec_version: str = SPEC_VERSION
+    kind: Literal["compatibility_report"] = "compatibility_report"
+    # Both ids are required; the classifiers in this module copy them from the diff.
+    old_manifest_id: str
+    new_manifest_id: str
+    # Four-value enum: the policy action "flag" cannot be represented here.
+    recommended_decision: Literal["keep", "repair", "replay", "drop"]
+    reason_codes: list[str] = Field(default_factory=list)
+    # As filled in by this module's classifiers: names of changed surfaces whose
+    # ``change_type`` is "breaking" / "non_breaking" respectively.
+    breaking_surfaces: list[str] = Field(default_factory=list)
+    non_breaking_surfaces: list[str] = Field(default_factory=list)
+    summary: str = ""  # human-readable explanation of the decision
+def _reason_codes_for(diff: ManifestDiff) -> list[str]:
+    """Collect reason codes from every changed surface, preserving order."""
+    out: list[str] = []
+    for change in diff.changed_surfaces:
+        for code in _SURFACE_TO_REASON_CODES.get(change.surface, []):
+            if code not in out:
+                out.append(code)
+    return out
+def _classify_with_default_rules(diff: ManifestDiff) -> CompatibilityReport:
+    """Built-in fallback classifier when no policy is supplied."""
+    breaking = [c for c in diff.changed_surfaces if c.change_type == "breaking"]
+    non_breaking = [c for c in diff.changed_surfaces if c.change_type == "non_breaking"]
+    breaking_names = [c.surface for c in breaking]
+    non_breaking_names = [c.surface for c in non_breaking]
+    reason_codes = _reason_codes_for(diff)
+    if not breaking:
+        return CompatibilityReport(
+            old_manifest_id=diff.old_manifest_id,
+            new_manifest_id=diff.new_manifest_id,
+            recommended_decision="keep",
+            reason_codes=reason_codes,
+            breaking_surfaces=breaking_names,
+            non_breaking_surfaces=non_breaking_names,
+            summary="Only non-breaking changes — data remains valid.",
+        )
+    # Output-contract-only breaking change is repairable via schema migration.
+    if breaking_names == ["output_contract"]:
+        return CompatibilityReport(
+            old_manifest_id=diff.old_manifest_id,
+            new_manifest_id=diff.new_manifest_id,
+            recommended_decision="repair",
+            reason_codes=reason_codes,
+            breaking_surfaces=breaking_names,
+            non_breaking_surfaces=non_breaking_names,
+            summary=(
+                "Output contract changed — existing data may need schema migration "
+                "but can be repaired without full replay."
+            ),
+        )
+    return CompatibilityReport(
+        old_manifest_id=diff.old_manifest_id,
+        new_manifest_id=diff.new_manifest_id,
+        recommended_decision="replay",
+        reason_codes=reason_codes,
+        breaking_surfaces=breaking_names,
+        non_breaking_surfaces=non_breaking_names,
+        summary=(
+            f"Breaking changes in {', '.join(breaking_names)} — "
+            f"existing data should be replayed against the new agent version."
+        ),
+    )
+def _classify_with_policy(diff: ManifestDiff, policy: CompatibilityPolicy) -> CompatibilityReport:
+    """Apply a user-supplied policy to derive the recommended decision.
+    For each changed surface, look up ``policy.rules_for(surface).on_<severity>``.
+    Combine per-surface actions by priority (drop > replay > flag > repair > keep).
+    Map ``flag`` to ``replay`` for the report's ``recommended_decision`` since
+    the report enum only has four values; the raw flag verdict is preserved in
+    ``summary`` for callers that want it.
+    """
+    breaking_names = [c.surface for c in diff.changed_surfaces if c.change_type == "breaking"]
+    non_breaking_names = [
+        c.surface for c in diff.changed_surfaces if c.change_type == "non_breaking"
+    ]
+    reason_codes = _reason_codes_for(diff)
+    per_surface: list[tuple[str, str]] = []  # (surface, action)
+    for change in diff.changed_surfaces:
+        rules = policy.rules_for(change.surface)
+        if change.severity == "major":
+            action = rules.on_major
+        elif change.severity == "moderate":
+            action = rules.on_moderate
+        else:
+            action = rules.on_minor
+        per_surface.append((change.surface, action))
+    if not per_surface:
+        return CompatibilityReport(
+            old_manifest_id=diff.old_manifest_id,
+            new_manifest_id=diff.new_manifest_id,
+            recommended_decision="keep",
+            summary="No changes detected — all data remains valid.",
+        )
+    worst_surface, worst_action = max(per_surface, key=lambda sa: _ACTION_PRIORITY[sa[1]])
+    # `flag` collapses to `replay` for the four-value enum (and the caller can
+    # always re-derive the original per-surface verdicts).
+    decision: Literal["keep", "repair", "replay", "drop"]
+    if worst_action == "flag":
+        decision = "replay"
+    else:
+        decision = worst_action  # type: ignore[assignment]
+    summary = (
+        f"Policy {policy.name!r}: worst surface = {worst_surface} → {worst_action}"
+        + (f" (mapped to {decision})" if worst_action == "flag" else "")
+    )
+    return CompatibilityReport(
+        old_manifest_id=diff.old_manifest_id,
+        new_manifest_id=diff.new_manifest_id,
+        recommended_decision=decision,
+        reason_codes=reason_codes,
+        breaking_surfaces=breaking_names,
+        non_breaking_surfaces=non_breaking_names,
+        summary=summary,
+    )
+def classify_compatibility(
+    diff: ManifestDiff,
+    policy: CompatibilityPolicy | None = None,
+) -> CompatibilityReport:
+    """Classify the compatibility impact of a manifest diff.
+    Args:
+        diff: A computed ``ManifestDiff``.
+        policy: Optional user-configurable policy. When supplied, per-surface
+            severity → action rules drive the decision. When omitted, the
+            built-in fallback applies:
+            - No changes → keep
+            - Only non-breaking changes → keep
+            - Breaking changes in ``output_contract`` only → repair
+            - Any other breaking changes → replay
+    Returns:
+        A ``CompatibilityReport`` with the recommended decision.
+    """
+    if not diff.changed_surfaces:
+        return CompatibilityReport(
+            old_manifest_id=diff.old_manifest_id,
+            new_manifest_id=diff.new_manifest_id,
+            recommended_decision="keep",
+            summary="No changes detected — all data remains valid.",
+        )
+    if policy is not None:
+        return _classify_with_policy(diff, policy)
+    return _classify_with_default_rules(diff)

agentversion/constants.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""Shared constants for AgentVersion.
+``SPEC_VERSION`` is the single source of truth for the spec version that every
+emitted object declares in its ``spec_version`` field. Per
+spec/versioning-policy.md, v1.0 is the floor — there is no v0.x.
+"""
+# Used as the default value of the ``spec_version`` field on emitted models.
+SPEC_VERSION = "1.0.0"

agentversion/dataset.py ADDED Viewed

@@ -0,0 +1,248 @@
+"""Dataset models for AgentVersion.
+Defines canonical schemas for trace-derived objects: tasks, episodes,
+steps, and dataset snapshots.
+See spec/reference.md §2.
+"""
+from __future__ import annotations
+from datetime import datetime
+from typing import Any, Literal
+from pydantic import BaseModel, Field
+from agentversion._shared import Message
+from agentversion.constants import SPEC_VERSION
+# --- 2.1 Task (§2.1) ---
+class TaskSource(BaseModel):
+    """Where this task came from.
+    Only ``type`` is required; the other fields default to ``None``.
+    """
+    type: str  # e.g. "production", "synthetic", "manual"
+    system: str | None = None  # presumably the originating system's name — confirm against spec
+    external_id: str | None = None  # presumably the task's id in that system — confirm against spec
+class TaskInput(BaseModel):
+    """Input for a task (supports multi-turn).
+    ``messages`` is required and holds ``Message`` objects from
+    ``agentversion._shared``.
+    """
+    messages: list[Message]
+class Task(BaseModel):
+    """A task object representing a unit of work for an agent.
+    ``task_id``, ``created_at`` and ``input`` are required; every other field
+    has a default.
+    See spec/reference.md §2.1.
+    """
+    spec_version: str = SPEC_VERSION
+    kind: Literal["task"] = "task"  # fixed discriminator value
+    task_id: str
+    source: TaskSource | None = None
+    created_at: datetime
+    input: TaskInput
+    # Free-form containers; no item schema is imposed by this model.
+    attachments: list[dict[str, Any]] = Field(default_factory=list)
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    tags: list[str] = Field(default_factory=list)
+# --- 2.2 Episode (§2.2) ---
+class EpisodeSource(BaseModel):
+    """Where this episode came from.
+    Only ``type`` is required; the other fields default to ``None``.
+    """
+    type: str  # e.g. "production_trace", "replay", "synthetic"
+    system: str | None = None  # presumably the originating system's name — confirm against spec
+    external_trace_id: str | None = None  # presumably the trace id in that system — confirm against spec
+class EpisodeResult(BaseModel):
+    """Result of an episode execution.
+    Both fields are optional and default to ``None``.
+    """
+    final_output: dict[str, Any] | None = None  # free-form mapping; no schema imposed here
+    success_label: bool | None = None  # None presumably means "unlabelled" — confirm against spec
+class EpisodeLineage(BaseModel):
+    """Lineage tracking for an episode.
+    Both fields are optional and default to ``None``.
+    """
+    parent_episode_id: str | None = None
+    # Typed as a plain ``str``, so the listed values are examples, not an enforced enum.
+    derived_from: str | None = None  # "original", "replay", "repair"
+class ObservabilityRefs(BaseModel):
+    """References to observability systems.
+    All fields are optional strings defaulting to ``None``; no format is
+    validated by this model.
+    """
+    otel_trace_id: str | None = None  # presumably an OpenTelemetry trace id — confirm
+    otel_span_id: str | None = None  # presumably an OpenTelemetry span id — confirm
+    source_url: str | None = None
+class Episode(BaseModel):
+    """An episode representing one execution attempt of a task.
+    ``episode_id``, ``task_id`` and ``status`` are required; every other field
+    has a default.
+    See spec/reference.md §2.2.
+    """
+    spec_version: str = SPEC_VERSION
+    kind: Literal["episode"] = "episode"  # fixed discriminator value
+    episode_id: str
+    task_id: str  # id of the task this episode belongs to
+    source: EpisodeSource | None = None
+    manifest_id: str | None = None
+    status: Literal["success", "failure", "error", "timeout", "cancelled"]
+    started_at: datetime | None = None
+    ended_at: datetime | None = None
+    step_ids: list[str] = Field(default_factory=list)  # steps are referenced by id, not embedded
+    result: EpisodeResult | None = None
+    lineage: EpisodeLineage | None = None
+    observability_refs: ObservabilityRefs | None = None
+# --- 2.3 Step (§2.3) ---
+class StepActor(BaseModel):
+    """Who/what performed this step.
+    ``type`` is required; ``name`` defaults to ``None``.
+    """
+    # Typed as a plain ``str``, so the listed values are examples, not an enforced enum.
+    type: str  # "agent", "tool", "user", "system"
+    name: str | None = None
+class StepInput(BaseModel):
+    """Input to a step.
+    Unlike ``TaskInput``, ``messages`` is optional here and defaults to ``None``.
+    """
+    messages: list[Message] | None = None
+class ToolCallOutput(BaseModel):
+    """A tool call output.
+    ``name`` is required; ``arguments`` defaults to an empty dict.
+    """
+    name: str  # presumably the name of the invoked tool — confirm against spec
+    arguments: dict[str, Any] = Field(default_factory=dict)  # free-form; no schema imposed here
+class StepOutput(BaseModel):
+    """Output of a step.
+    Both fields are optional and independent: the model does not require
+    exactly one of them to be set.
+    """
+    tool_call: ToolCallOutput | None = None
+    text: str | None = None
+class SchemaRefs(BaseModel):
+    """References to schemas used in this step.
+    Both hashes are optional strings defaulting to ``None``; the hash format
+    is not validated by this model.
+    """
+    tool_input_schema_hash: str | None = None
+    tool_output_schema_hash: str | None = None
+class TokenUsage(BaseModel):
+    """Token usage statistics.
+    Both counts are optional and default to ``None``; no lower bound is
+    enforced by this model.
+    """
+    input_tokens: int | None = None
+    output_tokens: int | None = None
+# Known values for ``Step.step_type``. ``Step`` declares that field as a plain
+# ``str``, so this list is documentation/reference data rather than an
+# enforced constraint.
+STEP_TYPES = [
+    "llm_call",
+    "tool_call",
+    "router_decision",
+    "subagent_handoff",
+    "validator_check",
+    "memory_read",
+    "memory_write",
+    "retrieval",
+    "system_event",
+]
+class Step(BaseModel):
+    """An atomic step within an episode.
+    ``step_id``, ``episode_id``, ``index`` and ``step_type`` are required;
+    every other field has a default.
+    See spec/reference.md §2.3.
+    """
+    spec_version: str = SPEC_VERSION
+    kind: Literal["step"] = "step"  # fixed discriminator value
+    step_id: str
+    episode_id: str  # id of the episode this step belongs to
+    index: int  # presumably the step's position within the episode — confirm base (0 or 1)
+    # NOTE: declared as ``str``; membership in STEP_TYPES is not checked by the field type.
+    step_type: str  # one of STEP_TYPES
+    started_at: datetime | None = None
+    ended_at: datetime | None = None
+    actor: StepActor | None = None
+    input: StepInput | None = None
+    output: StepOutput | None = None
+    schema_refs: SchemaRefs | None = None
+    observability_refs: ObservabilityRefs | None = None
+    metadata: dict[str, Any] = Field(default_factory=dict)
+# --- 2.4 Dataset Snapshot (§2.4) ---
+class DataClassification(BaseModel):
+    """Compliance labels on a dataset snapshot (§3n).
+    Used for filtering ("show me datasets I can ship outside the EU"),
+    retention enforcement, and consent tracking.
+    Every field has a default, so an empty ``DataClassification()`` is valid.
+    """
+    pii_state: Literal["raw", "redacted", "synthetic", "none"] = "none"
+    retention_days: int | None = Field(None, ge=1)  # when set, must be at least 1
+    residency: list[str] = Field(default_factory=list)  # e.g. ["us-east-1", "eu-west-1"]
+    redaction_policy_ref: str | None = None
+    # NOTE(review): the values resemble the GDPR lawful bases — confirm against spec.
+    consent_basis: Literal["consent", "contract", "legitimate_interest", "legal_obligation", "vital_interest", "public_task"] | None = None
+class SelectionPolicy(BaseModel):
+    """How items were selected for this snapshot.
+    This model only records the selection criteria; it contains no filtering
+    logic itself. Every field has a default.
+    """
+    source_types: list[str] = Field(default_factory=list)
+    # Plain ``str`` rather than the ``Episode.status`` Literal, so the value is not validated here.
+    required_episode_status: str | None = None
+    required_policy_compliance: bool | None = None
+    pii_states: list[Literal["raw", "redacted", "synthetic", "none"]] = Field(
+        default_factory=list,
+        description=(
+            "Filter: episodes whose data_classification.pii_state is in this list "
+            "are eligible. Empty list = no filter."
+        ),
+    )
+class ItemRef(BaseModel):
+    """Reference to a specific task/episode/step combination.
+    ``task_id`` is always required; ``episode_id`` and ``step_id`` are
+    optional and default to ``None``.
+    """
+    task_id: str
+    episode_id: str | None = None
+    step_id: str | None = None
+class SnapshotLineage(BaseModel):
+    """Lineage tracking for a dataset snapshot.
+    Both lists default to empty.
+    """
+    source_snapshot_ids: list[str] = Field(default_factory=list)
+    built_from_manifest_ids: list[str] = Field(default_factory=list)
+class DatasetSnapshot(BaseModel):
+    """A curated frozen dataset snapshot with provenance.
+    ``snapshot_id``, ``name``, ``dataset_type`` and ``created_at`` are
+    required; every other field has a default.
+    See spec/reference.md §2.4.
+    """
+    spec_version: str = SPEC_VERSION
+    kind: Literal["dataset_snapshot"] = "dataset_snapshot"  # fixed discriminator value
+    snapshot_id: str
+    name: str
+    # Typed as a plain ``str``, so the listed values are examples, not an enforced enum.
+    dataset_type: str  # e.g. "sft", "eval", "preference"
+    created_at: datetime
+    selection_policy: SelectionPolicy | None = None
+    item_refs: list[ItemRef] = Field(default_factory=list)  # items are referenced by id, not embedded
+    lineage: SnapshotLineage | None = None
+    data_classification: DataClassification | None = None