psystack 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- psystack/__init__.py +3 -0
- psystack/__main__.py +5 -0
- psystack/adapters/__init__.py +0 -0
- psystack/adapters/f1/__init__.py +0 -0
- psystack/adapters/f1/controllers.py +56 -0
- psystack/adapters/f1/degrade.py +31 -0
- psystack/adapters/f1/env.py +48 -0
- psystack/adapters/f1/factory.py +182 -0
- psystack/adapters/f1/live_viewer.py +143 -0
- psystack/adapters/f1/planner.py +39 -0
- psystack/adapters/f1/signals.py +353 -0
- psystack/adapters/f1/world_model.py +75 -0
- psystack/adapters/registry.py +35 -0
- psystack/cli/__init__.py +0 -0
- psystack/cli/app.py +21 -0
- psystack/cli/version_check.py +32 -0
- psystack/cli/wizard/__init__.py +3 -0
- psystack/cli/wizard/discovery.py +65 -0
- psystack/cli/wizard/models.py +38 -0
- psystack/cli/wizard/questions.py +174 -0
- psystack/cli/wizard/review.py +54 -0
- psystack/cli/wizard/service.py +181 -0
- psystack/core/__init__.py +0 -0
- psystack/core/config.py +77 -0
- psystack/core/contracts.py +124 -0
- psystack/core/signal_schema.py +54 -0
- psystack/evaluation/__init__.py +0 -0
- psystack/evaluation/metrics/__init__.py +22 -0
- psystack/evaluation/metrics/offtrack.py +30 -0
- psystack/evaluation/metrics/prediction_error.py +71 -0
- psystack/evaluation/metrics/progress.py +22 -0
- psystack/evaluation/metrics/reward.py +22 -0
- psystack/evaluation/metrics/survival.py +22 -0
- psystack/models/__init__.py +42 -0
- psystack/models/case.py +30 -0
- psystack/models/comparison.py +30 -0
- psystack/models/episode.py +82 -0
- psystack/models/evaluation_result.py +51 -0
- psystack/models/event.py +40 -0
- psystack/models/evidence.py +18 -0
- psystack/models/explanation.py +23 -0
- psystack/models/isolation.py +35 -0
- psystack/models/manifest.py +24 -0
- psystack/models/metric.py +14 -0
- psystack/models/project.py +25 -0
- psystack/models/run.py +50 -0
- psystack/models/signal.py +14 -0
- psystack/models/swap.py +25 -0
- psystack/pipeline/__init__.py +0 -0
- psystack/pipeline/case_io.py +22 -0
- psystack/pipeline/compare/__init__.py +4 -0
- psystack/pipeline/compare/decision.py +20 -0
- psystack/pipeline/compare/execution.py +50 -0
- psystack/pipeline/compare/service.py +95 -0
- psystack/pipeline/compare/stats.py +60 -0
- psystack/pipeline/compare_module.py +259 -0
- psystack/pipeline/context.py +194 -0
- psystack/pipeline/episodes.py +109 -0
- psystack/pipeline/event_extraction.py +253 -0
- psystack/pipeline/events/__init__.py +6 -0
- psystack/pipeline/events/config.py +41 -0
- psystack/pipeline/events/detection.py +231 -0
- psystack/pipeline/events/divergence.py +106 -0
- psystack/pipeline/isolation/__init__.py +4 -0
- psystack/pipeline/isolation/attribution.py +187 -0
- psystack/pipeline/isolation/designs.py +35 -0
- psystack/pipeline/isolation/executor.py +60 -0
- psystack/pipeline/isolation/planner.py +10 -0
- psystack/pipeline/live_update.py +59 -0
- psystack/pipeline/metrics_util.py +65 -0
- psystack/pipeline/paired_runner.py +185 -0
- psystack/pipeline/runner.py +107 -0
- psystack/pipeline/stages/__init__.py +22 -0
- psystack/pipeline/stages/attribute.py +78 -0
- psystack/pipeline/stages/base.py +18 -0
- psystack/pipeline/stages/compare.py +37 -0
- psystack/pipeline/stages/events.py +53 -0
- psystack/pipeline/stages/isolate.py +88 -0
- psystack/pipeline/stages/report.py +59 -0
- psystack/pipeline/staleness.py +33 -0
- psystack/pipeline/state.py +31 -0
- psystack/pipeline/workspace.py +177 -0
- psystack/reporting/__init__.py +0 -0
- psystack/reporting/bundle.py +74 -0
- psystack/reporting/evidence.py +28 -0
- psystack/reporting/renderers/__init__.py +0 -0
- psystack/reporting/renderers/console.py +27 -0
- psystack/reporting/renderers/html.py +28 -0
- psystack/reporting/renderers/json.py +13 -0
- psystack/reporting/templates/investigation_report.html.j2 +85 -0
- psystack/reporting/templates/report.html.j2 +99 -0
- psystack/reporting/types.py +33 -0
- psystack/tui/__init__.py +0 -0
- psystack/tui/actions.py +78 -0
- psystack/tui/app.py +1188 -0
- psystack/tui/detection.py +241 -0
- psystack/tui/screens/__init__.py +1 -0
- psystack/tui/screens/attribution.py +252 -0
- psystack/tui/screens/case_history.py +131 -0
- psystack/tui/screens/case_verdict.py +657 -0
- psystack/tui/screens/command_palette.py +70 -0
- psystack/tui/screens/drawers/__init__.py +1 -0
- psystack/tui/screens/drawers/context_drawer.py +90 -0
- psystack/tui/screens/drawers/evidence_drawer.py +113 -0
- psystack/tui/screens/error_modal.py +54 -0
- psystack/tui/screens/investigation.py +686 -0
- psystack/tui/screens/run_builder.py +492 -0
- psystack/tui/screens/workspace_picker.py +69 -0
- psystack/tui/services.py +769 -0
- psystack/tui/state.py +137 -0
- psystack/tui/styles/app.tcss +224 -0
- psystack/tui/views/__init__.py +0 -0
- psystack/tui/widgets/__init__.py +0 -0
- psystack/tui/widgets/action_bar.py +42 -0
- psystack/tui/widgets/artifact_list.py +38 -0
- psystack/tui/widgets/artifact_preview.py +34 -0
- psystack/tui/widgets/attribution_decision_card.py +55 -0
- psystack/tui/widgets/case_bar.py +108 -0
- psystack/tui/widgets/causal_sequence.py +73 -0
- psystack/tui/widgets/comparability_summary.py +48 -0
- psystack/tui/widgets/context_rail.py +69 -0
- psystack/tui/widgets/effect_table.py +32 -0
- psystack/tui/widgets/event_navigator.py +176 -0
- psystack/tui/widgets/explanation_card.py +67 -0
- psystack/tui/widgets/falsifier_list.py +73 -0
- psystack/tui/widgets/focus_signals_strip.py +22 -0
- psystack/tui/widgets/help_overlay.py +85 -0
- psystack/tui/widgets/isolation_case_detail.py +67 -0
- psystack/tui/widgets/isolation_case_table.py +50 -0
- psystack/tui/widgets/live_run_monitor.py +337 -0
- psystack/tui/widgets/metric_detail.py +93 -0
- psystack/tui/widgets/metric_table.py +71 -0
- psystack/tui/widgets/progress_summary.py +300 -0
- psystack/tui/widgets/run_config_panel.py +163 -0
- psystack/tui/widgets/run_monitor.py +91 -0
- psystack/tui/widgets/section_title.py +15 -0
- psystack/tui/widgets/signal_timeline.py +206 -0
- psystack/tui/widgets/status_badge.py +52 -0
- psystack/tui/widgets/step_inspector.py +105 -0
- psystack/tui/widgets/tier_indicator.py +44 -0
- psystack/tui/widgets/track_map.py +137 -0
- psystack/tui/widgets/transport_bar.py +152 -0
- psystack/tui/widgets/verdict_strip.py +103 -0
- psystack-0.1.0.dist-info/METADATA +42 -0
- psystack-0.1.0.dist-info/RECORD +149 -0
- psystack-0.1.0.dist-info/WHEEL +5 -0
- psystack-0.1.0.dist-info/entry_points.txt +5 -0
- psystack-0.1.0.dist-info/licenses/LICENSE +21 -0
- psystack-0.1.0.dist-info/top_level.txt +1 -0
|
from __future__ import annotations

from typing import Any


class OffTrackRateMetric:
    """Fraction of steps spent off-track."""

    def metric_id(self) -> str:
        return "offtrack_rate"

    def higher_is_better(self) -> bool:
        # Less time off-track is better.
        return False

    def compute(self, episodes: list[dict[str, Any]]) -> dict[str, Any]:
        """Return the mean and per-episode off-track fraction.

        Each episode dict must carry a "steps" list; each step's "info"
        dict may carry an "on_track" flag (missing means on-track).
        """
        per_episode = [self._episode_rate(episode) for episode in episodes]
        mean_rate = sum(per_episode) / len(per_episode) if per_episode else 0.0
        return {
            "primary_value": mean_rate,
            "unit": "fraction",
            "per_episode": per_episode,
            "breakdown": {},
        }

    def _episode_rate(self, episode: dict[str, Any]) -> float:
        # Empty episodes count as 0.0 rather than dividing by zero.
        steps = episode["steps"]
        if not steps:
            return 0.0
        off_count = sum(1 for step in steps if not step["info"].get("on_track", True))
        return off_count / len(steps)
from __future__ import annotations

from typing import Any

import numpy as np
import torch


class WorldModelPredictionError:
    """MSE between predicted z_{t+1} and encode_target(obs_{t+1}).

    This metric re-runs episodes to capture rasters (not stored in episode data).
    It uses a scripted straight-ahead policy for determinism.
    """

    def metric_id(self) -> str:
        return "prediction_error"

    def higher_is_better(self) -> bool:
        # Lower one-step latent prediction error is better.
        return False

    def compute(self, episodes: list[dict[str, Any]]) -> dict[str, Any]:
        """Return per-episode mean one-step latent MSE.

        Requires ``_world_model`` AND ``_env`` injected into the first
        episode's metadata; otherwise returns NaN with an explanatory note.
        """
        # Fix: the original checked only "_world_model" and then indexed
        # episodes[0]["_env"] unconditionally, raising KeyError instead of
        # taking the documented NaN fallback when "_env" was missing.
        if (
            not episodes
            or "_world_model" not in episodes[0]
            or "_env" not in episodes[0]
        ):
            return {
                "primary_value": float("nan"),
                "unit": "mse",
                "per_episode": [],
                "breakdown": {"note": "world_model not available for prediction_error"},
            }

        world_model = episodes[0]["_world_model"]
        env = episodes[0]["_env"]
        seed = episodes[0].get("_seed", 42)
        num_episodes = len(episodes)
        max_eval_steps = 200  # limit steps for prediction error eval

        per_episode = []
        for ep_idx in range(num_episodes):
            # Seed per episode so each rollout is reproducible in isolation.
            np.random.seed(seed + ep_idx)
            torch.manual_seed(seed + ep_idx)

            obs = env.reset(seed=seed + ep_idx)
            errors = []

            # Scripted policy: gentle throttle, no steering
            action = np.array([0.0, 0.3, 0.0], dtype=np.float32)

            for _ in range(max_eval_steps):
                z_t = world_model.encode(obs)
                z_pred = world_model.predict(z_t, action)

                # NOTE(review): assumes the legacy 4-tuple gym step API
                # (obs, reward, done, info) — confirm against the adapter env.
                obs_next, _, done, _ = env.step(action)
                z_target = world_model.encode_target(obs_next)

                mse = torch.mean((z_pred - z_target) ** 2).item()
                errors.append(mse)

                obs = obs_next
                if done:
                    break

            per_episode.append(sum(errors) / len(errors) if errors else 0.0)

        return {
            "primary_value": sum(per_episode) / len(per_episode) if per_episode else 0.0,
            "unit": "mse",
            "per_episode": per_episode,
            "breakdown": {},
        }
from __future__ import annotations

from typing import Any


class ProgressMetric:
    """Track progress at episode end (from env.get_progress())."""

    def metric_id(self) -> str:
        return "progress"

    def higher_is_better(self) -> bool:
        # More track completed is better.
        return True

    def compute(self, episodes: list[dict[str, Any]]) -> dict[str, Any]:
        """Average the final track progress across all episodes."""
        values: list[float] = []
        for episode in episodes:
            values.append(episode["final_track_progress"])
        mean_value = sum(values) / len(values) if values else 0.0
        return {
            "primary_value": mean_value,
            "unit": "fraction",
            "per_episode": values,
            "breakdown": {},
        }
from __future__ import annotations

from typing import Any


class CumulativeRewardMetric:
    """Sum of rewards over the episode."""

    def metric_id(self) -> str:
        return "cumulative_reward"

    def higher_is_better(self) -> bool:
        # Higher accumulated reward is better.
        return True

    def compute(self, episodes: list[dict[str, Any]]) -> dict[str, Any]:
        """Report each episode's total reward and their mean."""
        totals = [record["total_reward"] for record in episodes]
        count = len(totals)
        mean_total = sum(totals) / count if count else 0.0
        return {
            "primary_value": mean_total,
            "unit": "reward",
            "per_episode": totals,
            "breakdown": {},
        }
from __future__ import annotations

from typing import Any


class SurvivalStepsMetric:
    """Number of steps before episode termination."""

    def metric_id(self) -> str:
        return "survival_steps"

    def higher_is_better(self) -> bool:
        # Surviving longer is better.
        return True

    def compute(self, episodes: list[dict[str, Any]]) -> dict[str, Any]:
        """Report step counts per episode (as floats) and their mean."""
        step_counts: list[float] = []
        for record in episodes:
            # Cast to float so the per-episode list is uniformly numeric.
            step_counts.append(float(record["total_steps"]))
        if step_counts:
            mean_steps = sum(step_counts) / len(step_counts)
        else:
            mean_steps = 0.0
        return {
            "primary_value": mean_steps,
            "unit": "steps",
            "per_episode": step_counts,
            "breakdown": {},
        }
"""Public exports for psystack.models."""

from .case import Case
from .comparison import ComparisonReport, MetricComparison
from .event import Event, EventType
from .evidence import EvidencePack
from .explanation import Explanation
from .isolation import (
    AttributionTable,
    EffectEstimate,
    IsolationCase,
    IsolationPlan,
    IsolationResultBundle,
)
from .manifest import RunManifest
from .metric import MetricResult
from .project import DiscoveredAssets, Project
from .run import Run
from .signal import SignalValue
from .swap import SwapFactors, SwapTestResult, SwapTestSpec

# Kept strictly alphabetical. Fix: "SwapTestSpec" previously appeared
# before "SwapTestResult", breaking the otherwise-sorted order.
__all__ = [
    "AttributionTable",
    "Case",
    "ComparisonReport",
    "DiscoveredAssets",
    "EffectEstimate",
    "Event",
    "EventType",
    "EvidencePack",
    "Explanation",
    "IsolationCase",
    "IsolationPlan",
    "IsolationResultBundle",
    "MetricComparison",
    "MetricResult",
    "Project",
    "Run",
    "RunManifest",
    "SignalValue",
    "SwapFactors",
    "SwapTestResult",
    "SwapTestSpec",
]
"""Case model — a complete comparison investigation."""

from __future__ import annotations

from typing import Any

from pydantic import BaseModel, ConfigDict, Field

from psystack.models.event import Event
from psystack.models.explanation import Explanation
from psystack.models.run import Run


class Case(BaseModel):
    """A fully specified comparison case between two runs.

    Holds the run definitions plus any events and explanations
    accumulated by the pipeline for this investigation.
    """

    # Tolerate unknown keys when loading older/newer persisted cases.
    model_config = ConfigDict(extra="ignore")

    id: str
    schema_version: int = 1  # bump on breaking changes to this model
    project_id: str = ""
    track_ref: str = ""
    episode_count: int = 5
    # None means "let the runner pick seeds" — NOTE(review): confirm
    # against the episode runner.
    eval_seeds: list[int] | None = None
    alignment_method: str = "progress"
    shared_env_overrides: dict[str, Any] = Field(default_factory=dict)
    run_a: Run
    # run_b is optional: a single-run case has no comparison side.
    run_b: Run | None = None
    events: list[Event] = Field(default_factory=list)
    explanations: list[Explanation] = Field(default_factory=list)
"""Comparison models — per-metric and whole-report A/B results."""

from __future__ import annotations

from typing import Literal

from pydantic import BaseModel, Field

# Per-metric verdict for candidate vs. baseline.
ComparisonStatus = Literal["regression", "improvement", "no_change"]


class MetricComparison(BaseModel):
    """Baseline-vs-candidate comparison for a single metric."""

    metric_id: str
    baseline_value: float
    candidate_value: float
    delta: float
    # NOTE(review): presumably delta re-signed so that positive means
    # "worse" regardless of higher_is_better — confirm with the compare
    # pipeline before relying on the sign.
    delta_badness: float
    higher_is_better: bool
    baseline_per_episode: list[float] = Field(default_factory=list)
    candidate_per_episode: list[float] = Field(default_factory=list)
    # Optional statistics; None when the stats stage did not run.
    ci_low: float | None = None
    ci_high: float | None = None
    p_value: float | None = None
    # p-value after multiple-comparison adjustment, if computed.
    p_value_adj: float | None = None
    significant: bool = False
    status: ComparisonStatus


class ComparisonReport(BaseModel):
    """Comparison of all metrics between two run directories."""

    baseline_run_dir: str
    candidate_run_dir: str
    metrics: list[MetricComparison]
"""Episode outcome models for Phase 2 — run-level outcomes and verdict summary."""

from __future__ import annotations

from typing import Literal

from pydantic import BaseModel, ConfigDict, Field


# ── Display name mapping ─────────────────────────────────────────────────────

# Maps internal metric ids to human-readable labels for UI rendering.
METRIC_DISPLAY_NAMES: dict[str, str] = {
    "final_track_progress": "completion",
    "off_track_rate": "off-track rate",
    "total_reward": "total reward",
    "fastest_lap_time": "fastest lap",
}


# ── EpisodeRecord ────────────────────────────────────────────────────────────

class EpisodeRecord(BaseModel):
    """Per-episode outcome record from a single run side (a or b).

    Fields map directly to values in runs/{side}/episodes.json.
    """

    # Ignore unknown keys so schema additions don't break older readers.
    model_config = ConfigDict(extra="ignore")

    episode_idx: int
    final_track_progress: float = 0.0
    total_reward: float = 0.0
    # Reason the episode ended; None when not recorded.
    termination: str | None = None
    # None when no lap was completed.
    fastest_lap_time: float | None = None
    lap_count: int = 0
    completed: bool = False


# ── EpisodeOutcome ───────────────────────────────────────────────────────────

class EpisodeOutcome(BaseModel):
    """Aggregate outcome for one side (a or b) across all episodes."""

    model_config = ConfigDict(extra="ignore")

    side: str
    episodes: list[EpisodeRecord]
    mean_progress: float
    completion_rate: float
    mean_reward: float
    off_track_rate: float
    # None when no episode produced a completed lap.
    fastest_lap: float | None


# ── OutcomeSummary ───────────────────────────────────────────────────────────

class OutcomeSummary(BaseModel):
    """Persisted summary of A vs B comparison outcomes.

    Written to analysis/outcomes.json after run_analysis() completes.
    All display strings are pre-computed so the TUI only loads and renders.
    """

    model_config = ConfigDict(extra="ignore")

    verdict: Literal["regression", "improvement", "no_change", "mixed"]
    primary_metric: str
    primary_metric_display: str
    baseline_value: float
    candidate_value: float
    delta_pct: float
    significant: bool
    # Counts of per-metric verdicts feeding the overall verdict.
    regression_count: int
    improvement_count: int
    no_change_count: int
    # Pre-rendered display strings consumed verbatim by the TUI.
    verdict_headline: str
    primary_metric_line: str
    findings_count_line: str
    top_run: dict[str, str] | None = None  # e.g., {"side": "b", "episode_id": "ep_0002"}
    recommended_run_ids: list[str] = Field(default_factory=list)
    episodes_a: list[EpisodeRecord] = Field(default_factory=list)
    episodes_b: list[EpisodeRecord] = Field(default_factory=list)
"""EvaluationResult -- typed immutable snapshot of a completed evaluation."""

from __future__ import annotations

from datetime import datetime, timezone
from typing import Any

from pydantic import BaseModel, ConfigDict, Field


class ConfigSnapshot(BaseModel):
    """Frozen copy of the case definition and adapter info at time of evaluation."""

    model_config = ConfigDict(extra="ignore")

    case_id: str
    track_ref: str = ""
    episode_count: int = 0
    # None means seeds were not pinned for this evaluation.
    eval_seeds: list[int] | None = None
    run_a_world_model_ref: str = ""
    run_b_world_model_ref: str = ""
    run_a_planner_ref: str = ""
    run_b_planner_ref: str = ""
    adapter_name: str = ""


class EvaluationResult(BaseModel):
    """Immutable snapshot of a completed evaluation.

    Saved to analysis/result.json after run_analysis() completes.
    Bundles episode records, outcome summary, events data, and a
    config snapshot of the case definition at time of evaluation.
    """

    model_config = ConfigDict(extra="ignore")

    schema_version: int = 1
    # UTC, timezone-aware ISO-8601 creation timestamp.
    created_at: str = Field(default_factory=lambda: datetime.now(tz=timezone.utc).isoformat())

    # Config snapshot -- frozen copy of case definition at eval time
    config: ConfigSnapshot

    # Episode data per side
    episodes_a: list[dict[str, Any]] = Field(default_factory=list)
    episodes_b: list[dict[str, Any]] = Field(default_factory=list)

    # Outcome summary (the verdict + metrics)
    outcomes: dict[str, Any] = Field(default_factory=dict)

    # Events data (optional -- event extraction can fail)
    events: dict[str, Any] | None = None
"""Event model — a detected divergence point between two runs."""

from __future__ import annotations

from typing import Any, Literal

from pydantic import BaseModel, Field

from psystack.models.signal import SignalValue

# Closed set of event kinds the detection pipeline can emit.
EventType = Literal[
    "first_signal_divergence",
    "first_action_divergence",
    "first_risk_spike",
    "first_boundary_collapse",
    "terminal",
    "max_metric_gap",
    # Phase 3 event types
    "first_divergence",
    "divergence_window",
    "risk_spike",
    "off_track_terminal",
    "max_gap",
]


class Event(BaseModel):
    """A detected divergence event at a specific step."""

    id: str
    type: EventType
    step: int
    # Wall/sim time of the event; None when only step indices are known.
    time_s: float | None = None
    severity: Literal["info", "warning", "critical"] = "warning"
    score: float = 0.0
    # Number of consecutive steps the condition persisted —
    # NOTE(review): inferred from the name; confirm with the detector.
    persistence_k: int = 1
    active_signals: list[SignalValue] = Field(default_factory=list)
    # (start, end) step window around the event, when available.
    local_window: tuple[int, int] | None = None
    evidence_refs: list[str] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)
"""EvidencePack — bundled artifacts backing an investigation's conclusions."""

from __future__ import annotations

from pydantic import BaseModel, Field

from .comparison import ComparisonReport
from .isolation import AttributionTable, IsolationResultBundle
from .manifest import RunManifest


class EvidencePack(BaseModel):
    """Bundle of manifests, comparison, and isolation evidence for reporting."""

    pack_id: str
    # Creation timestamp — presumably ISO-8601; confirm with the producer.
    created_at: str
    summary: str
    baseline_manifest: RunManifest
    candidate_manifest: RunManifest
    compare: ComparisonReport
    # Isolation evidence is optional: the isolate stage may be skipped.
    isolation: IsolationResultBundle | None = None
    attributions: list[AttributionTable] = Field(default_factory=list)
"""Explanation model — a tier-aware attribution hypothesis for an event."""

from __future__ import annotations

from typing import Literal

from pydantic import BaseModel, Field

from psystack.models.signal import SignalValue


class Explanation(BaseModel):
    """An evidence-backed explanation for a detected event."""

    id: str
    # The Event this explanation attaches to.
    event_id: str
    label: str
    confidence: float = 0.0
    # Evidence tier; tier_0 is the default (weakest) tier —
    # NOTE(review): confirm tier ordering with the attribution stage.
    tier: Literal["tier_0", "tier_1", "tier_2", "tier_3"] = "tier_0"
    support_basis: list[str] = Field(default_factory=list)
    # Alternative hypotheses that were considered.
    competing: list[str] = Field(default_factory=list)
    supporting_signals: list[SignalValue] = Field(default_factory=list)
    # Observations that would falsify this explanation.
    falsifiers: list[str] = Field(default_factory=list)
"""Isolation models — swap-test plans, results, and effect attributions."""

from __future__ import annotations

from pydantic import BaseModel, Field

from psystack.models.swap import SwapFactors, SwapTestResult


class IsolationCase(BaseModel):
    """A single planned swap test and the factors it toggles."""

    test_id: str
    factors: SwapFactors


class IsolationPlan(BaseModel):
    """An ordered set of isolation cases produced by the planner."""

    design: str = "screening_v1"
    cases: list[IsolationCase] = Field(default_factory=list)


class IsolationResultBundle(BaseModel):
    """Executed isolation plan together with its swap-test results."""

    design: str
    cases: list[IsolationCase] = Field(default_factory=list)
    swap_results: list[SwapTestResult] = Field(default_factory=list)


class EffectEstimate(BaseModel):
    """Estimated effect of one factor (or interaction) on a metric."""

    factor: str
    effect: float
    # None when no confidence could be computed for this estimate.
    confidence: float | None = None
    # IDs of the swap tests supporting this estimate.
    support_tests: list[str] = Field(default_factory=list)


class AttributionTable(BaseModel):
    """Per-metric attribution of effects to factors, plus a decision string."""

    metric_id: str
    main_effects: list[EffectEstimate] = Field(default_factory=list)
    interaction_effects: list[EffectEstimate] = Field(default_factory=list)
    decision: str
"""RunManifest — legacy on-disk run description, convertible to Run."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any

from pydantic import BaseModel

# Annotation-only import: avoids a circular import with psystack.models.run.
if TYPE_CHECKING:
    from psystack.models.run import Run


class RunManifest(BaseModel):
    """Legacy manifest describing one run's configuration."""

    run_id: str
    description: str = ""
    # Path/reference to the world-model weights used by this run.
    world_model_weights: str
    planner_config: dict[str, Any]
    env_config: dict[str, Any]
    num_episodes: int = 20
    seed: int = 42

    def to_run(self) -> Run:
        """Convert to a Run domain model."""
        # Local import mirrors the TYPE_CHECKING guard above (cycle avoidance).
        from psystack.models.run import Run

        return Run.from_manifest(self)
"""MetricResult — the computed value of a single metric for one run."""

from __future__ import annotations

from typing import Any

from pydantic import BaseModel, Field


class MetricResult(BaseModel):
    """Computed value of one metric, with per-episode detail."""

    metric_id: str
    value: float
    unit: str | None = None
    higher_is_better: bool
    # Consistency fix: use Field(default_factory=...) for mutable defaults,
    # matching every other model in this package (pydantic copies plain
    # mutable defaults per instance, but the explicit factory keeps the
    # codebase convention uniform).
    per_episode: list[float] = Field(default_factory=list)
    breakdown: dict[str, Any] = Field(default_factory=dict)
"""Project model — represents a discovered adapter-backed repo."""

from __future__ import annotations

from pathlib import Path
from typing import Any

from pydantic import BaseModel, Field


class DiscoveredAssets(BaseModel):
    """Assets discovered by adapter's detect_project."""

    # Each entry describes one weights file — schema is adapter-defined.
    weights: list[dict[str, Any]] = Field(default_factory=list)
    envs: list[str] = Field(default_factory=list)
    scenarios: list[str] = Field(default_factory=list)
    configs: list[str] = Field(default_factory=list)


class Project(BaseModel):
    """A PsyStack project rooted at a repo with a bound adapter."""

    project_root: Path
    # Name of the adapter bound to this project (see adapters registry).
    adapter_name: str
    discovered_assets: DiscoveredAssets = Field(default_factory=DiscoveredAssets)
"""Run model — a single execution configuration."""

from __future__ import annotations

from typing import Any

from pydantic import BaseModel, ConfigDict, Field

from psystack.models.manifest import RunManifest


class Run(BaseModel):
    """A fully specified run configuration."""

    # populate_by_name lets callers supply either "num_episodes" or the
    # legacy "horizon" alias; unknown keys are ignored on load.
    model_config = ConfigDict(extra="ignore", populate_by_name=True)

    id: str

    # Component references
    world_model_ref: str = ""
    planner_ref: str = ""
    seed: int = 42
    # Serialized under the legacy alias "horizon" for compatibility.
    num_episodes: int = Field(default=20, alias="horizon")

    # Full configs (hydrated from refs)
    planner_config: dict[str, Any] = Field(default_factory=dict)
    env_config: dict[str, Any] = Field(default_factory=dict)

    @classmethod
    def from_manifest(cls, manifest: RunManifest) -> Run:
        """Create a Run from a legacy RunManifest.

        Note: planner_ref stays "" — the manifest carries only the
        hydrated planner_config, not a reference.
        """
        return cls(
            id=manifest.run_id,
            world_model_ref=manifest.world_model_weights,
            planner_config=manifest.planner_config,
            env_config=manifest.env_config,
            seed=manifest.seed,
            num_episodes=manifest.num_episodes,
        )

    def to_manifest(self) -> RunManifest:
        """Convert back to a RunManifest for pipeline compatibility.

        Inverse of from_manifest; description defaults to "" on the
        manifest side since Run does not carry one.
        """
        return RunManifest(
            run_id=self.id,
            world_model_weights=self.world_model_ref,
            planner_config=self.planner_config,
            env_config=self.env_config,
            num_episodes=self.num_episodes,
            seed=self.seed,
        )
"""Signal value model for event detection and display."""

from __future__ import annotations

from pydantic import BaseModel


class SignalValue(BaseModel):
    """A single named signal measurement at a point in time."""

    name: str
    value: float
    # Display unit; None when the signal is dimensionless/unknown.
    unit: str | None = None
    # Python format spec applied when rendering `value` in UIs.
    display_format: str = ".3f"