psystack 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- psystack/__init__.py +3 -0
- psystack/__main__.py +5 -0
- psystack/adapters/__init__.py +0 -0
- psystack/adapters/f1/__init__.py +0 -0
- psystack/adapters/f1/controllers.py +56 -0
- psystack/adapters/f1/degrade.py +31 -0
- psystack/adapters/f1/env.py +48 -0
- psystack/adapters/f1/factory.py +182 -0
- psystack/adapters/f1/live_viewer.py +143 -0
- psystack/adapters/f1/planner.py +39 -0
- psystack/adapters/f1/signals.py +353 -0
- psystack/adapters/f1/world_model.py +75 -0
- psystack/adapters/registry.py +35 -0
- psystack/cli/__init__.py +0 -0
- psystack/cli/app.py +21 -0
- psystack/cli/version_check.py +32 -0
- psystack/cli/wizard/__init__.py +3 -0
- psystack/cli/wizard/discovery.py +65 -0
- psystack/cli/wizard/models.py +38 -0
- psystack/cli/wizard/questions.py +174 -0
- psystack/cli/wizard/review.py +54 -0
- psystack/cli/wizard/service.py +181 -0
- psystack/core/__init__.py +0 -0
- psystack/core/config.py +77 -0
- psystack/core/contracts.py +124 -0
- psystack/core/signal_schema.py +54 -0
- psystack/evaluation/__init__.py +0 -0
- psystack/evaluation/metrics/__init__.py +22 -0
- psystack/evaluation/metrics/offtrack.py +30 -0
- psystack/evaluation/metrics/prediction_error.py +71 -0
- psystack/evaluation/metrics/progress.py +22 -0
- psystack/evaluation/metrics/reward.py +22 -0
- psystack/evaluation/metrics/survival.py +22 -0
- psystack/models/__init__.py +42 -0
- psystack/models/case.py +30 -0
- psystack/models/comparison.py +30 -0
- psystack/models/episode.py +82 -0
- psystack/models/evaluation_result.py +51 -0
- psystack/models/event.py +40 -0
- psystack/models/evidence.py +18 -0
- psystack/models/explanation.py +23 -0
- psystack/models/isolation.py +35 -0
- psystack/models/manifest.py +24 -0
- psystack/models/metric.py +14 -0
- psystack/models/project.py +25 -0
- psystack/models/run.py +50 -0
- psystack/models/signal.py +14 -0
- psystack/models/swap.py +25 -0
- psystack/pipeline/__init__.py +0 -0
- psystack/pipeline/case_io.py +22 -0
- psystack/pipeline/compare/__init__.py +4 -0
- psystack/pipeline/compare/decision.py +20 -0
- psystack/pipeline/compare/execution.py +50 -0
- psystack/pipeline/compare/service.py +95 -0
- psystack/pipeline/compare/stats.py +60 -0
- psystack/pipeline/compare_module.py +259 -0
- psystack/pipeline/context.py +194 -0
- psystack/pipeline/episodes.py +109 -0
- psystack/pipeline/event_extraction.py +253 -0
- psystack/pipeline/events/__init__.py +6 -0
- psystack/pipeline/events/config.py +41 -0
- psystack/pipeline/events/detection.py +231 -0
- psystack/pipeline/events/divergence.py +106 -0
- psystack/pipeline/isolation/__init__.py +4 -0
- psystack/pipeline/isolation/attribution.py +187 -0
- psystack/pipeline/isolation/designs.py +35 -0
- psystack/pipeline/isolation/executor.py +60 -0
- psystack/pipeline/isolation/planner.py +10 -0
- psystack/pipeline/live_update.py +59 -0
- psystack/pipeline/metrics_util.py +65 -0
- psystack/pipeline/paired_runner.py +185 -0
- psystack/pipeline/runner.py +107 -0
- psystack/pipeline/stages/__init__.py +22 -0
- psystack/pipeline/stages/attribute.py +78 -0
- psystack/pipeline/stages/base.py +18 -0
- psystack/pipeline/stages/compare.py +37 -0
- psystack/pipeline/stages/events.py +53 -0
- psystack/pipeline/stages/isolate.py +88 -0
- psystack/pipeline/stages/report.py +59 -0
- psystack/pipeline/staleness.py +33 -0
- psystack/pipeline/state.py +31 -0
- psystack/pipeline/workspace.py +177 -0
- psystack/reporting/__init__.py +0 -0
- psystack/reporting/bundle.py +74 -0
- psystack/reporting/evidence.py +28 -0
- psystack/reporting/renderers/__init__.py +0 -0
- psystack/reporting/renderers/console.py +27 -0
- psystack/reporting/renderers/html.py +28 -0
- psystack/reporting/renderers/json.py +13 -0
- psystack/reporting/templates/investigation_report.html.j2 +85 -0
- psystack/reporting/templates/report.html.j2 +99 -0
- psystack/reporting/types.py +33 -0
- psystack/tui/__init__.py +0 -0
- psystack/tui/actions.py +78 -0
- psystack/tui/app.py +1188 -0
- psystack/tui/detection.py +241 -0
- psystack/tui/screens/__init__.py +1 -0
- psystack/tui/screens/attribution.py +252 -0
- psystack/tui/screens/case_history.py +131 -0
- psystack/tui/screens/case_verdict.py +657 -0
- psystack/tui/screens/command_palette.py +70 -0
- psystack/tui/screens/drawers/__init__.py +1 -0
- psystack/tui/screens/drawers/context_drawer.py +90 -0
- psystack/tui/screens/drawers/evidence_drawer.py +113 -0
- psystack/tui/screens/error_modal.py +54 -0
- psystack/tui/screens/investigation.py +686 -0
- psystack/tui/screens/run_builder.py +492 -0
- psystack/tui/screens/workspace_picker.py +69 -0
- psystack/tui/services.py +769 -0
- psystack/tui/state.py +137 -0
- psystack/tui/styles/app.tcss +224 -0
- psystack/tui/views/__init__.py +0 -0
- psystack/tui/widgets/__init__.py +0 -0
- psystack/tui/widgets/action_bar.py +42 -0
- psystack/tui/widgets/artifact_list.py +38 -0
- psystack/tui/widgets/artifact_preview.py +34 -0
- psystack/tui/widgets/attribution_decision_card.py +55 -0
- psystack/tui/widgets/case_bar.py +108 -0
- psystack/tui/widgets/causal_sequence.py +73 -0
- psystack/tui/widgets/comparability_summary.py +48 -0
- psystack/tui/widgets/context_rail.py +69 -0
- psystack/tui/widgets/effect_table.py +32 -0
- psystack/tui/widgets/event_navigator.py +176 -0
- psystack/tui/widgets/explanation_card.py +67 -0
- psystack/tui/widgets/falsifier_list.py +73 -0
- psystack/tui/widgets/focus_signals_strip.py +22 -0
- psystack/tui/widgets/help_overlay.py +85 -0
- psystack/tui/widgets/isolation_case_detail.py +67 -0
- psystack/tui/widgets/isolation_case_table.py +50 -0
- psystack/tui/widgets/live_run_monitor.py +337 -0
- psystack/tui/widgets/metric_detail.py +93 -0
- psystack/tui/widgets/metric_table.py +71 -0
- psystack/tui/widgets/progress_summary.py +300 -0
- psystack/tui/widgets/run_config_panel.py +163 -0
- psystack/tui/widgets/run_monitor.py +91 -0
- psystack/tui/widgets/section_title.py +15 -0
- psystack/tui/widgets/signal_timeline.py +206 -0
- psystack/tui/widgets/status_badge.py +52 -0
- psystack/tui/widgets/step_inspector.py +105 -0
- psystack/tui/widgets/tier_indicator.py +44 -0
- psystack/tui/widgets/track_map.py +137 -0
- psystack/tui/widgets/transport_bar.py +152 -0
- psystack/tui/widgets/verdict_strip.py +103 -0
- psystack-0.1.0.dist-info/METADATA +42 -0
- psystack-0.1.0.dist-info/RECORD +149 -0
- psystack-0.1.0.dist-info/WHEEL +5 -0
- psystack-0.1.0.dist-info/entry_points.txt +5 -0
- psystack-0.1.0.dist-info/licenses/LICENSE +21 -0
- psystack-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Shared metric computation logic used by both compare and isolation pipelines."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from psystack.core.contracts import AdapterFactory
|
|
9
|
+
from psystack.models import MetricResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def compute_and_filter_metrics(
    episodes: list[dict[str, Any]],
    factory: AdapterFactory,
    world_model: Any = None,
    env: Any = None,
    seed: int = 42,
    skip_metrics: set[str] | None = None,
) -> list[MetricResult]:
    """Compute all metrics on episodes, filtering NaN values and optionally skipping metrics.

    Args:
        episodes: Episode data dicts. Temporarily mutated: private
            ``_world_model``, ``_env`` and ``_seed`` keys are injected for the
            prediction_error metric and always removed before returning.
        factory: Adapter factory providing metric plugins.
        world_model: Optional world model for prediction_error metric.
        env: Optional env for prediction_error metric.
        seed: Random seed.
        skip_metrics: Set of metric IDs to skip (e.g. {"prediction_error"}).

    Returns:
        List of MetricResult with NaN values excluded.
    """
    skip = skip_metrics or set()

    # Inject world_model/env for prediction_error metric
    if world_model is not None and env is not None:
        for ep in episodes:
            ep["_world_model"] = world_model
            ep["_env"] = env
            ep["_seed"] = seed

    results: list[MetricResult] = []
    try:
        for metric in factory.get_metrics():
            if metric.metric_id() in skip:
                continue
            raw = metric.compute(episodes)
            val = raw["primary_value"]
            # NaN means "metric not applicable for this data" — drop it.
            if math.isnan(val):
                continue
            results.append(MetricResult(
                metric_id=metric.metric_id(),
                value=val,
                unit=raw.get("unit"),
                higher_is_better=metric.higher_is_better(),
                per_episode=raw.get("per_episode", []),
                breakdown=raw.get("breakdown", {}),
            ))
    finally:
        # Always strip the injected refs, even if a metric raised — otherwise
        # the private keys leak into episode dicts that callers may serialize.
        for ep in episodes:
            ep.pop("_world_model", None)
            ep.pop("_env", None)
            ep.pop("_seed", None)

    return results
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""Paired episode runner — lockstep A/B execution with pair-aware telemetry."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import threading
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import torch
|
|
11
|
+
|
|
12
|
+
from psystack.core.contracts import EnvPlugin, PlannerPlugin
|
|
13
|
+
from psystack.pipeline.episodes import _serialize_info
|
|
14
|
+
from psystack.pipeline.live_update import LivePairFrame, LiveStepUpdate
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class EvalCancelled(Exception):
    """Raised when the user cancels a running evaluation (signalled via the
    ``cancel_event`` passed to :func:`run_paired_episodes`)."""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def run_paired_episodes(
    env_a: EnvPlugin,
    env_b: EnvPlugin,
    planner_a: PlannerPlugin,
    planner_b: PlannerPlugin,
    num_episodes: int,
    seed: int = 42,
    pair_callback: Callable[[LivePairFrame], None] | None = None,
    max_steps: int = 0,
    cancel_event: threading.Event | None = None,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Run N episode pairs in lockstep.

    Both envs are stepped once per tick. One LivePairFrame emitted per tick.
    If one side finishes early, its update freezes.

    Args:
        env_a: Environment for side A.
        env_b: Environment for side B.
        planner_a: Planner driving side A.
        planner_b: Planner driving side B.
        num_episodes: Number of episode pairs to run.
        seed: Base seed; episode i uses ``seed + i`` on both sides so the
            pair shares identical randomness.
        pair_callback: Optional per-tick callback receiving a LivePairFrame.
        max_steps: Forwarded to each LivePairFrame for display purposes only;
            this function does not enforce it as a step limit.
        cancel_event: Optional event checked before each side's step; when
            set, EvalCancelled is raised.

    Raises:
        EvalCancelled: If ``cancel_event`` is set mid-run.

    Returns (episodes_a, episodes_b) in the same format as run_episodes().
    """
    all_episodes_a: list[dict[str, Any]] = []
    all_episodes_b: list[dict[str, Any]] = []

    for ep_idx in range(num_episodes):
        ep_seed = seed + ep_idx

        # Deterministic seeding — global RNGs are reseeded per episode so
        # both sides of the pair draw the same random streams.
        np.random.seed(ep_seed)
        torch.manual_seed(ep_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(ep_seed)

        planner_a.reset()
        planner_b.reset()
        obs_a = env_a.reset(seed=ep_seed)
        obs_b = env_b.reset(seed=ep_seed)

        steps_a: list[dict[str, Any]] = []
        steps_b: list[dict[str, Any]] = []
        total_reward_a = 0.0
        total_reward_b = 0.0
        done_a = False
        done_b = False
        # Default termination reason if the env never reports one.
        termination_a = "max_steps"
        termination_b = "max_steps"
        info_a: dict[str, Any] = {}
        info_b: dict[str, Any] = {}

        # Frozen terminal state for the side that finishes first
        last_update_a: LiveStepUpdate | None = None
        last_update_b: LiveStepUpdate | None = None
        # Track last known progress — env may reset to 0 after done=True
        last_progress_a = 0.0
        last_progress_b = 0.0

        tick = 0
        while not (done_a and done_b):
            if cancel_event is not None and cancel_event.is_set():
                raise EvalCancelled("Evaluation cancelled by user")
            # Step A — skipped once side A has finished, freezing last_update_a.
            if not done_a:
                car_state_a = env_a.get_car_state()
                progress_a = env_a.get_progress()
                action_a = planner_a.act(obs_a, car_state=car_state_a)
                obs_next_a, reward_a, done_a, info_a = env_a.step(action_a)
                last_progress_a = float(progress_a)

                # Actions may be numpy/torch arrays; normalize to a plain list.
                action_list_a = action_a.tolist() if hasattr(action_a, "tolist") else list(action_a)
                steps_a.append({
                    "obs": {"aux": obs_a["aux"].tolist() if hasattr(obs_a["aux"], "tolist") else obs_a["aux"]},
                    "action": action_list_a,
                    "reward": float(reward_a),
                    "done": done_a,
                    "info": _serialize_info(info_a),
                    "car_state": car_state_a,
                    "track_progress": float(progress_a),
                })
                total_reward_a += reward_a
                obs_a = obs_next_a

                if done_a and "termination" in info_a:
                    termination_a = info_a["termination"]

                last_update_a = LiveStepUpdate(
                    run_id="a", episode_idx=ep_idx, episode_total=num_episodes,
                    step=tick, progress=float(progress_a), reward=float(reward_a),
                    done=done_a,
                    termination=info_a.get("termination") if done_a else None,
                    state=car_state_a, action=action_list_a, info=_serialize_info(info_a),
                )

            # Re-check cancellation between the two sides so a cancel takes
            # effect before stepping B.
            if cancel_event is not None and cancel_event.is_set():
                raise EvalCancelled("Evaluation cancelled by user")
            # Step B — mirrors the A branch above.
            if not done_b:
                car_state_b = env_b.get_car_state()
                progress_b = env_b.get_progress()
                action_b = planner_b.act(obs_b, car_state=car_state_b)
                obs_next_b, reward_b, done_b, info_b = env_b.step(action_b)
                last_progress_b = float(progress_b)

                action_list_b = action_b.tolist() if hasattr(action_b, "tolist") else list(action_b)
                steps_b.append({
                    "obs": {"aux": obs_b["aux"].tolist() if hasattr(obs_b["aux"], "tolist") else obs_b["aux"]},
                    "action": action_list_b,
                    "reward": float(reward_b),
                    "done": done_b,
                    "info": _serialize_info(info_b),
                    "car_state": car_state_b,
                    "track_progress": float(progress_b),
                })
                total_reward_b += reward_b
                obs_b = obs_next_b

                if done_b and "termination" in info_b:
                    termination_b = info_b["termination"]

                last_update_b = LiveStepUpdate(
                    run_id="b", episode_idx=ep_idx, episode_total=num_episodes,
                    step=tick, progress=float(progress_b), reward=float(reward_b),
                    done=done_b,
                    termination=info_b.get("termination") if done_b else None,
                    state=car_state_b, action=action_list_b, info=_serialize_info(info_b),
                )

            # Emit pair frame — carries the frozen update for any finished side.
            if pair_callback is not None:
                frame = LivePairFrame(
                    episode_idx=ep_idx,
                    episode_total=num_episodes,
                    tick=tick,
                    a=last_update_a,
                    b=last_update_b,
                    both_done=done_a and done_b,
                    max_steps=max_steps,
                )
                pair_callback(frame)

            tick += 1

        # Build episode records — use last tracked progress, not env.get_progress()
        # which may return 0 if the env auto-resets on done=True
        final_progress_a = last_progress_a
        final_progress_b = last_progress_b

        all_episodes_a.append({
            "episode_id": f"ep_{ep_idx:04d}",
            "steps": steps_a,
            "total_steps": len(steps_a),
            "final_track_progress": float(final_progress_a),
            "total_reward": float(total_reward_a),
            "termination": termination_a,
            "fastest_lap_time": info_a.get("fastest_lap_time"),
            "lap_count": info_a.get("lap_count", 0),
        })
        all_episodes_b.append({
            "episode_id": f"ep_{ep_idx:04d}",
            "steps": steps_b,
            "total_steps": len(steps_b),
            "final_track_progress": float(final_progress_b),
            "total_reward": float(total_reward_b),
            "termination": termination_b,
            "fastest_lap_time": info_b.get("fastest_lap_time"),
            "lap_count": info_b.get("lap_count", 0),
        })

    return all_episodes_a, all_episodes_b
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Stage runner — orchestrates pipeline stages with progress display."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import contextlib
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from typing import Protocol, runtime_checkable
|
|
8
|
+
|
|
9
|
+
from psystack.pipeline.context import RunContext
|
|
10
|
+
from psystack.pipeline.stages.base import Stage
|
|
11
|
+
from psystack.pipeline.state import StageResult
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@runtime_checkable
class StageObserver(Protocol):
    """Callback interface for observing pipeline stage lifecycle events.

    Implementations receive one call per stage outcome from run_stages();
    a no-op implementation (_NullObserver) is used when no observer is given.
    """

    def on_stage_start(self, name: str) -> None: ...
    def on_stage_complete(self, name: str, result: StageResult) -> None: ...
    def on_stage_fail(self, name: str, error: str) -> None: ...
    def on_stage_skip(self, name: str, reason: str) -> None: ...
    def on_stage_reuse(self, name: str) -> None: ...
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class _NullObserver:
|
|
24
|
+
"""Default no-op observer for CLI usage."""
|
|
25
|
+
def on_stage_start(self, name: str) -> None: pass
|
|
26
|
+
def on_stage_complete(self, name: str, result: StageResult) -> None: pass
|
|
27
|
+
def on_stage_fail(self, name: str, error: str) -> None: pass
|
|
28
|
+
def on_stage_skip(self, name: str, reason: str) -> None: pass
|
|
29
|
+
def on_stage_reuse(self, name: str) -> None: pass
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 timestamp string."""
    now = datetime.now(tz=timezone.utc)
    return now.isoformat()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def run_stages(
    ctx: RunContext,
    stages: tuple[Stage, ...],
    *,
    observer: StageObserver | None = None,
) -> None:
    """Execute pipeline stages in order, honoring skips, range selection,
    dependencies, and resume.

    Stage state is persisted via ctx.save_state() after every transition so
    an interrupted run can be resumed.

    Args:
        ctx: Run context holding stage state, selection, and resume flags.
        stages: Ordered stage instances to execute.
        observer: Optional lifecycle observer (e.g. a TUI). When provided,
            the Rich progress bar is suppressed and events are forwarded to
            the observer instead.

    Raises:
        RuntimeError: If a stage's required prerequisite stages are not
            completed.
        Exception: Fail-fast — the first stage exception is recorded in the
            run state and then re-raised.
    """
    obs = observer or _NullObserver()
    all_names = [stage.name for stage in stages]

    # Only use Rich Progress when no observer (CLI mode)
    if observer is None:
        from rich.progress import Progress
        progress_ctx = Progress()
    else:
        progress_ctx = contextlib.nullcontext()

    with progress_ctx as progress:
        # nullcontext() yields None, so `progress is not None` means CLI mode.
        if progress is not None:
            task_id = progress.add_task("PsyStack pipeline", total=len(stages))

        for stage in stages:
            # 1. Explicit skip
            if stage.name in ctx.skip:
                reason = "explicitly skipped"
                ctx.mark_skipped(stage.name, reason=reason)
                ctx.save_state()
                obs.on_stage_skip(stage.name, reason)
                if progress is not None:
                    progress.advance(task_id)
                continue

            # 2. Outside selected range — preserves existing completed state
            if not ctx.stage_selected(stage.name, all_names):
                reason = "outside selected range"
                ctx.mark_skipped(stage.name, reason=reason)
                ctx.save_state()
                obs.on_stage_skip(stage.name, reason)
                if progress is not None:
                    progress.advance(task_id)
                continue

            # 3. Dependency check
            if not ctx.prereqs_satisfied(stage.requires):
                raise RuntimeError(
                    f"Stage '{stage.name}' requires completed stages: {stage.requires}"
                )

            # 4. Resume: reuse if up-to-date — keeps completed status
            if ctx.resume and stage.is_up_to_date(ctx):
                ctx.mark_reused(stage.name)
                ctx.save_state()
                obs.on_stage_reuse(stage.name)
                if progress is not None:
                    progress.advance(task_id)
                continue

            # 5. Run — fail fast on exception
            try:
                ctx.mark_running(stage.name, started_at=utc_now())
                obs.on_stage_start(stage.name)
                result = stage.run(ctx)
                ctx.mark_completed(stage.name, result=result, finished_at=utc_now())
                obs.on_stage_complete(stage.name, result)
            except Exception as exc:
                # Persist the failure before re-raising so resume sees it.
                ctx.mark_failed(stage.name, error=str(exc), finished_at=utc_now())
                ctx.save_state()
                obs.on_stage_fail(stage.name, str(exc))
                raise
            else:
                ctx.save_state()
                if progress is not None:
                    progress.advance(task_id)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from psystack.pipeline.stages.attribute import AttributeStage
|
|
2
|
+
from psystack.pipeline.stages.compare import CompareStage
|
|
3
|
+
from psystack.pipeline.stages.events import EventStage
|
|
4
|
+
from psystack.pipeline.stages.isolate import IsolateStage
|
|
5
|
+
from psystack.pipeline.stages.report import ReportStage
|
|
6
|
+
|
|
7
|
+
# Canonical stage order: compare first (it produces the report the later
# stages read), then event detection, isolation swap tests, attribution,
# and finally report rendering.
DEFAULT_PIPELINE = (
    CompareStage(),
    EventStage(),
    IsolateStage(),
    AttributeStage(),
    ReportStage(),
)

__all__ = [
    "CompareStage",
    "EventStage",
    "IsolateStage",
    "AttributeStage",
    "ReportStage",
    "DEFAULT_PIPELINE",
]
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""AttributeStage — computes attribution from compare metrics + isolate swap results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
from pydantic import TypeAdapter
|
|
8
|
+
|
|
9
|
+
from psystack.models.comparison import ComparisonReport
|
|
10
|
+
from psystack.models.isolation import AttributionTable, IsolationResultBundle
|
|
11
|
+
from psystack.pipeline.context import RunContext
|
|
12
|
+
from psystack.pipeline.state import StageResult
|
|
13
|
+
|
|
14
|
+
# Validator for the attribute stage's on-disk output: a JSON list of AttributionTable.
_ATTR_TABLES = TypeAdapter(list[AttributionTable])
|
16
|
+
|
|
17
|
+
def _attribute_output_is_valid(ctx: RunContext) -> bool:
|
|
18
|
+
path = ctx.output_path("attribute")
|
|
19
|
+
if not path.exists():
|
|
20
|
+
return False
|
|
21
|
+
try:
|
|
22
|
+
_ATTR_TABLES.validate_json(path.read_text())
|
|
23
|
+
return True
|
|
24
|
+
except Exception:
|
|
25
|
+
return False
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class AttributeStage:
    """Pipeline stage that computes attribution tables for regressed metrics,
    combining the compare report with the isolate stage's swap-test results."""

    name = "attribute"
    requires = ("compare", "isolate")

    def is_up_to_date(self, ctx: RunContext) -> bool:
        """Reusable on resume iff the existing output parses as AttributionTables."""
        return _attribute_output_is_valid(ctx)

    def run(self, ctx: RunContext) -> StageResult:
        """Compute one AttributionTable per regressed metric and write them as JSON."""
        # Local import to avoid a heavier import at module load time —
        # TODO confirm this is the intent (possible circular-import guard).
        from psystack.pipeline.isolation.attribution import compute_attribution

        report = ComparisonReport.model_validate_json(
            (ctx.stage_output_dir / "compare_report.json").read_text()
        )
        regressions = [m for m in report.metrics if m.status == "regression"]

        # No regressions: still write a valid (empty) output so downstream
        # consumers and is_up_to_date() see a well-formed file.
        if not regressions:
            output = ctx.output_path("attribute")
            output.write_text("[]")
            return StageResult(
                primary_output=str(output),
                output_paths=[str(output)],
                summary="No regressions — no attributions computed",
                metadata={"num_attributions": 0, "skipped_reason": "no_regressions"},
            )

        # Read structured isolate bundle
        bundle = IsolationResultBundle.model_validate_json(
            ctx.output_path("isolate").read_text()
        )
        swap_results = bundle.swap_results

        tables: list[AttributionTable] = []
        warnings: list[str] = []

        for metric in regressions:
            table = compute_attribution(metric, swap_results)
            tables.append(table)
            # Surface non-attributable regressions as warnings rather than errors.
            if table.decision == "not_attributable":
                warnings.append(f"{metric.metric_id} not attributable")

        # Typed stage output: list[AttributionTable]
        output = ctx.output_path("attribute")
        output.write_text(json.dumps([t.model_dump() for t in tables], indent=2))

        return StageResult(
            primary_output=str(output),
            output_paths=[str(output)],
            summary=f"{len(tables)} attributions computed",
            warnings=warnings,
            metadata={"num_attributions": len(tables)},
        )
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Stage protocol for the pipeline runner."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING, Protocol
|
|
6
|
+
|
|
7
|
+
from psystack.pipeline.state import StageResult
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from psystack.pipeline.context import RunContext
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Stage(Protocol):
    """Structural interface every pipeline stage must satisfy."""

    # Unique stage identifier used for state tracking, skips, and range selection.
    name: str
    # Names of stages that must be completed before this one may run.
    requires: tuple[str, ...]

    def is_up_to_date(self, ctx: RunContext) -> bool:
        """Return True when existing output can be reused on resume."""
        ...

    def run(self, ctx: RunContext) -> StageResult:
        """Execute the stage and return its result record."""
        ...
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""CompareStage — runs baseline vs candidate comparison."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from psystack.pipeline.compare import compare_manifests
|
|
6
|
+
from psystack.pipeline.context import RunContext
|
|
7
|
+
from psystack.pipeline.state import StageResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class CompareStage:
    """Pipeline stage that runs the baseline-vs-candidate comparison and
    writes compare_report.json into the stage output directory."""

    name = "compare"
    requires = ()

    def is_up_to_date(self, ctx: RunContext) -> bool:
        """Reusable on resume iff the report file already exists."""
        return (ctx.stage_output_dir / "compare_report.json").exists()

    def run(self, ctx: RunContext) -> StageResult:
        """Compare the two manifests and persist the resulting report."""
        run_settings = ctx.settings.run
        report = compare_manifests(
            baseline_manifest=ctx.baseline_manifest,
            candidate_manifest=ctx.candidate_manifest,
            workspace=ctx.workspace,
            factory=ctx.factory,
            n_resamples=run_settings.bootstrap_resamples,
            alpha=run_settings.alpha,
        )

        report_path = ctx.stage_output_dir / "compare_report.json"
        report_path.write_text(report.model_dump_json(indent=2))

        num_metrics = len(report.metrics)
        num_regressions = sum(1 for m in report.metrics if m.status == "regression")
        return StageResult(
            primary_output=str(report_path),
            output_paths=[str(report_path)],
            summary=f"{num_metrics} metrics compared, {num_regressions} regressions",
            metadata={"num_metrics": num_metrics, "num_regressions": num_regressions},
        )
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""EventStage — detect divergence events from compare output episode data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
from psystack.pipeline.context import RunContext
|
|
8
|
+
from psystack.pipeline.events.config import EventDetectionConfig
|
|
9
|
+
from psystack.pipeline.events.detection import detect_events
|
|
10
|
+
from psystack.pipeline.state import StageResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class EventStage:
    """Pipeline stage that detects divergence events from the episode data
    produced by the compare stage, writing events.json."""

    name = "events"
    requires = ("compare",)

    def is_up_to_date(self, ctx: RunContext) -> bool:
        """Reusable on resume iff events.json already exists."""
        return (ctx.stage_output_dir / "events.json").exists()

    def run(self, ctx: RunContext) -> StageResult:
        """Detect events between baseline and candidate episodes and persist them."""
        # Episode data was written under the workspace by the compare stage.
        baseline = self._load_episodes(ctx, "baseline")
        candidate = self._load_episodes(ctx, "candidate")

        # Prefer event-detection settings from the run config when present.
        event_settings = getattr(ctx.settings, "event", None)
        if event_settings is not None:
            config = EventDetectionConfig.model_validate(event_settings.model_dump())
        else:
            config = EventDetectionConfig()

        events = detect_events(baseline, candidate, config)

        out_path = ctx.stage_output_dir / "events.json"
        out_path.write_text(json.dumps([e.model_dump() for e in events], indent=2))

        kinds = {e.type for e in events}
        return StageResult(
            primary_output=str(out_path),
            output_paths=[str(out_path)],
            summary=f"{len(events)} events detected: {', '.join(kinds) or 'none'}",
            metadata={"num_events": len(events), "event_types": list(kinds)},
        )

    def _load_episodes(self, ctx: RunContext, condition: str) -> list[dict]:
        """Load serialized episode data for one condition, or [] when absent."""
        path = ctx.workspace / condition / "episodes.json"
        if not path.exists():
            return []
        return json.loads(path.read_text())
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""IsolateStage — runs swap tests from a named isolation design."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from psystack.models import SwapTestSpec
|
|
6
|
+
from psystack.models.comparison import ComparisonReport
|
|
7
|
+
from psystack.models.isolation import IsolationResultBundle
|
|
8
|
+
from psystack.pipeline.context import RunContext
|
|
9
|
+
from psystack.pipeline.state import StageResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _isolate_output_is_valid(ctx: RunContext) -> bool:
|
|
13
|
+
path = ctx.output_path("isolate")
|
|
14
|
+
if not path.exists():
|
|
15
|
+
return False
|
|
16
|
+
try:
|
|
17
|
+
IsolationResultBundle.model_validate_json(path.read_text())
|
|
18
|
+
return True
|
|
19
|
+
except Exception:
|
|
20
|
+
return False
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class IsolateStage:
    """Pipeline stage that executes swap tests from a named isolation design
    when the compare stage found regressions, writing an IsolationResultBundle."""

    name = "isolate"
    requires = ("compare",)

    def is_up_to_date(self, ctx: RunContext) -> bool:
        """Reusable on resume iff the existing output parses as a bundle."""
        return _isolate_output_is_valid(ctx)

    def run(self, ctx: RunContext) -> StageResult:
        """Run the screening_v1 swap-test plan against any regressed metrics."""
        # Local imports — NOTE(review): likely a circular-import guard; confirm.
        from psystack.pipeline.isolation import build_isolation_plan
        from psystack.pipeline.isolation.executor import execute_swap_test

        report = ComparisonReport.model_validate_json(
            (ctx.stage_output_dir / "compare_report.json").read_text()
        )
        regressions = [m for m in report.metrics if m.status == "regression"]

        # The design/case plan is built unconditionally so even the empty
        # bundle below records which design was in effect.
        plan = build_isolation_plan("screening_v1")

        # No regressions: write a valid bundle with no swap results so
        # downstream stages and is_up_to_date() see a well-formed file.
        if not regressions:
            bundle = IsolationResultBundle(
                design=plan.design,
                cases=plan.cases,
                swap_results=[],
            )
            output = ctx.output_path("isolate")
            output.write_text(bundle.model_dump_json(indent=2))
            return StageResult(
                primary_output=str(output),
                output_paths=[str(output)],
                summary="No regressions — no swap tests run",
                metadata={"num_swap_tests": 0, "skipped_reason": "no_regressions"},
            )

        swap_dir = ctx.workspace / "swap_results"
        swap_dir.mkdir(parents=True, exist_ok=True)

        swap_results = []
        warnings: list[str] = []
        for case in plan.cases:
            spec = SwapTestSpec(test_id=case.test_id, factors=case.factors)
            result = execute_swap_test(
                spec,
                ctx.baseline_manifest,
                ctx.candidate_manifest,
                ctx.factory,
            )
            swap_results.append(result)
            # Each swap test also gets its own JSON file for inspection.
            (swap_dir / f"{case.test_id}.json").write_text(result.model_dump_json(indent=2))
            # Failed swap tests are recorded as warnings, not stage failures.
            if result.status == "failed":
                warnings.append(f"Swap test {case.test_id} failed: {result.error}")

        bundle = IsolationResultBundle(
            design=plan.design,
            cases=plan.cases,
            swap_results=swap_results,
        )
        output = ctx.output_path("isolate")
        output.write_text(bundle.model_dump_json(indent=2))

        return StageResult(
            primary_output=str(output),
            output_paths=[str(output)] + [str(swap_dir / f"{c.test_id}.json") for c in plan.cases],
            summary=f"{len(swap_results)} swap tests completed",
            warnings=warnings,
            metadata={"num_swap_tests": len(swap_results)},
        )
|