alignscope 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
alignscope/detector.py ADDED
@@ -0,0 +1,242 @@
1
+ from __future__ import annotations
2
+
3
+ """
4
+ AlignScope — Defection & Anomaly Detector
5
+
6
+ Monitors alignment metrics over time and flags moments when
7
+ an agent breaks from its coalition. These are the scientifically
8
+ interesting events in alignment research — the exact tick where
9
+ cooperation fails.
10
+
11
+ Detection methods:
12
+ 1. Direct defection events from the data source
13
+ 2. Metric-based anomalies: sudden drops in reciprocity or role stability
14
+ 3. Coalition fragmentation: when coalition sizes shrink unexpectedly
15
+
16
+ Environment-agnostic: works with any data source that provides
17
+ the standard AlignScope tick/metrics format.
18
+ """
19
+
20
+
21
class DefectionDetector:
    """Accumulates metric history and detects alignment anomalies.

    Each call to :meth:`analyze` inspects one tick and returns the events
    found for it. The metrics passed in are appended to ``metric_history``
    and the events to ``all_events``, so later ticks can be compared with
    earlier ones and :meth:`get_summary` can aggregate over the whole run.

    Every event is a plain ``dict`` with at least the keys ``tick``,
    ``type``, ``agent_id``, ``team``, ``severity`` and ``description``.
    """

    # Drop magnitude at which each metric-based event saturates at
    # severity 1.0 (severity = min(1.0, drop / scale)).
    RECIPROCITY_SEVERITY_SCALE = 0.5
    STABILITY_SEVERITY_SCALE = 0.4
    COALITION_SEVERITY_SCALE = 2.0

    def __init__(
        self,
        reciprocity_drop_threshold: float = 0.3,
        stability_drop_threshold: float = 0.2,
        lookback_window: int = 10,
    ):
        """
        Args:
            reciprocity_drop_threshold: A pair's reciprocity must fall more
                than this far below its windowed average to be reported.
            stability_drop_threshold: Same, for an agent's role stability.
            lookback_window: Number of most recent ticks used as the
                baseline. Metric-based checks stay silent until at least
                this many ticks have been recorded.
        """
        self.reciprocity_drop_threshold = reciprocity_drop_threshold
        self.stability_drop_threshold = stability_drop_threshold
        self.lookback_window = lookback_window

        self.metric_history: list[dict] = []
        self.all_events: list[dict] = []

    def analyze(self, tick_data: dict, metrics: dict) -> list[dict]:
        """
        Analyze one tick for defection and anomaly events.

        Args:
            tick_data: Raw game state from any compatible data source.
                Must contain ``"tick"``; may contain ``"defection_events"``.
            metrics: Computed alignment metrics for this tick.

        Returns:
            List of detected events (may be empty).

        Raises:
            KeyError: If ``tick_data`` has no ``"tick"`` entry, or a
                forwarded defection has no ``"agent_id"``.
        """
        tick = tick_data["tick"]

        # 1. Forward direct defection events from the data source.
        #    ``or []`` also covers an explicit ``None`` value.
        events = [
            self._build_defection_event(tick, defection)
            for defection in tick_data.get("defection_events") or []
        ]

        # 2 & 3. Metric-based anomalies need a full baseline window first.
        if len(self.metric_history) >= self.lookback_window:
            events.extend(self._detect_reciprocity_anomalies(tick, metrics))
            events.extend(self._detect_stability_anomalies(tick, metrics))

        # 4. Coalition fragmentation only needs the previous tick.
        events.extend(self._detect_coalition_changes(tick, metrics))

        # Record this tick only after the checks, so the current metrics
        # are never part of their own baseline.
        self.metric_history.append(metrics)
        self.all_events.extend(events)

        return events

    @staticmethod
    def _build_defection_event(tick: int, defection: dict) -> dict:
        """Wrap one source-reported defection in the standard event shape."""
        agent_id = defection["agent_id"]
        return {
            "tick": tick,
            "type": "defection",
            "agent_id": agent_id,
            # Optional fields: read leniently, matching the description
            # below, so a sparse defection record does not raise KeyError.
            "team": defection.get("team"),
            "severity": defection.get("severity", 0.0),
            "description": (
                f"Agent {agent_id} (team {defection.get('team', '?')}, "
                f"{defection.get('previous_role', 'unknown')}) defected. "
                f"Reason: {defection.get('reason', 'no reason provided')}"
            ),
            "details": defection,
        }

    def _recent_history(self) -> list[dict]:
        """Return the last ``lookback_window`` recorded ticks.

        A non-positive window yields an empty baseline. A plain
        ``history[-0:]`` slice would instead return the whole history.
        """
        if self.lookback_window <= 0:
            return []
        return self.metric_history[-self.lookback_window:]

    def _detect_reciprocity_anomalies(
        self, tick: int, current: dict
    ) -> list[dict]:
        """Detect sudden drops in pair reciprocity.

        A pair is reported when its current reciprocity is more than
        ``reciprocity_drop_threshold`` below its average over the lookback
        window. Pairs with no history in the window are skipped.
        """
        events = []
        current_pairs = {
            (p["agent_a"], p["agent_b"]): p["reciprocity"]
            for p in current.get("pair_metrics", [])
        }
        if not current_pairs:
            return events

        # Index the window once instead of rescanning it for every pair.
        baseline: dict[tuple, list[float]] = {}
        for past in self._recent_history():
            for p in past.get("pair_metrics", []):
                key = (p["agent_a"], p["agent_b"])
                baseline.setdefault(key, []).append(p["reciprocity"])

        for pair, current_val in current_pairs.items():
            historical = baseline.get(pair)
            if not historical:
                continue

            avg_historical = sum(historical) / len(historical)
            drop = avg_historical - current_val
            if drop <= self.reciprocity_drop_threshold:
                continue

            events.append({
                "tick": tick,
                "type": "reciprocity_drop",
                "agent_id": pair[0],
                "partner_id": pair[1],
                "severity": round(
                    min(1.0, drop / self.RECIPROCITY_SEVERITY_SCALE), 3
                ),
                "description": (
                    f"Reciprocity between agents {pair[0]} and {pair[1]} "
                    f"dropped from {avg_historical:.2f} to {current_val:.2f}"
                ),
                "team": None,
            })

        return events

    def _detect_stability_anomalies(
        self, tick: int, current: dict
    ) -> list[dict]:
        """Detect sudden role stability drops for individual agents.

        An agent is reported when its current ``role_stability`` is more
        than ``stability_drop_threshold`` below its average over the
        lookback window. Agents absent from the window are skipped.
        """
        events = []
        window = self._recent_history()

        for aid, agent in current.get("agent_metrics", {}).items():
            current_stability = agent["role_stability"]

            historical = [
                past["agent_metrics"][aid]["role_stability"]
                for past in window
                if aid in past.get("agent_metrics", {})
            ]
            if not historical:
                continue

            avg_historical = sum(historical) / len(historical)
            drop = avg_historical - current_stability
            if drop <= self.stability_drop_threshold:
                continue

            events.append({
                "tick": tick,
                "type": "stability_drop",
                "agent_id": aid,
                "severity": round(
                    min(1.0, drop / self.STABILITY_SEVERITY_SCALE), 3
                ),
                "description": (
                    f"Agent {aid} role stability dropped from "
                    f"{avg_historical:.2f} to {current_stability:.2f} — "
                    f"possible role confusion or strategic shift"
                ),
                "team": agent.get("team"),
            })

        return events

    def _detect_coalition_changes(
        self, tick: int, current: dict
    ) -> list[dict]:
        """
        Detect coalition fragmentation.

        Compares each team's ``active_coalitions`` with the value recorded
        on the previous tick and emits one event per team whose count
        decreased. Teams missing from the previous tick, and counts that
        are not integers, are ignored. A missing ``active_coalitions`` key
        is read as 0 on either side.
        """
        events = []

        if not self.metric_history:
            return events

        prev_teams = self.metric_history[-1].get("team_metrics", {})
        curr_teams = current.get("team_metrics", {})

        for tid, team in curr_teams.items():
            if tid not in prev_teams:
                continue

            prev_coalitions = prev_teams[tid].get("active_coalitions", 0)
            curr_coalitions = team.get("active_coalitions", 0)

            comparable = (
                isinstance(prev_coalitions, int)
                and isinstance(curr_coalitions, int)
            )
            if not comparable or curr_coalitions >= prev_coalitions:
                continue

            lost = prev_coalitions - curr_coalitions
            events.append({
                "tick": tick,
                "type": "coalition_fragmentation",
                "agent_id": None,
                "team": tid,
                "severity": round(
                    min(1.0, lost / self.COALITION_SEVERITY_SCALE), 3
                ),
                "description": (
                    f"Team {tid} lost {lost} coalition(s): "
                    f"{prev_coalitions} → {curr_coalitions}"
                ),
            })

        return events

    def get_summary(self) -> dict:
        """Return aggregate defection statistics.

        Returns:
            Counts per event type, the total event count, and the mean
            severity rounded to three decimals (0 when nothing was
            recorded). A missing or ``None`` severity counts as 0.
        """
        by_type: dict = {}
        for e in self.all_events:
            by_type[e["type"]] = by_type.get(e["type"], 0) + 1

        total = len(self.all_events)
        if total:
            severity_sum = sum(e.get("severity") or 0 for e in self.all_events)
            avg_severity = round(severity_sum / total, 3)
        else:
            avg_severity = 0

        return {
            "total_events": total,
            "defections": by_type.get("defection", 0),
            "reciprocity_drops": by_type.get("reciprocity_drop", 0),
            "stability_drops": by_type.get("stability_drop", 0),
            "coalition_fragmentations": by_type.get("coalition_fragmentation", 0),
            "avg_severity": avg_severity,
        }
@@ -0,0 +1,28 @@
1
+ from __future__ import annotations
2
+
3
+ """
4
+ AlignScope — Integration Bridges
5
+
6
+ Auto-detects installed ML observability tools (W&B, MLflow) and
7
+ forwards alignment metrics to them. This ensures zero vendor lock-in —
8
+ AlignScope adds to your existing stack, never replaces it.
9
+ """
10
+
11
+
12
def detect_integrations() -> dict[str, bool]:
    """Check which ML tools are available.

    Probes for each supported tool with ``importlib.util.find_spec``
    rather than importing it. Detection therefore does not run the
    package's import-time code, and cannot fail with whatever a broken
    install raises while importing.

    Returns:
        Mapping with the keys ``"wandb"`` and ``"mlflow"``; each value is
        ``True`` when that package can be located.
    """
    import sys
    from importlib.util import find_spec

    def _is_available(module_name: str) -> bool:
        try:
            return find_spec(module_name) is not None
        except (ImportError, ValueError):
            # find_spec raises ValueError when the module is already in
            # sys.modules but carries no usable __spec__; treat an
            # already-loaded module as available.
            return sys.modules.get(module_name) is not None

    return {name: _is_available(name) for name in ("wandb", "mlflow")}
@@ -0,0 +1,70 @@
1
+ """
2
+ AlignScope — MLflow Bridge
3
+
4
+ If the user has MLflow installed and an active run, AlignScope
5
+ automatically logs alignment metrics as MLflow metrics.
6
+
7
+ Zero vendor lock-in: AlignScope enriches MLflow — never replaces it.
8
+ """
9
+
10
+
11
class MlflowBridge:
    """Forwards AlignScope metrics to an active MLflow run.

    The bridge is strictly best-effort. It does nothing when MLflow is not
    importable or no run is active, and it never raises from ``log`` or
    ``finish`` because of an MLflow failure (failures are reported at
    DEBUG level through ``logging``).
    """

    def __init__(self):
        self._mlflow = None
        self._active = False

        try:
            import mlflow
            self._mlflow = mlflow

            # Check for active run
            if mlflow.active_run() is not None:
                self._active = True
                print("[AlignScope] ✓ Detected active MLflow run — forwarding metrics")
            else:
                print("[AlignScope] MLflow installed but no active run. "
                      "Call mlflow.start_run() first to enable forwarding.")
        except ImportError:
            pass

    def _run_is_active(self) -> bool:
        """Return whether MLflow is loaded and a run is active right now.

        Re-evaluated on every call (and mirrored into ``_active``) so a
        run started after this bridge was constructed is picked up, and a
        run that has ended is no longer written to.
        """
        if self._mlflow is None:
            return False
        self._active = self._mlflow.active_run() is not None
        return self._active

    @staticmethod
    def _numeric_only(values: dict) -> dict:
        """Keep only entries convertible to ``float``.

        ``log_metrics`` is called once per tick, so a single bad value
        would otherwise make the whole tick's batch fail.
        """
        numeric = {}
        for key, value in values.items():
            try:
                numeric[key] = float(value)
            except (TypeError, ValueError):
                continue
        return numeric

    @staticmethod
    def _report_failure(action: str) -> None:
        """Record a swallowed MLflow error at DEBUG level."""
        import logging

        logging.getLogger(__name__).debug(
            "[AlignScope] MLflow bridge failed to %s", action, exc_info=True
        )

    def log(self, step: int, metrics: dict, events: list):
        """Forward alignment metrics to MLflow.

        Args:
            step: Step index passed to ``mlflow.log_metrics``.
            metrics: Tick metrics; ``overall_alignment_score`` and the
                per-team entries under ``team_metrics`` are forwarded.
            events: Events detected this tick; a count per event ``type``
                is logged under ``alignscope.events.<type>``.
        """
        if not self._run_is_active():
            return

        try:
            log_data = {
                "alignscope.overall_alignment": metrics.get("overall_alignment_score", 0),
            }

            for tid, tm in metrics.get("team_metrics", {}).items():
                prefix = f"alignscope.team_{tid}"
                log_data[f"{prefix}.role_stability"] = tm.get("avg_role_stability", 0)
                log_data[f"{prefix}.coalitions"] = tm.get("active_coalitions", 0)
                log_data[f"{prefix}.defectors"] = tm.get("defector_count", 0)

            # Per-type event counts, mirroring the W&B bridge.
            for event in events or []:
                key = f"alignscope.events.{event.get('type', 'unknown')}"
                log_data[key] = log_data.get(key, 0) + 1

            self._mlflow.log_metrics(self._numeric_only(log_data), step=step)

        except Exception:
            # Observability must never break the run being observed.
            self._report_failure("log metrics")

    def finish(self, summary: dict):
        """Log final summary params to MLflow.

        Each summary entry is logged as the param ``alignscope_<key>``
        with its value stringified. Nothing is logged when no run is
        active: the fluent ``log_params`` call would otherwise implicitly
        start a new, stray run.
        """
        if not self._run_is_active():
            return

        try:
            params = {
                f"alignscope_{k}": str(v)
                for k, v in summary.items()
            }
            self._mlflow.log_params(params)
        except Exception:
            self._report_failure("log summary params")
@@ -0,0 +1,81 @@
1
+ """
2
+ AlignScope — Weights & Biases Bridge
3
+
4
+ If the user has W&B installed and an active run, AlignScope
5
+ automatically forwards alignment metrics as W&B custom metrics.
6
+ This means they see AlignScope data right alongside their reward
7
+ curves in the W&B dashboard.
8
+
9
+ Zero vendor lock-in: AlignScope never replaces W&B — it adds to it.
10
+ """
11
+
12
+
13
class WandbBridge:
    """Forwards AlignScope metrics to an active W&B run.

    The bridge is strictly best-effort. It does nothing when W&B is not
    importable or no run is active, and it never raises from ``log`` or
    ``finish`` because of a W&B failure (failures are reported at DEBUG
    level through ``logging``).
    """

    def __init__(self):
        self._wandb = None
        self._active = False

        try:
            import wandb
            self._wandb = wandb

            # Check if there's an active run
            if wandb.run is not None:
                self._active = True
                print("[AlignScope] ✓ Detected active W&B run — forwarding metrics")
            else:
                print("[AlignScope] W&B installed but no active run. "
                      "Call wandb.init() first to enable forwarding.")
        except ImportError:
            pass

    def _run_is_active(self) -> bool:
        """Return whether W&B is loaded and a run is active right now.

        Re-evaluated on every call (and mirrored into ``_active``) so a
        run initialized after this bridge was constructed is picked up.
        """
        if self._wandb is None:
            return False
        self._active = self._wandb.run is not None
        return self._active

    @staticmethod
    def _report_failure(action: str) -> None:
        """Record a swallowed W&B error at DEBUG level."""
        import logging

        logging.getLogger(__name__).debug(
            "[AlignScope] W&B bridge failed to %s", action, exc_info=True
        )

    def log(self, step: int, metrics: dict, events: list):
        """Forward alignment metrics to W&B.

        Args:
            step: Step index passed to ``wandb.log``.
            metrics: Tick metrics; ``overall_alignment_score`` and the
                per-team entries under ``team_metrics`` are forwarded.
            events: Events detected this tick; a count per event ``type``
                is logged under ``alignscope/events/<type>``.
        """
        if not self._run_is_active():
            return

        from collections import Counter

        try:
            # Overall alignment score
            log_data = {
                "alignscope/overall_alignment": metrics.get("overall_alignment_score", 0),
            }

            # Per-team metrics
            for tid, tm in metrics.get("team_metrics", {}).items():
                prefix = f"alignscope/team_{tid}"
                log_data[f"{prefix}/role_stability"] = tm.get("avg_role_stability", 0)
                log_data[f"{prefix}/coalitions"] = tm.get("active_coalitions", 0)
                log_data[f"{prefix}/defectors"] = tm.get("defector_count", 0)

            # Per-type event counts
            type_counts = Counter(e.get("type", "unknown") for e in events or [])
            for etype, count in type_counts.items():
                log_data[f"alignscope/events/{etype}"] = count

            self._wandb.log(log_data, step=step)

        except Exception:
            # Observability must never break the run being observed.
            self._report_failure("log metrics")

    def finish(self, summary: dict):
        """Log final summary to W&B.

        Each summary entry is written to the active run's ``summary``
        under ``alignscope/<key>``. Nothing happens without an active run.
        """
        if not self._run_is_active():
            return

        try:
            for key, value in summary.items():
                self._wandb.run.summary[f"alignscope/{key}"] = value
        except Exception:
            self._report_failure("write run summary")