nullrun 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nullrun/actions.py ADDED
@@ -0,0 +1,455 @@
1
+ """
2
+ Client-side action handling for NullRun SDK.
3
+
4
+ When the circuit breaker trips (on backend or locally), these handlers
5
+ actually execute the protective actions.
6
+ """
7
+
8
+ import logging
9
+ import threading
10
+ import time
11
+ from collections.abc import Callable
12
+ from dataclasses import dataclass, field
13
+ from datetime import datetime, timezone
14
+ from enum import Enum
15
+ from typing import Any
16
+
17
+ try:
18
+ import httpx
19
+ _HAS_HTTPX = True
20
+ except ImportError:
21
+ _HAS_HTTPX = False
22
+
23
+ from nullrun.breaker.exceptions import (
24
+ NullRunBlockedException,
25
+ WorkflowKilledInterrupt,
26
+ WorkflowPausedException,
27
+ )
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
@dataclass
class ActionEvent:
    """One recorded action, kept for logging/replay.

    Attributes:
        timestamp: When the action was recorded, as a string.
        action_type: String value of the action that was dispatched.
        workflow_id: Identifier of the workflow the action targeted.
        reason: Human-readable explanation for the action.
        details: Extra keyword details supplied with the action; each
            instance gets its own empty dict when none are given.
    """

    timestamp: str
    action_type: str
    workflow_id: str
    reason: str
    # default_factory avoids sharing one mutable dict between instances.
    details: dict[str, Any] = field(default_factory=dict)
40
+
41
+
42
class ActionType(str, Enum):
    """Closed set of protective actions.

    Members are also ``str`` instances, so they compare equal to their
    lowercase wire values (``ActionType.KILL == "kill"``).
    """

    KILL = "kill"
    PAUSE = "pause"
    ALERT = "alert"
    SNAPSHOT = "snapshot"
    BLOCK = "block"
    WEBHOOK = "webhook"
50
+
51
+
52
+ class WebhookConfig:
53
+ """Configuration for webhook notifications."""
54
+ def __init__(
55
+ self,
56
+ url: str,
57
+ headers: dict[str, str] | None = None,
58
+ timeout: float = 5.0,
59
+ retries: int = 3,
60
+ ):
61
+ self.url = url
62
+ self.headers = headers or {}
63
+ self.timeout = timeout
64
+ self.retries = retries
65
+
66
+
67
class ActionHandler:
    """
    Handler for NullRun circuit breaker actions.

    Dispatches an action name to a registered callable, records every
    dispatch in a bounded in-memory history and, when webhooks are
    registered, queues a notification that a background daemon thread
    delivers over HTTP.

    Default handlers:
    - KILL: raises WorkflowKilledInterrupt
    - PAUSE: records the pause time and raises WorkflowPausedException
    - ALERT: logs a warning (can be customized)
    - SNAPSHOT: logs the snapshot request
    - BLOCK: raises NullRunBlockedException
    - WEBHOOK: logs only; delivery happens through the webhook queue

    ``handle`` logs and swallows whatever a handler raises, so the
    exceptions above do not propagate to callers of ``handle``.

    Usage:
        handler = ActionHandler()

        # Register custom alert handler
        def my_alert(workflow_id, reason, **details):
            send_to_slack(reason)

        handler.register_handler(ActionType.ALERT, my_alert)

        # Register webhook
        handler.register_webhook(WebhookConfig(
            url="https://hooks.slack.com/...",
            headers={"Content-Type": "application/json"}
        ))

        # Execute action
        handler.handle("kill", workflow_id="wf-123", reason="Budget exceeded")
    """

    def __init__(self) -> None:
        self._handlers: dict[ActionType, Callable[..., Any]] = {
            ActionType.KILL: self._default_kill,
            ActionType.PAUSE: self._default_pause,
            ActionType.ALERT: self._default_alert,
            ActionType.SNAPSHOT: self._default_snapshot,
            ActionType.BLOCK: self._default_block,
            ActionType.WEBHOOK: self._default_webhook,
        }
        # workflow_id -> time.time() at which the pause was recorded
        self._paused_workflows: dict[str, float] = {}
        self._webhooks: list[WebhookConfig] = []
        self._action_history: list[ActionEvent] = []
        self._max_history = 1000
        # Guards history, paused-workflow map and the webhook queue.
        self._lock = threading.Lock()
        self._webhook_thread: threading.Thread | None = None
        self._webhook_queue: list[dict[str, Any]] = []
        self._webhook_max_size = 1000  # Limit queue size to prevent memory leak
        self._webhook_running = False

    def register_handler(self, action: ActionType, handler: Callable[..., Any]) -> None:
        """Register a custom handler for an action type.

        The handler is called as ``handler(workflow_id, reason, **details)``
        and replaces any handler previously registered for ``action``.
        """
        self._handlers[action] = handler

    def register_webhook(self, config: WebhookConfig) -> None:
        """
        Register a webhook for action notifications.

        Args:
            config: WebhookConfig with URL and options
        """
        self._webhooks.append(config)
        logger.info(f"Registered webhook: {config.url}")

    def remove_webhook(self, url: str) -> None:
        """Remove every registered webhook whose URL equals ``url``."""
        self._webhooks = [w for w in self._webhooks if w.url != url]

    def get_action_history(self, limit: int = 100) -> list[ActionEvent]:
        """Return up to ``limit`` of the most recent action events.

        Args:
            limit: Maximum number of events to return. Zero or a negative
                value yields an empty list.

        Returns:
            A new list, oldest first, holding at most ``limit`` events.
        """
        # ``seq[-0:]`` is the whole sequence, so a non-positive limit must be
        # handled explicitly rather than fed to the slice.
        if limit <= 0:
            return []
        with self._lock:
            return self._action_history[-limit:]

    def clear_history(self) -> None:
        """Clear action history."""
        with self._lock:
            self._action_history.clear()

    def _record_action(
        self,
        action_type: ActionType,
        workflow_id: str,
        reason: str,
        details: dict[str, Any],
    ) -> None:
        """Append an event to the history, keeping at most ``_max_history``."""
        with self._lock:
            event = ActionEvent(
                timestamp=datetime.now(timezone.utc).isoformat(),
                action_type=action_type.value,
                workflow_id=workflow_id,
                reason=reason,
                details=details,
            )
            self._action_history.append(event)
            # Trim history
            if len(self._action_history) > self._max_history:
                self._action_history = self._action_history[-self._max_history:]

    def handle(
        self,
        action: str,
        workflow_id: str,
        reason: str | None = None,
        **details: Any,
    ) -> None:
        """
        Handle a circuit breaker action.

        The action is recorded in history, queued for webhook delivery when
        any webhook is registered, and then passed to the registered handler.
        Anything the handler raises (including the kill/pause/block
        exceptions raised by the default handlers) is logged and swallowed.

        An unrecognized or non-string ``action`` fails open: it is logged at
        ERROR, recorded in history, and no handler is invoked.

        Args:
            action: Action type string ("kill", "pause", "alert", etc.)
            workflow_id: ID of the workflow
            reason: Human-readable reason for the action; "Unknown" is used
                when omitted or empty
            **details: Additional details about the action
        """
        try:
            action_type = ActionType(action.lower())
        except (ValueError, AttributeError):
            # Sprint 1.5 (B14): pre-fix this degraded silently to
            # ``ActionType.BLOCK`` and triggered ``_default_block``, which
            # made the SDK a DoS amplifier: a single malformed ``action``
            # from the server (or a MITM, or a schema regression) would
            # block every subsequent tool call in the workflow.
            #
            # Post-fix: log at ERROR, record the event for forensic
            # visibility, and DO NOT invoke any handler. The workflow keeps
            # running under fail-open.
            #
            # AttributeError covers a ``None``/non-string ``action`` (no
            # ``.lower()``), which must take the same fail-open path instead
            # of crashing the caller.
            logger.error(
                f"Unknown action type received from control plane: {action!r} "
                f"for workflow {workflow_id!r} (reason={reason!r}). "
                "This is a server/SDK version mismatch or a control plane "
                "schema regression. Failing open — the workflow will continue "
                "running. Investigate ASAP."
            )
            self._record_action(
                ActionType.BLOCK,  # record what would have happened pre-fix
                workflow_id,
                f"unknown_action_type:{action}",
                details,
            )
            return

        handler = self._handlers.get(action_type, self._default_block)
        effective_reason = reason or "Unknown"

        # Record action to history
        self._record_action(action_type, workflow_id, effective_reason, details)

        # Trigger webhooks asynchronously
        if self._webhooks:
            self._queue_webhook(action_type, workflow_id, effective_reason, details)

        try:
            handler(workflow_id, effective_reason, **details)  # type: ignore[no-untyped-call]
        except BaseException as e:
            # Don't let handler exceptions propagate. `BaseException` (not
            # just `Exception`) is caught because `WorkflowKilledInterrupt`
            # is intentionally a `BaseException` subclass; the kill is
            # already recorded in history above and is swallowed here, NOT
            # re-raised into the caller's frame.
            # NOTE(review): this also swallows KeyboardInterrupt/SystemExit
            # raised inside a handler — confirm that is intended.
            logger.error(f"Action handler error: {e}")

    def _default_kill(
        self,
        workflow_id: str,
        reason: str,
        **details: Any,
    ) -> None:
        """Default kill handler - raises WorkflowKilledInterrupt."""
        logger.warning(f"KILL action for workflow {workflow_id}: {reason}")
        raise WorkflowKilledInterrupt(workflow_id=workflow_id, reason=reason)

    def _default_pause(
        self,
        workflow_id: str,
        reason: str,
        duration: float | None = None,
        **details: Any,
    ) -> None:
        """Default pause handler - raises WorkflowPausedException.

        Records the current time for ``workflow_id`` so ``is_paused`` can
        report it, then raises with ``resume_after`` set to ``duration``.
        """
        logger.warning(f"PAUSE action for workflow {workflow_id}: {reason}")

        # Track paused workflow
        with self._lock:
            self._paused_workflows[workflow_id] = time.time()

        raise WorkflowPausedException(
            workflow_id=workflow_id,
            reason=reason,
            resume_after=duration,
        )

    def _default_alert(
        self,
        workflow_id: str,
        reason: str,
        **details: Any,
    ) -> None:
        """Default alert handler - logs the alert."""
        logger.warning(f"ALERT for workflow {workflow_id}: {reason}")

    def _default_snapshot(
        self,
        workflow_id: str,
        reason: str,
        **details: Any,
    ) -> None:
        """Default snapshot handler - logs snapshot request."""
        logger.info(f"SNAPSHOT requested for workflow {workflow_id}: {reason}")

    def _default_block(
        self,
        workflow_id: str,
        reason: str,
        **details: Any,
    ) -> None:
        """Default block handler - raises NullRunBlockedException."""
        raise NullRunBlockedException(
            workflow_id=workflow_id,
            reason=reason,
            action="block",
            **details,
        )

    def _default_webhook(
        self,
        workflow_id: str,
        reason: str,
        **details: Any,
    ) -> None:
        """Default webhook handler - logs only; delivery is queued elsewhere."""
        # Webhooks are handled asynchronously via _queue_webhook
        logger.debug(f"WEBHOOK queued for workflow {workflow_id}: {reason}")

    def _queue_webhook(
        self,
        action_type: ActionType,
        workflow_id: str,
        reason: str,
        details: dict[str, Any],
    ) -> None:
        """Queue a payload for async delivery and ensure the worker runs.

        When the queue is already at ``_webhook_max_size`` the oldest payload
        is dropped (with a warning) to make room.
        """
        payload = {
            "action": action_type.value,
            "workflow_id": workflow_id,
            "reason": reason,
            "details": details,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
        with self._lock:
            # Enforce max queue size to prevent memory leak
            if len(self._webhook_queue) >= self._webhook_max_size:
                removed = self._webhook_queue.pop(0)
                logger.warning(
                    f"Webhook queue overflow, dropping oldest: "
                    f"{removed.get('workflow_id')}"
                )
            self._webhook_queue.append(payload)

            # Start webhook thread if not running. The check-and-set happens
            # under the lock so concurrent callers cannot each spawn a worker.
            if not self._webhook_running:
                self._webhook_running = True
                self._webhook_thread = threading.Thread(
                    target=self._webhook_delivery,
                    daemon=True,
                    name="nullrun-webhook",
                )
                self._webhook_thread.start()

    def _webhook_delivery(self) -> None:
        """Background thread body: drain the queue until told to stop."""
        while self._webhook_running:
            try:
                # Process queue
                payload = None
                with self._lock:
                    if self._webhook_queue:
                        payload = self._webhook_queue.pop(0)

                if payload is None:
                    # Nothing queued; poll again shortly.
                    time.sleep(0.5)
                    continue

                # Deliver to all registered webhooks. Iterate a snapshot so
                # a concurrent register_webhook() cannot change the list
                # mid-iteration.
                for webhook in list(self._webhooks):
                    self._deliver_webhook(webhook, payload)

            except Exception as e:
                logger.error(f"Webhook delivery error: {e}")

    def _deliver_webhook(self, webhook: WebhookConfig, payload: dict[str, Any]) -> None:
        """POST ``payload`` to one webhook, retrying with linear backoff.

        Does nothing (beyond a warning) when httpx is not installed. At least
        one attempt is always made, even when ``webhook.retries`` is zero or
        negative. Failures are logged, never raised.
        """
        if not _HAS_HTTPX:
            logger.warning("httpx not installed, cannot send webhook")
            return

        # range(0) would silently skip delivery altogether.
        attempts = max(1, webhook.retries)
        for attempt in range(attempts):
            try:
                response = httpx.post(
                    webhook.url,
                    json=payload,
                    headers=webhook.headers,
                    timeout=webhook.timeout,
                )
                response.raise_for_status()
                logger.debug(f"Webhook delivered to {webhook.url}")
                return
            except Exception as e:
                logger.warning(f"Webhook attempt {attempt + 1} failed: {e}")
                if attempt < attempts - 1:
                    time.sleep(0.5 * (attempt + 1))

        logger.error(
            f"Webhook delivery to {webhook.url} failed after {attempts} attempt(s)"
        )

    def stop_webhooks(self) -> None:
        """Signal the delivery thread to stop and wait up to 2 seconds."""
        self._webhook_running = False
        if self._webhook_thread:
            self._webhook_thread.join(timeout=2.0)

    def is_paused(self, workflow_id: str, cooldown_seconds: float = 60.0) -> bool:
        """
        Check if a workflow is currently paused.

        Args:
            workflow_id: ID of the workflow
            cooldown_seconds: Consider unpaused after this time

        Returns:
            True if workflow is paused and within cooldown period
        """
        with self._lock:
            if workflow_id not in self._paused_workflows:
                return False

            paused_at = self._paused_workflows[workflow_id]
            elapsed = time.time() - paused_at

            if elapsed > cooldown_seconds:
                # Cooldown expired, remove from paused list
                del self._paused_workflows[workflow_id]
                return False

            return True
421
+
422
+
423
# Process-wide ActionHandler, created lazily by get_action_handler().
_action_handler: ActionHandler | None = None
_handler_lock = threading.Lock()


def get_action_handler() -> ActionHandler:
    """Return the global ActionHandler, creating it on first use."""
    global _action_handler
    # Fast path: already built, no lock needed.
    if _action_handler is not None:
        return _action_handler
    with _handler_lock:
        # Re-check under the lock so only one caller constructs the instance.
        if _action_handler is None:
            _action_handler = ActionHandler()
        return _action_handler
436
+
437
+
438
def handle_action(
    action: str,
    workflow_id: str,
    reason: str | None = None,
    **details: Any,
) -> None:
    """
    Dispatch a circuit breaker action through the global handler.

    Arguments are forwarded unchanged to ``ActionHandler.handle``.

    Usage:
        handle_action("kill", "wf-123", "Budget exceeded")
    """
    global_handler = get_action_handler()
    global_handler.handle(action, workflow_id, reason, **details)
451
+
452
+
453
def register_action_handler(action: ActionType, handler: Callable[..., Any]) -> None:
    """Install ``handler`` for ``action`` on the global action handler."""
    global_handler = get_action_handler()
    global_handler.register_handler(action, handler)
@@ -0,0 +1,27 @@
1
+ """
2
+ NullRun Breaker — circuit breaker + policy exceptions.
3
+
4
+ Historical product surface. The user-facing API now lives on
5
+ `nullrun.protect` (see `nullrun.decorators`) and `nullrun.toolbox.*`
6
+ for framework integrations. The classes and exceptions exposed here
7
+ remain so that `runtime.py`, `transport.py`, `actions.py`, and the
8
+ test suite can share a single error vocabulary.
9
+
10
+ Sprint 2.2: zombie exception classes (CostLimitExceeded,
11
+ ApprovalRequired, BreakerTimeout) were removed because they had
12
+ zero in-tree callers. See the NOTE block in
13
+ ``nullrun.breaker.exceptions`` for the full list.
14
+ """
15
+
16
+ from nullrun.breaker.circuit_breaker import CBState, CircuitBreaker
17
+ from nullrun.breaker.exceptions import (
18
+ BreakerError,
19
+ BreakerTransportError,
20
+ )
21
+
22
# Names re-exported by this package: the two breaker exception types, then
# the circuit breaker class and its state enum.
__all__ = [
    "BreakerError",
    "BreakerTransportError",
    "CircuitBreaker",
    "CBState",
]