delimit-cli 4.1.44 → 4.1.48

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,569 @@
1
+ """Governance Hardening for Proactive Auto-Triggers (LED-661).
2
+
3
+ Provides four hardening primitives that wrap MCP tool calls and loop engine
4
+ operations with resilience guarantees:
5
+
6
+ - ResilientToolCaller: retry with exponential backoff, timeout, fallback
7
+ - ApprovalFlow: email-based approve/reject for founder decisions
8
+ - TriggerDebouncer: per-tool cooldowns to prevent notification storms
9
+ - ChainCircuitBreaker: halt chains after consecutive failures
10
+
11
+ All classes are opt-in. When not configured, existing behavior is unchanged.
12
+ Wire into loop_engine.run_governed_iteration() via GovernanceHardeningConfig.
13
+ """
14
+
15
+ import json
16
+ import logging
17
+ import time
18
+ import uuid
19
+ from datetime import datetime, timezone
20
+ from pathlib import Path
21
+ from typing import Any, Callable, Dict, List, Optional
22
+
23
+ logger = logging.getLogger("delimit.ai.governance_hardening")
24
+
25
+
26
+ # ── ResilientToolCaller ─────────────────────────────────────────────────
27
+
28
class ResilientToolCaller:
    """Wrap MCP tool calls with retry, exponential backoff, and fallback.

    Parameters:
        max_retries: Total number of attempts made per call (default 3).
            Values below 1 are treated as 1 so the wrapped function is
            always tried at least once.
        base_delay: Initial delay in seconds for exponential backoff (default 1.0).
        max_delay: Cap on backoff delay in seconds (default 30.0).
        timeout: Per-call timeout in seconds (default 60.0). NOTE: the value
            is only stored on the instance; ``call()`` does not enforce it.
        fallback: Optional callable returning a fallback result on total
            failure. It receives the same arguments as the wrapped function.
    """

    def __init__(
        self,
        max_retries: int = 3,
        base_delay: float = 1.0,
        max_delay: float = 30.0,
        timeout: float = 60.0,
        fallback: Optional[Callable[..., Any]] = None,
    ):
        self.max_retries = max_retries
        self.base_delay = base_delay
        self.max_delay = max_delay
        self.timeout = timeout
        self.fallback = fallback
        # One entry per attempt (successful or not), oldest first.
        self._call_log: List[Dict[str, Any]] = []

    def call(self, fn: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
        """Execute fn with retry and exponential backoff.

        Returns the result on success, or the fallback result if all attempts
        are exhausted and a fallback is configured. Raises the last exception
        if no fallback is available.
        """
        fn_name = getattr(fn, "__name__", str(fn))
        # Guard against max_retries <= 0: without it the loop body never runs
        # and the method would end in `raise None` (a TypeError) or return the
        # fallback without ever calling fn.
        attempts = max(1, self.max_retries)
        last_error: Optional[Exception] = None

        for attempt in range(1, attempts + 1):
            start = time.monotonic()
            try:
                result = fn(*args, **kwargs)
            except Exception as exc:
                elapsed = time.monotonic() - start
                last_error = exc
                self._call_log.append({
                    "fn": fn_name,
                    "attempt": attempt,
                    "status": "error",
                    "error": str(exc),
                    "elapsed": round(elapsed, 3),
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                })
                logger.warning(
                    "ResilientToolCaller: %s attempt %d/%d failed: %s",
                    fn_name, attempt, attempts, exc,
                )

                # Sleep only between attempts, never after the final one.
                if attempt < attempts:
                    delay = min(self.base_delay * (2 ** (attempt - 1)), self.max_delay)
                    time.sleep(delay)
            else:
                elapsed = time.monotonic() - start
                self._call_log.append({
                    "fn": fn_name,
                    "attempt": attempt,
                    "status": "success",
                    "elapsed": round(elapsed, 3),
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                })
                return result

        # All attempts exhausted
        if self.fallback is not None:
            logger.info("ResilientToolCaller: falling back for %s", fn_name)
            return self.fallback(*args, **kwargs)

        # At least one attempt ran and failed, so last_error is set here.
        raise last_error  # type: ignore[misc]

    @property
    def call_log(self) -> List[Dict[str, Any]]:
        """Return a shallow copy of the per-attempt call log."""
        return list(self._call_log)

    def reset_log(self) -> None:
        """Discard all recorded call-log entries."""
        self._call_log.clear()
109
+
110
+
111
+ # ── ApprovalFlow ────────────────────────────────────────────────────────
112
+
113
class ApprovalFlow:
    """Email-based approval flow for founder decisions.

    Sends an approval request via email and lets the caller check the inbox
    for a response. Requests expire after a configurable period.

    Parameters:
        send_fn: Callable to send an email. Signature: (subject, body, priority) -> None.
        poll_fn: Callable to poll inbox. Signature: () -> List[Dict] of messages.
            Each message is read through its "body" and "subject" keys.
        timeout_seconds: Max wait time (default 86400 = 24h).
        poll_interval: Seconds between inbox checks (default 300 = 5min).
            Stored only; this class never sleeps or polls on its own.
        default_action: Action when timeout expires ("reject" or "approve").
        state_dir: Directory to persist pending approval state.
    """

    PENDING_FILE = "pending_approvals.json"

    # Upper-cased prefixes of the lines request_approval() writes into the
    # outgoing email. Such lines are ignored when reading a decision so that
    # a reply echoing the request ("Reply APPROVE or REJECT ...") cannot be
    # mistaken for the founder's answer.
    _TEMPLATE_PREFIXES = (
        "ACTION:",
        "CONTEXT:",
        "REQUEST ID:",
        "REPLY APPROVE OR REJECT",
        "AUTO-",
    )

    def __init__(
        self,
        send_fn: Optional[Callable] = None,
        poll_fn: Optional[Callable] = None,
        timeout_seconds: float = 86400,
        poll_interval: float = 300,
        default_action: str = "reject",
        state_dir: Optional[Path] = None,
    ):
        self.send_fn = send_fn
        self.poll_fn = poll_fn
        self.timeout_seconds = timeout_seconds
        self.poll_interval = poll_interval
        self.default_action = default_action
        self.state_dir = state_dir or Path.home() / ".delimit" / "loop" / "approvals"
        # request_id -> persisted approval record
        self._pending: Dict[str, Dict[str, Any]] = {}
        self._load_state()

    def _load_state(self) -> None:
        """Load persisted records; unreadable or malformed state yields {}."""
        self.state_dir.mkdir(parents=True, exist_ok=True)
        path = self.state_dir / self.PENDING_FILE
        if not path.exists():
            return
        try:
            loaded = json.loads(path.read_text(encoding="utf-8"))
        except (ValueError, OSError):
            # ValueError covers both invalid JSON and undecodable bytes.
            loaded = {}
        # A valid JSON document that is not an object (e.g. a list) would
        # break every later lookup, so treat it as empty state too.
        self._pending = loaded if isinstance(loaded, dict) else {}

    def _save_state(self) -> None:
        """Persist all records, replacing the state file in one rename."""
        self.state_dir.mkdir(parents=True, exist_ok=True)
        path = self.state_dir / self.PENDING_FILE
        # Write to a sibling temp file first so a crash mid-write cannot leave
        # a truncated state file behind.
        tmp_path = path.with_name(path.name + ".tmp")
        tmp_path.write_text(json.dumps(self._pending, indent=2), encoding="utf-8")
        tmp_path.replace(path)

    @classmethod
    def _parse_decision(cls, text: str) -> Optional[str]:
        """Return "approved", "rejected", or None for a reply's text.

        Only the reply's own lines are considered: quoted lines (starting
        with ">"), everything after a reply header ("On ... wrote:" or
        "-----Original Message-----"), and lines copied from the request
        template are ignored. The first word starting with APPROVE or REJECT
        wins, so "DISAPPROVE" is not read as an approval.
        """
        for raw_line in text.splitlines():
            line = raw_line.strip()
            if not line or line.startswith(">"):
                continue
            upper = line.upper()
            if upper.startswith("-----ORIGINAL MESSAGE") or (
                upper.startswith("ON ") and upper.endswith("WROTE:")
            ):
                break  # everything below is the quoted request
            if upper.startswith(cls._TEMPLATE_PREFIXES):
                continue
            words = "".join(ch if ch.isalpha() else " " for ch in upper).split()
            for word in words:
                if word.startswith("APPROVE"):
                    return "approved"
                if word.startswith("REJECT"):
                    return "rejected"
        return None

    def request_approval(
        self,
        action_description: str,
        context: str = "",
        priority: str = "P1",
    ) -> str:
        """Send an approval request email and return a request ID.

        The request is persisted so it survives process restarts. A failure
        to send the email is logged and recorded on the request
        ("email_sent"/"email_error") rather than raised.
        """
        request_id = f"approval-{uuid.uuid4().hex[:8]}"
        subject = f"[Delimit Approval] {action_description[:80]}"
        body = (
            f"Action: {action_description}\n"
            f"Context: {context}\n"
            f"Request ID: {request_id}\n\n"
            f"Reply APPROVE or REJECT to this email.\n"
            f"Auto-{self.default_action} in {self.timeout_seconds // 3600}h if no response."
        )

        record = {
            "request_id": request_id,
            "action": action_description,
            "context": context,
            "priority": priority,
            "status": "pending",
            "created_at": datetime.now(timezone.utc).isoformat(),
            "timeout_at": datetime.fromtimestamp(
                time.time() + self.timeout_seconds, tz=timezone.utc
            ).isoformat(),
        }
        self._pending[request_id] = record

        if self.send_fn:
            try:
                self.send_fn(subject, body, priority)
                record["email_sent"] = True
            except Exception as exc:
                logger.error("ApprovalFlow: failed to send email: %s", exc)
                record["email_sent"] = False
                record["email_error"] = str(exc)

        self._save_state()
        return request_id

    def check_approval(self, request_id: str) -> Dict[str, Any]:
        """Check the status of an approval request.

        Polls the inbox (when poll_fn is set) for a message that mentions the
        request ID in its body or subject and carries a decision.

        Returns {"status": ..., "request_id": ...} where status is one of
        "approved", "rejected", "pending", "not_found", or
        "timed_out_<default_action>" once the request has expired.
        """
        record = self._pending.get(request_id)
        if not record:
            return {"status": "not_found", "request_id": request_id}

        if record["status"] != "pending":
            return {"status": record["status"], "request_id": request_id}

        # Check timeout
        timeout_at = datetime.fromisoformat(record["timeout_at"])
        if datetime.now(timezone.utc) >= timeout_at:
            record["status"] = f"timed_out_{self.default_action}"
            self._save_state()
            return {"status": record["status"], "request_id": request_id}

        # Poll inbox
        if self.poll_fn:
            try:
                messages = self.poll_fn() or []
                for msg in messages:
                    if not isinstance(msg, dict):
                        continue
                    body_text = str(msg.get("body", ""))
                    subject_text = str(msg.get("subject", ""))
                    # The ID may sit in the quoted part of the reply, so match
                    # it against the full message.
                    if request_id not in body_text and request_id not in subject_text:
                        continue
                    # The decision is read from the body, or from the subject
                    # when the body is empty.
                    decision = self._parse_decision(
                        str(msg.get("body", "") or msg.get("subject", ""))
                    )
                    if decision is None:
                        continue
                    record["status"] = decision
                    record["resolved_at"] = datetime.now(timezone.utc).isoformat()
                    self._save_state()
                    return {"status": decision, "request_id": request_id}
            except Exception as exc:
                logger.warning("ApprovalFlow: poll failed: %s", exc)

        return {"status": "pending", "request_id": request_id}

    @property
    def pending_requests(self) -> List[Dict[str, Any]]:
        """Return the records whose status is still "pending"."""
        return [r for r in self._pending.values() if r.get("status") == "pending"]
252
+
253
+
254
+ # ── TriggerDebouncer ────────────────────────────────────────────────────
255
+
256
class TriggerDebouncer:
    """Prevent storms of tool calls by enforcing per-tool cooldowns.

    Parameters:
        default_cooldown: Default cooldown in seconds (default 300 = 5min).
        tool_cooldowns: Dict mapping tool names to specific cooldowns.
        max_calls_per_hour: Global rate limit across all tools (default 5).
    """

    # Length of the sliding window used for the global rate limit.
    _WINDOW_SECONDS = 3600

    def __init__(
        self,
        default_cooldown: float = 300.0,
        tool_cooldowns: Optional[Dict[str, float]] = None,
        max_calls_per_hour: int = 5,
    ):
        self.default_cooldown = default_cooldown
        self.tool_cooldowns = tool_cooldowns or {}
        self.max_calls_per_hour = max_calls_per_hour
        self._last_call: Dict[str, float] = {}  # tool_name -> monotonic timestamp
        self._hourly_calls: List[float] = []  # monotonic timestamps of all calls

    def _recent_calls(self, now: float) -> List[float]:
        """Drop timestamps older than the window and return the remainder."""
        cutoff = now - self._WINDOW_SECONDS
        self._hourly_calls = [t for t in self._hourly_calls if t > cutoff]
        return self._hourly_calls

    def can_fire(self, tool_name: str) -> bool:
        """Check if the tool is allowed to fire (respects cooldown and rate limit)."""
        now = time.monotonic()

        # Per-tool cooldown check
        cooldown = self.tool_cooldowns.get(tool_name, self.default_cooldown)
        last = self._last_call.get(tool_name)
        if last is not None and (now - last) < cooldown:
            return False

        # Global hourly rate limit
        return len(self._recent_calls(now)) < self.max_calls_per_hour

    def record_call(self, tool_name: str) -> None:
        """Record that a tool was fired."""
        now = time.monotonic()
        self._last_call[tool_name] = now
        self._hourly_calls.append(now)

    def try_fire(self, tool_name: str, fn: Callable[..., Any], *args: Any, **kwargs: Any) -> Optional[Any]:
        """Fire the tool only if debounce allows it. Returns None if suppressed.

        The call is recorded only after fn returns; if fn raises, the
        exception propagates and nothing is recorded.
        """
        if not self.can_fire(tool_name):
            logger.debug("TriggerDebouncer: suppressed %s (cooldown)", tool_name)
            return None
        result = fn(*args, **kwargs)
        self.record_call(tool_name)
        return result

    def time_until_allowed(self, tool_name: str) -> float:
        """Seconds until this tool can fire again. 0.0 if allowed now.

        Accounts for both the per-tool cooldown and the global hourly limit,
        so a positive value is returned whenever can_fire() is False. The one
        exception is a non-positive max_calls_per_hour, which blocks every
        call forever; only the cooldown is reported in that case.
        """
        now = time.monotonic()
        wait = 0.0

        last = self._last_call.get(tool_name)
        if last is not None:
            cooldown = self.tool_cooldowns.get(tool_name, self.default_cooldown)
            wait = max(0.0, cooldown - (now - last))

        recent = self._recent_calls(now)
        limit = self.max_calls_per_hour
        if 0 < limit <= len(recent):
            # Timestamps are appended in increasing order, so the window
            # drops below the limit once this entry ages out.
            blocking = recent[len(recent) - limit]
            wait = max(wait, blocking + self._WINDOW_SECONDS - now)

        return wait

    def reset(self, tool_name: Optional[str] = None) -> None:
        """Reset cooldown state for a specific tool or all tools.

        With a tool name only that tool's cooldown is cleared; with None the
        cooldowns of all tools and the hourly call history are cleared.
        """
        # Compare against None so an empty-string tool name is not mistaken
        # for "reset everything".
        if tool_name is not None:
            self._last_call.pop(tool_name, None)
        else:
            self._last_call.clear()
            self._hourly_calls.clear()
327
+
328
+
329
+ # ── ChainCircuitBreaker ────────────────────────────────────────────────
330
+
331
class ChainCircuitBreaker:
    """Stop a chain of tool calls once failures pile up back to back.

    Parameters:
        failure_threshold: How many failures in a row open the breaker (default 3).
        recovery_timeout: Seconds the breaker stays open before a probe call
            is permitted (default 300 = 5min).
        notify_fn: Optional callback invoked as notify_fn(error, failure_count)
            each time a recorded failure leaves the count at or above the
            threshold.
    """

    STATE_CLOSED = "closed"        # Normal operation
    STATE_OPEN = "open"            # Breaker tripped, rejecting calls
    STATE_HALF_OPEN = "half_open"  # Recovery timeout elapsed, probing allowed

    def __init__(
        self,
        failure_threshold: int = 3,
        recovery_timeout: float = 300.0,
        notify_fn: Optional[Callable[[str, int], None]] = None,
    ):
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.notify_fn = notify_fn

        self._state: str = self.STATE_CLOSED
        self._consecutive_failures: int = 0
        self._last_failure_time: float = 0.0
        self._failure_log: List[Dict[str, Any]] = []

    @property
    def state(self) -> str:
        """Report the breaker state, promoting open -> half_open once the
        recovery timeout has elapsed since the last failure."""
        cooled_down = (
            time.monotonic() - self._last_failure_time >= self.recovery_timeout
        )
        if self._state == self.STATE_OPEN and cooled_down:
            self._state = self.STATE_HALF_OPEN
        return self._state

    @property
    def consecutive_failures(self) -> int:
        """Number of failures recorded since the last success or reset."""
        return self._consecutive_failures

    def record_success(self) -> None:
        """Note a successful call: zero the failure count and close the breaker."""
        self._state = self.STATE_CLOSED
        self._consecutive_failures = 0

    def record_failure(self, error: str = "") -> None:
        """Note a failed call and open the breaker if the threshold is reached."""
        self._last_failure_time = time.monotonic()
        self._consecutive_failures += 1
        entry = {
            "error": error,
            "count": self._consecutive_failures,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
        self._failure_log.append(entry)

        if self._consecutive_failures < self.failure_threshold:
            return

        self._state = self.STATE_OPEN
        logger.warning(
            "ChainCircuitBreaker: tripped after %d consecutive failures",
            self._consecutive_failures,
        )
        if not self.notify_fn:
            return
        try:
            self.notify_fn(error, self._consecutive_failures)
        except Exception as exc:
            # A broken notifier must not mask the failure being recorded.
            logger.error("ChainCircuitBreaker: notify_fn failed: %s", exc)

    def allow_call(self) -> bool:
        """Tell whether a call may proceed right now (closed or half-open)."""
        # Reading .state also applies the recovery-timeout transition.
        return self.state in (self.STATE_CLOSED, self.STATE_HALF_OPEN)

    def execute(self, fn: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
        """Run fn under the breaker's protection.

        Raises CircuitBreakerOpen when the breaker is open. Otherwise calls
        fn, recording a success on return or a failure (then re-raising) on
        any exception.
        """
        if not self.allow_call():
            seconds_left = self.recovery_timeout - (
                time.monotonic() - self._last_failure_time
            )
            raise CircuitBreakerOpen(
                f"Circuit breaker is open after {self._consecutive_failures} failures. "
                f"Recovery in {seconds_left:.0f}s."
            )

        try:
            outcome = fn(*args, **kwargs)
            self.record_success()
            return outcome
        except Exception as exc:
            self.record_failure(str(exc))
            raise

    def reset(self) -> None:
        """Force the breaker closed and forget the failure count and log."""
        self._failure_log.clear()
        self._consecutive_failures = 0
        self._state = self.STATE_CLOSED

    @property
    def failure_log(self) -> List[Dict[str, Any]]:
        """Shallow copy of the recorded failure entries, oldest first."""
        return list(self._failure_log)
436
+
437
+
438
class CircuitBreakerOpen(Exception):
    """Signals that a call was refused because the circuit breaker is open."""
441
+
442
+
443
+ # ── GovernanceHardeningConfig ───────────────────────────────────────────
444
+
445
class GovernanceHardeningConfig:
    """Central configuration for governance hardening.

    Holds the optional hardening components. Opt-in: any component left as
    None is simply absent, and a config with all components None is inactive.
    Wire into loop_engine.run_governed_iteration() to enable hardening.
    """

    def __init__(
        self,
        resilient_caller: Optional["ResilientToolCaller"] = None,
        approval_flow: Optional["ApprovalFlow"] = None,
        debouncer: Optional["TriggerDebouncer"] = None,
        circuit_breaker: Optional["ChainCircuitBreaker"] = None,
    ):
        self.resilient_caller = resilient_caller
        self.approval_flow = approval_flow
        self.debouncer = debouncer
        self.circuit_breaker = circuit_breaker

    @classmethod
    def from_dict(cls, config: Dict[str, Any]) -> "GovernanceHardeningConfig":
        """Create from a config dict (e.g., loaded from settings.json).

        Reads the "governance_hardening" section. When that section is
        missing, null, or not enabled, an all-None (passthrough) config is
        returned. The resilient caller, debouncer, and circuit breaker are
        built unless their sub-section sets "enabled" to false; the approval
        flow is built only when its sub-section sets "enabled" to true.
        A null section or sub-section is treated like an empty one.
        """
        # `or {}` rather than a .get() default: a JSON null yields None, which
        # a default would not replace.
        hardening = (config or {}).get("governance_hardening") or {}
        if not hardening.get("enabled", False):
            return cls()  # All None = passthrough

        rc_cfg = hardening.get("resilient_caller") or {}
        af_cfg = hardening.get("approval_flow") or {}
        db_cfg = hardening.get("debouncer") or {}
        cb_cfg = hardening.get("circuit_breaker") or {}

        resilient_caller = None
        if rc_cfg.get("enabled", True):
            resilient_caller = ResilientToolCaller(
                max_retries=rc_cfg.get("max_retries", 3),
                base_delay=rc_cfg.get("base_delay", 1.0),
                max_delay=rc_cfg.get("max_delay", 30.0),
                timeout=rc_cfg.get("timeout", 60.0),
            )

        debouncer = None
        if db_cfg.get("enabled", True):
            debouncer = TriggerDebouncer(
                default_cooldown=db_cfg.get("default_cooldown", 300),
                tool_cooldowns=db_cfg.get("tool_cooldowns") or {},
                max_calls_per_hour=db_cfg.get("max_calls_per_hour", 5),
            )

        circuit_breaker = None
        if cb_cfg.get("enabled", True):
            circuit_breaker = ChainCircuitBreaker(
                failure_threshold=cb_cfg.get("failure_threshold", 3),
                recovery_timeout=cb_cfg.get("recovery_timeout", 300),
            )

        # ApprovalFlow needs send/poll functions — created without them here;
        # caller should inject the actual functions after construction.
        approval_flow = None
        if af_cfg.get("enabled", False):
            # Optional "state_dir" overrides where approval state is stored;
            # when absent ApprovalFlow falls back to its own default.
            state_dir = af_cfg.get("state_dir")
            approval_flow = ApprovalFlow(
                timeout_seconds=af_cfg.get("timeout_seconds", 86400),
                poll_interval=af_cfg.get("poll_interval", 300),
                default_action=af_cfg.get("default_action", "reject"),
                state_dir=Path(state_dir).expanduser() if state_dir else None,
            )

        return cls(
            resilient_caller=resilient_caller,
            approval_flow=approval_flow,
            debouncer=debouncer,
            circuit_breaker=circuit_breaker,
        )

    def is_active(self) -> bool:
        """True if any hardening component is configured."""
        components = (
            self.resilient_caller,
            self.approval_flow,
            self.debouncer,
            self.circuit_breaker,
        )
        return any(component is not None for component in components)
517
+
518
+
519
+ # ── Integration helper for loop_engine ──────────────────────────────────
520
+
521
def hardened_dispatch(
    config: "GovernanceHardeningConfig",
    dispatch_fn: Callable[..., Any],
    tool_name: str = "dispatch_task",
    **kwargs: Any,
) -> Any:
    """Send a dispatch call through every configured hardening layer.

    Layers run in the order debouncer -> circuit breaker -> resilient caller
    -> dispatch_fn; a layer that is not configured is skipped.

    Returns the dispatch result, or a status dict ("debounced" or
    "circuit_open") when a gate refuses the call. On success the circuit
    breaker and debouncer are updated; on any exception the failure is
    recorded on the circuit breaker and the exception is re-raised.
    """
    debouncer = config.debouncer
    breaker = config.circuit_breaker

    # Gate 1: per-tool debounce
    if debouncer and not debouncer.can_fire(tool_name):
        wait_seconds = debouncer.time_until_allowed(tool_name)
        return {
            "status": "debounced",
            "tool": tool_name,
            "retry_in_seconds": round(wait_seconds, 1),
        }

    # Gate 2: circuit breaker
    if breaker and not breaker.allow_call():
        return {
            "status": "circuit_open",
            "tool": tool_name,
            "consecutive_failures": breaker.consecutive_failures,
        }

    # Gate 3: run the dispatch, via the resilient caller when one is present
    try:
        caller = config.resilient_caller
        outcome = caller.call(dispatch_fn, **kwargs) if caller else dispatch_fn(**kwargs)

        # Bookkeeping stays inside the try so an error here is also counted
        # as a failure on the breaker.
        if breaker:
            breaker.record_success()
        if debouncer:
            debouncer.record_call(tool_name)

        return outcome
    except Exception as exc:
        if breaker:
            breaker.record_failure(str(exc))
        raise