kekkai-cli 1.1.0-py3-none-any.whl → 2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
portal/ops/monitoring.py DELETED
@@ -1,517 +0,0 @@
- """Monitoring and alerting system for Kekkai Portal.
-
- Provides:
- - Alert rules for auth/authz anomalies
- - Alert rules for import failures
- - Metric collection
- - Integration with audit log system
-
- ASVS 5.0 Requirements:
- - V16.4.3: Send logs to separate system
- - V16.3.2: Log failed authz
- """
-
- from __future__ import annotations
-
- import json
- import logging
- import threading
- import time
- from collections import defaultdict
- from collections.abc import Callable
- from dataclasses import dataclass, field
- from datetime import UTC, datetime, timedelta
- from enum import Enum
- from typing import Any
-
- logger = logging.getLogger(__name__)
-
-
- class AlertSeverity(Enum):
-     """Alert severity levels."""
-
-     INFO = "info"
-     WARNING = "warning"
-     CRITICAL = "critical"
-
-
- class AlertType(Enum):
-     """Types of alerts."""
-
-     AUTH_FAILURE_SPIKE = "auth_failure_spike"
-     AUTH_BRUTE_FORCE = "auth_brute_force"
-     AUTHZ_DENIAL = "authz_denial"
-     CROSS_TENANT_ATTEMPT = "cross_tenant_attempt"
-     IMPORT_FAILURE = "import_failure"
-     BACKUP_FAILURE = "backup_failure"
-     SYSTEM_ERROR = "system_error"
-     SAML_REPLAY = "saml_replay"
-     LICENSE_EXPIRED = "license_expired"
-
-
- @dataclass
- class AlertRule:
-     """Definition of an alert rule."""
-
-     name: str
-     alert_type: AlertType
-     severity: AlertSeverity
-     threshold: int
-     window_seconds: int
-     description: str = ""
-     enabled: bool = True
-     cooldown_seconds: int = 300
-
-     def to_dict(self) -> dict[str, Any]:
-         """Convert to dictionary."""
-         return {
-             "name": self.name,
-             "alert_type": self.alert_type.value,
-             "severity": self.severity.value,
-             "threshold": self.threshold,
-             "window_seconds": self.window_seconds,
-             "description": self.description,
-             "enabled": self.enabled,
-             "cooldown_seconds": self.cooldown_seconds,
-         }
-
-
- @dataclass
- class Alert:
-     """Represents a triggered alert."""
-
-     rule_name: str
-     alert_type: AlertType
-     severity: AlertSeverity
-     timestamp: datetime
-     message: str
-     details: dict[str, Any] = field(default_factory=dict)
-     alert_id: str = ""
-
-     def __post_init__(self) -> None:
-         if not self.alert_id:
-             import secrets
-
-             self.alert_id = f"alert_{int(time.time())}_{secrets.token_hex(4)}"
-
-     def to_dict(self) -> dict[str, Any]:
-         """Convert to dictionary."""
-         return {
-             "alert_id": self.alert_id,
-             "rule_name": self.rule_name,
-             "alert_type": self.alert_type.value,
-             "severity": self.severity.value,
-             "timestamp": self.timestamp.isoformat(),
-             "message": self.message,
-             "details": self.details,
-         }
-
-
- @dataclass
- class MonitoringConfig:
-     """Configuration for monitoring service."""
-
-     enabled: bool = True
-     alert_handlers: list[Callable[[Alert], None]] = field(default_factory=list)
-     metrics_retention_hours: int = 24
-     check_interval_seconds: int = 60
-
-     rules: list[AlertRule] = field(default_factory=list)
-
-     def __post_init__(self) -> None:
-         if not self.rules:
-             self.rules = get_default_rules()
-
-
- def get_default_rules() -> list[AlertRule]:
-     """Get default alert rules."""
-     return [
-         AlertRule(
-             name="auth_failure_spike",
-             alert_type=AlertType.AUTH_FAILURE_SPIKE,
-             severity=AlertSeverity.WARNING,
-             threshold=10,
-             window_seconds=300,
-             description="Multiple authentication failures in short period",
-         ),
-         AlertRule(
-             name="brute_force_detection",
-             alert_type=AlertType.AUTH_BRUTE_FORCE,
-             severity=AlertSeverity.CRITICAL,
-             threshold=5,
-             window_seconds=60,
-             description="Potential brute force attack from single IP",
-         ),
-         AlertRule(
-             name="authz_denial_alert",
-             alert_type=AlertType.AUTHZ_DENIAL,
-             severity=AlertSeverity.WARNING,
-             threshold=5,
-             window_seconds=300,
-             description="Multiple authorization denials for user",
-         ),
-         AlertRule(
-             name="cross_tenant_attempt",
-             alert_type=AlertType.CROSS_TENANT_ATTEMPT,
-             severity=AlertSeverity.CRITICAL,
-             threshold=1,
-             window_seconds=60,
-             description="Cross-tenant access attempt detected",
-         ),
-         AlertRule(
-             name="import_failure_alert",
-             alert_type=AlertType.IMPORT_FAILURE,
-             severity=AlertSeverity.WARNING,
-             threshold=3,
-             window_seconds=600,
-             description="Multiple import failures",
-         ),
-         AlertRule(
-             name="saml_replay_alert",
-             alert_type=AlertType.SAML_REPLAY,
-             severity=AlertSeverity.CRITICAL,
-             threshold=1,
-             window_seconds=60,
-             description="SAML replay attack blocked",
-         ),
-         AlertRule(
-             name="backup_failure_alert",
-             alert_type=AlertType.BACKUP_FAILURE,
-             severity=AlertSeverity.CRITICAL,
-             threshold=1,
-             window_seconds=3600,
-             description="Backup job failed",
-         ),
-     ]
-
-
- class MetricsCollector:
-     """Collects and stores metrics for monitoring."""
-
-     def __init__(self, retention_hours: int = 24) -> None:
-         self._retention_hours = retention_hours
-         self._metrics: dict[str, list[tuple[datetime, Any]]] = defaultdict(list)
-         self._counters: dict[str, int] = defaultdict(int)
-         self._lock = threading.Lock()
-
-     def increment(
-         self, metric_name: str, value: int = 1, labels: dict[str, str] | None = None
-     ) -> None:
-         """Increment a counter metric."""
-         key = self._make_key(metric_name, labels)
-         with self._lock:
-             self._counters[key] += value
-             self._metrics[key].append((datetime.now(UTC), value))
-
-     def gauge(self, metric_name: str, value: float, labels: dict[str, str] | None = None) -> None:
-         """Set a gauge metric value."""
-         key = self._make_key(metric_name, labels)
-         with self._lock:
-             self._metrics[key].append((datetime.now(UTC), value))
-
-     def get_count(
-         self, metric_name: str, window_seconds: int, labels: dict[str, str] | None = None
-     ) -> int:
-         """Get count of events within window."""
-         key = self._make_key(metric_name, labels)
-         cutoff = datetime.now(UTC) - timedelta(seconds=window_seconds)
-
-         with self._lock:
-             values = self._metrics.get(key, [])
-             return sum(v for ts, v in values if ts >= cutoff and isinstance(v, int))
-
-     def get_events_in_window(
-         self, metric_name: str, window_seconds: int, labels: dict[str, str] | None = None
-     ) -> list[tuple[datetime, Any]]:
-         """Get all events within window."""
-         key = self._make_key(metric_name, labels)
-         cutoff = datetime.now(UTC) - timedelta(seconds=window_seconds)
-
-         with self._lock:
-             values = self._metrics.get(key, [])
-             return [(ts, v) for ts, v in values if ts >= cutoff]
-
-     def cleanup_old_metrics(self) -> int:
-         """Remove metrics older than retention period. Returns count removed."""
-         cutoff = datetime.now(UTC) - timedelta(hours=self._retention_hours)
-         removed = 0
-
-         with self._lock:
-             for key in list(self._metrics.keys()):
-                 original_len = len(self._metrics[key])
-                 self._metrics[key] = [(ts, v) for ts, v in self._metrics[key] if ts >= cutoff]
-                 removed += original_len - len(self._metrics[key])
-
-         return removed
-
-     def get_all_metrics(self) -> dict[str, Any]:
-         """Get snapshot of all current metrics."""
-         with self._lock:
-             return {
-                 "counters": dict(self._counters),
-                 "metrics": {k: len(v) for k, v in self._metrics.items()},
-             }
-
-     def _make_key(self, metric_name: str, labels: dict[str, str] | None) -> str:
-         """Create metric key from name and labels."""
-         if not labels:
-             return metric_name
-         label_str = ",".join(f"{k}={v}" for k, v in sorted(labels.items()))
-         return f"{metric_name}{{{label_str}}}"
-
-
- class MonitoringService:
-     """Main monitoring service for Kekkai Portal."""
-
-     def __init__(self, config: MonitoringConfig) -> None:
-         self._config = config
-         self._metrics = MetricsCollector(config.metrics_retention_hours)
-         self._last_alert_time: dict[str, datetime] = {}
-         self._lock = threading.Lock()
-         self._running = False
-         self._check_thread: threading.Thread | None = None
-
-     def start(self) -> None:
-         """Start background monitoring."""
-         if not self._config.enabled:
-             return
-
-         self._running = True
-         self._check_thread = threading.Thread(target=self._check_loop, daemon=True)
-         self._check_thread.start()
-         logger.info("monitoring.started")
-
-     def stop(self) -> None:
-         """Stop background monitoring."""
-         self._running = False
-         if self._check_thread:
-             self._check_thread.join(timeout=5)
-         logger.info("monitoring.stopped")
-
-     def record_auth_failure(self, client_ip: str, reason: str, user_id: str | None = None) -> None:
-         """Record an authentication failure event."""
-         self._metrics.increment("auth_failures", labels={"ip": client_ip})
-         self._metrics.increment("auth_failures_total")
-
-         self._check_rule_immediate(AlertType.AUTH_FAILURE_SPIKE)
-         self._check_brute_force(client_ip)
-
-     def record_authz_denial(
-         self, user_id: str, tenant_id: str, permission: str, resource: str | None = None
-     ) -> None:
-         """Record an authorization denial event."""
-         self._metrics.increment("authz_denials", labels={"user": user_id, "tenant": tenant_id})
-         self._metrics.increment("authz_denials_total")
-
-         self._check_rule_immediate(AlertType.AUTHZ_DENIAL, {"user_id": user_id})
-
-     def record_cross_tenant_attempt(
-         self, user_id: str, source_tenant: str, target_tenant: str
-     ) -> None:
-         """Record a cross-tenant access attempt."""
-         self._metrics.increment(
-             "cross_tenant_attempts",
-             labels={"user": user_id, "source": source_tenant, "target": target_tenant},
-         )
-
-         self._trigger_alert(
-             AlertRule(
-                 name="cross_tenant_attempt",
-                 alert_type=AlertType.CROSS_TENANT_ATTEMPT,
-                 severity=AlertSeverity.CRITICAL,
-                 threshold=1,
-                 window_seconds=60,
-             ),
-             f"Cross-tenant access attempt: user={user_id} from={source_tenant} to={target_tenant}",
-             {"user_id": user_id, "source_tenant": source_tenant, "target_tenant": target_tenant},
-         )
-
-     def record_import_failure(self, tenant_id: str, reason: str) -> None:
-         """Record an import failure event."""
-         self._metrics.increment("import_failures", labels={"tenant": tenant_id})
-         self._metrics.increment("import_failures_total")
-
-         self._check_rule_immediate(AlertType.IMPORT_FAILURE, {"tenant_id": tenant_id})
-
-     def record_saml_replay_blocked(self, assertion_id: str, client_ip: str) -> None:
-         """Record a blocked SAML replay attempt."""
-         self._metrics.increment("saml_replay_blocked", labels={"ip": client_ip})
-
-         self._trigger_alert(
-             AlertRule(
-                 name="saml_replay_alert",
-                 alert_type=AlertType.SAML_REPLAY,
-                 severity=AlertSeverity.CRITICAL,
-                 threshold=1,
-                 window_seconds=60,
-             ),
-             f"SAML replay attack blocked: assertion={assertion_id[:16]}... ip={client_ip}",
-             {"assertion_id": assertion_id, "client_ip": client_ip},
-         )
-
-     def record_backup_failure(self, backup_id: str, error: str) -> None:
-         """Record a backup failure event."""
-         self._metrics.increment("backup_failures")
-
-         self._trigger_alert(
-             AlertRule(
-                 name="backup_failure_alert",
-                 alert_type=AlertType.BACKUP_FAILURE,
-                 severity=AlertSeverity.CRITICAL,
-                 threshold=1,
-                 window_seconds=3600,
-             ),
-             f"Backup failed: {backup_id}",
-             {"backup_id": backup_id, "error": error},
-         )
-
-     def get_metrics(self) -> dict[str, Any]:
-         """Get current metrics snapshot."""
-         return self._metrics.get_all_metrics()
-
-     def get_recent_alerts(self, limit: int = 100) -> list[dict[str, Any]]:
-         """Get recent alerts (from log - placeholder for full implementation)."""
-         return []
-
-     def add_alert_handler(self, handler: Callable[[Alert], None]) -> None:
-         """Add an alert handler callback."""
-         self._config.alert_handlers.append(handler)
-
-     def _check_loop(self) -> None:
-         """Background loop to check alert rules."""
-         while self._running:
-             try:
-                 self._check_all_rules()
-                 self._metrics.cleanup_old_metrics()
-             except Exception as e:
-                 logger.error("monitoring.check_error error=%s", str(e))
-
-             time.sleep(self._config.check_interval_seconds)
-
-     def _check_all_rules(self) -> None:
-         """Check all configured alert rules."""
-         for rule in self._config.rules:
-             if not rule.enabled:
-                 continue
-             self._check_rule(rule)
-
-     def _check_rule(self, rule: AlertRule) -> None:
-         """Check a single alert rule."""
-         metric_name = self._alert_type_to_metric(rule.alert_type)
-         if not metric_name:
-             return
-
-         count = self._metrics.get_count(metric_name, rule.window_seconds)
-         if count >= rule.threshold:
-             self._trigger_alert(
-                 rule, f"{rule.description}: {count} events in {rule.window_seconds}s"
-             )
-
-     def _check_rule_immediate(
-         self, alert_type: AlertType, context: dict[str, Any] | None = None
-     ) -> None:
-         """Check rules immediately for a specific alert type."""
-         for rule in self._config.rules:
-             if rule.alert_type == alert_type and rule.enabled:
-                 self._check_rule(rule)
-
-     def _check_brute_force(self, client_ip: str) -> None:
-         """Check for brute force attack from specific IP."""
-         count = self._metrics.get_count("auth_failures", 60, labels={"ip": client_ip})
-         if count >= 5:
-             rule = AlertRule(
-                 name="brute_force_detection",
-                 alert_type=AlertType.AUTH_BRUTE_FORCE,
-                 severity=AlertSeverity.CRITICAL,
-                 threshold=5,
-                 window_seconds=60,
-             )
-             self._trigger_alert(
-                 rule,
-                 f"Potential brute force attack from {client_ip}: {count} failures in 60s",
-                 {"client_ip": client_ip, "failure_count": count},
-             )
-
-     def _trigger_alert(
-         self, rule: AlertRule, message: str, details: dict[str, Any] | None = None
-     ) -> None:
-         """Trigger an alert if not in cooldown."""
-         with self._lock:
-             last_time = self._last_alert_time.get(rule.name)
-             now = datetime.now(UTC)
-
-             if last_time and (now - last_time).total_seconds() < rule.cooldown_seconds:
-                 return
-
-             self._last_alert_time[rule.name] = now
-
-         alert = Alert(
-             rule_name=rule.name,
-             alert_type=rule.alert_type,
-             severity=rule.severity,
-             timestamp=now,
-             message=message,
-             details=details or {},
-         )
-
-         logger.warning(
-             "alert.triggered rule=%s severity=%s message=%s",
-             rule.name,
-             rule.severity.value,
-             message,
-         )
-
-         for handler in self._config.alert_handlers:
-             try:
-                 handler(alert)
-             except Exception as e:
-                 logger.error("alert.handler.error handler=%s error=%s", handler.__name__, str(e))
-
-     def _alert_type_to_metric(self, alert_type: AlertType) -> str | None:
-         """Map alert type to metric name."""
-         mapping = {
-             AlertType.AUTH_FAILURE_SPIKE: "auth_failures_total",
-             AlertType.AUTHZ_DENIAL: "authz_denials_total",
-             AlertType.IMPORT_FAILURE: "import_failures_total",
-             AlertType.BACKUP_FAILURE: "backup_failures",
-         }
-         return mapping.get(alert_type)
-
-
- def create_monitoring_service(
-     enabled: bool = True,
-     rules: list[AlertRule] | None = None,
- ) -> MonitoringService:
-     """Create a configured MonitoringService instance."""
-     config = MonitoringConfig(enabled=enabled)
-     if rules:
-         config.rules = rules
-     return MonitoringService(config)
-
-
- def log_alert_handler(alert: Alert) -> None:
-     """Default alert handler that logs to file."""
-     logger.warning("ALERT: %s", json.dumps(alert.to_dict()))
-
-
- def webhook_alert_handler_factory(webhook_url: str) -> Callable[[Alert], None]:
-     """Create a webhook alert handler."""
-     import urllib.error
-     import urllib.request
-
-     def handler(alert: Alert) -> None:
-         try:
-             data = json.dumps(alert.to_dict()).encode("utf-8")
-             req = urllib.request.Request(  # noqa: S310
-                 webhook_url,
-                 data=data,
-                 headers={"Content-Type": "application/json"},
-                 method="POST",
-             )
-             with urllib.request.urlopen(req, timeout=10) as resp:  # noqa: S310
-                 if resp.status >= 400:
-                     logger.error("webhook.failed status=%d", resp.status)
-         except urllib.error.URLError as e:
-             logger.error("webhook.error url=%s error=%s", webhook_url, str(e))
-
-     return handler
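
For context, here is a minimal sketch of how the removed module's public surface fit together, reconstructed from the deleted source above. The import path portal.ops.monitoring is taken from the file path in this diff, and the IP address and webhook URL are placeholders; whether 2.0.0 ships a replacement is not visible from this hunk.

# Hypothetical usage of the removed 1.x module; the import path is assumed
# from the diff's file path, and the endpoint/IP values are invented.
from portal.ops.monitoring import (
    create_monitoring_service,
    log_alert_handler,
    webhook_alert_handler_factory,
)

service = create_monitoring_service(enabled=True)  # falls back to get_default_rules()
service.add_alert_handler(log_alert_handler)  # logs each Alert as a JSON line
service.add_alert_handler(webhook_alert_handler_factory("https://ops.example.com/hook"))
service.start()  # daemon thread re-checks rules every check_interval_seconds

# Five failures from one IP inside 60s trips the hard-coded brute-force
# check in record_auth_failure -> _check_brute_force (CRITICAL alert).
for _ in range(5):
    service.record_auth_failure(client_ip="203.0.113.7", reason="bad password")

service.stop()

Note that _trigger_alert invokes each handler inside its own try/except, so a failing webhook cannot block the log handler, and the per-rule cooldown_seconds (default 300) suppresses duplicate alerts while an incident is ongoing.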
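
The factory also accepted a custom rule list, so operators could tighten thresholds without patching the module; a sketch against the same deleted API, with values chosen purely for illustration:

from portal.ops.monitoring import (
    AlertRule,
    AlertSeverity,
    AlertType,
    create_monitoring_service,
)

# Rule-driven alert types omitted from a custom list stop firing from the
# periodic checker; the hard-coded paths (brute force, cross-tenant, SAML
# replay, backup) build inline rules and fire regardless.
custom_rules = [
    AlertRule(
        name="auth_failure_spike",
        alert_type=AlertType.AUTH_FAILURE_SPIKE,
        severity=AlertSeverity.CRITICAL,
        threshold=5,  # default was 10
        window_seconds=120,  # default was 300
        description="Authentication failures above tightened threshold",
    ),
]
service = create_monitoring_service(rules=custom_rules)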