kryten-robot 0.6.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,482 @@
1
+ """Health monitoring system for Kryten.
2
+
3
+ This module provides HTTP health check endpoints and metrics tracking for
4
+ operational visibility and integration with orchestration platforms (Kubernetes, systemd).
5
+
6
+ The health monitor runs an HTTP server on a separate thread to avoid blocking
7
+ the async event loop, exposing:
8
+ - /health - JSON health status (200 OK or 503 Service Unavailable)
9
+ - /metrics - Prometheus-compatible metrics (optional)
10
+
11
+ Examples:
12
+ Basic usage:
13
+ >>> monitor = HealthMonitor(
14
+ ... connector=cytube_connector,
15
+ ... nats_client=nats_client,
16
+ ... publisher=event_publisher,
17
+ ... logger=logger,
18
+ ... port=8080
19
+ ... )
20
+ >>> monitor.start()
21
+ >>> # Health available at http://localhost:8080/health
22
+ >>> monitor.stop()
23
+
24
+ Integration with Kubernetes:
25
+ apiVersion: v1
26
+ kind: Pod
27
+ spec:
28
+ containers:
29
+ - name: kryten
30
+ livenessProbe:
31
+ httpGet:
32
+ path: /health
33
+ port: 8080
34
+ initialDelaySeconds: 10
35
+ periodSeconds: 30
36
+
37
+ Note:
38
+ The health server runs on a separate thread to ensure health checks
39
+ remain responsive even if the main event loop is busy or blocked.
40
+ """
41
+
42
+ import json
43
+ import logging
44
+ import time
45
+ from http.server import BaseHTTPRequestHandler, HTTPServer
46
+ from threading import Thread
47
+ from typing import TYPE_CHECKING, Optional
48
+
49
+ from .cytube_connector import CytubeConnector
50
+ from .event_publisher import EventPublisher
51
+ from .nats_client import NatsClient
52
+
53
+ if TYPE_CHECKING:
54
+ from .command_subscriber import CommandSubscriber
55
+
56
+
57
class HealthStatus:
    """Aggregates component states and counters into an overall health view.

    Combines the CyTube connector, NATS client, event publisher, and an
    optional command subscriber into a single health verdict, a
    JSON-friendly status dictionary, and Prometheus text-format metrics.

    Attributes:
        connector: CytubeConnector supplying `is_connected` and `stats`.
        nats_client: NatsClient supplying `is_connected` and `stats`.
        publisher: EventPublisher supplying `is_running` and `stats`.
        command_subscriber: Optional CommandSubscriber for command metrics.
        start_time: Timestamp captured at construction; basis for uptime.
    """

    def __init__(
        self,
        connector: CytubeConnector,
        nats_client: NatsClient,
        publisher: EventPublisher,
        command_subscriber: Optional["CommandSubscriber"] = None,
    ):
        """Capture component references and record the start timestamp.

        Args:
            connector: CytubeConnector instance.
            nats_client: NatsClient instance.
            publisher: EventPublisher instance.
            command_subscriber: CommandSubscriber instance (optional).
        """
        self.connector = connector
        self.nats_client = nats_client
        self.publisher = publisher
        self.command_subscriber = command_subscriber
        self.start_time = time.time()

    def is_healthy(self) -> bool:
        """Report whether every critical component is up.

        The command subscriber is deliberately excluded: it is optional
        and its absence must not mark the service unhealthy.

        Returns:
            True only when the CyTube connector and NATS client are both
            connected and the publisher loop is running.

        Examples:
            >>> status = HealthStatus(connector, nats, publisher)
            >>> if status.is_healthy():
            ...     print("System healthy")
        """
        transports_up = self.connector.is_connected and self.nats_client.is_connected
        return transports_up and self.publisher.is_running

    def get_status_dict(self) -> dict:
        """Build the full health report served by the /health endpoint.

        Returns:
            Dictionary with overall status, uptime, per-component states,
            and aggregated counters.

        Examples:
            >>> status = HealthStatus(connector, nats, publisher)
            >>> status.get_status_dict()["status"]  # "healthy" or "unhealthy"
        """
        elapsed = time.time() - self.start_time

        # Read each stats property once.
        connector_stats = self.connector.stats
        nats_stats = self.nats_client.stats
        publisher_stats = self.publisher.stats

        component_states = {
            "cytube_connector": "connected" if self.connector.is_connected else "disconnected",
            "nats_client": "connected" if self.nats_client.is_connected else "disconnected",
            "event_publisher": "running" if self.publisher.is_running else "stopped",
        }
        if self.command_subscriber:
            component_states["command_subscriber"] = (
                "running" if self.command_subscriber.is_running else "stopped"
            )

        counters = {
            "events_received": connector_stats.get("events_processed", 0),
            "events_published": publisher_stats.get("events_published", 0),
            "publish_errors": publisher_stats.get("publish_errors", 0),
            "nats_bytes_sent": nats_stats.get("bytes_sent", 0),
        }
        if self.command_subscriber:
            sub_stats = self.command_subscriber.stats
            counters["commands_processed"] = sub_stats.get("commands_processed", 0)
            counters["commands_failed"] = sub_stats.get("commands_failed", 0)

        return {
            "status": "healthy" if self.is_healthy() else "unhealthy",
            "uptime_seconds": round(elapsed, 2),
            "components": component_states,
            "metrics": counters,
        }

    def get_prometheus_metrics(self) -> str:
        """Render metrics in the Prometheus text exposition format.

        Returns:
            Metrics text: HELP/TYPE header pairs followed by sample lines,
            families separated by blank lines, ending with a newline.

        Examples:
            >>> status = HealthStatus(connector, nats, publisher)
            >>> print(status.get_prometheus_metrics())
            # HELP kryten_up Whether Kryten is up (1) or down (0)
            # TYPE kryten_up gauge
            kryten_up 1
            ...
        """
        elapsed = time.time() - self.start_time

        # Read each stats property once.
        connector_stats = self.connector.stats
        nats_stats = self.nats_client.stats
        publisher_stats = self.publisher.stats

        out = [
            "# HELP kryten_up Whether Kryten is up (1) or down (0)",
            "# TYPE kryten_up gauge",
            f"kryten_up {1 if self.is_healthy() else 0}",
            "",
            "# HELP kryten_uptime_seconds Time since application start",
            "# TYPE kryten_uptime_seconds counter",
            f"kryten_uptime_seconds {elapsed:.2f}",
            "",
            "# HELP kryten_events_received_total Events received from CyTube",
            "# TYPE kryten_events_received_total counter",
            f"kryten_events_received_total {connector_stats.get('events_processed', 0)}",
            "",
            "# HELP kryten_events_published_total Events published to NATS",
            "# TYPE kryten_events_published_total counter",
            f"kryten_events_published_total {publisher_stats.get('events_published', 0)}",
            "",
            "# HELP kryten_publish_errors_total Publishing errors",
            "# TYPE kryten_publish_errors_total counter",
            f"kryten_publish_errors_total {publisher_stats.get('publish_errors', 0)}",
            "",
            "# HELP kryten_nats_bytes_sent_total Bytes sent to NATS",
            "# TYPE kryten_nats_bytes_sent_total counter",
            f"kryten_nats_bytes_sent_total {nats_stats.get('bytes_sent', 0)}",
            "",
            "# HELP kryten_component_connected Component connection status (1=connected, 0=disconnected)",
            "# TYPE kryten_component_connected gauge",
            f'kryten_component_connected{{component="cytube"}} {1 if self.connector.is_connected else 0}',
            f'kryten_component_connected{{component="nats"}} {1 if self.nats_client.is_connected else 0}',
            f'kryten_component_connected{{component="publisher"}} {1 if self.publisher.is_running else 0}',
        ]

        # Command metrics are only exposed when the subscriber exists.
        if self.command_subscriber:
            sub_stats = self.command_subscriber.stats
            out += [
                f'kryten_component_connected{{component="command_subscriber"}} {1 if self.command_subscriber.is_running else 0}',
                "",
                "# HELP kryten_commands_processed_total Commands received and executed",
                "# TYPE kryten_commands_processed_total counter",
                f"kryten_commands_processed_total {sub_stats.get('commands_processed', 0)}",
                "",
                "# HELP kryten_commands_failed_total Commands that failed to execute",
                "# TYPE kryten_commands_failed_total counter",
                f"kryten_commands_failed_total {sub_stats.get('commands_failed', 0)}",
            ]

        # Trailing empty element so the joined text ends with a newline.
        out.append("")

        return "\n".join(out)
229
+
230
+
231
class HealthRequestHandler(BaseHTTPRequestHandler):
    """Serves the /health and /metrics HTTP endpoints.

    Reads the HealthStatus object that HealthMonitor attaches to the
    server instance (`server.health_status`); any other path receives a
    JSON 404 listing the available endpoints.
    """

    def log_message(self, format, *args):
        """Silence BaseHTTPRequestHandler's default stderr access log."""
        pass

    def do_GET(self):
        """Dispatch GET requests to the matching endpoint handler.

        Routes:
            /health - JSON health status
            /metrics - Prometheus metrics
        """
        routes = {
            "/health": self._handle_health,
            "/metrics": self._handle_metrics,
        }
        try:
            routes.get(self.path, self._handle_not_found)()
        except Exception as e:
            # Never let an endpoint failure take down the server thread.
            if hasattr(self.server, 'logger'):
                self.server.logger.error(f"Health endpoint error: {e}", exc_info=True)
            self.send_error(500, "Internal Server Error")

    def _handle_health(self):
        """Serve /health: 200 OK when healthy, 503 Service Unavailable otherwise."""
        status: HealthStatus = self.server.health_status
        payload = status.get_status_dict()

        code = 200 if status.is_healthy() else 503

        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(json.dumps(payload, indent=2).encode("utf-8"))

    def _handle_metrics(self):
        """Serve /metrics: Prometheus-compatible text-format metrics."""
        status: HealthStatus = self.server.health_status
        body = status.get_prometheus_metrics()

        self.send_response(200)
        self.send_header("Content-Type", "text/plain; version=0.0.4")
        self.end_headers()
        self.wfile.write(body.encode("utf-8"))

    def _handle_not_found(self):
        """Serve a JSON 404 for any unknown path."""
        self.send_response(404)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(json.dumps({
            "error": "Not Found",
            "message": "Available endpoints: /health, /metrics"
        }).encode("utf-8"))
307
+
308
+
309
class HealthMonitor:
    """Health monitoring system with HTTP server.

    Runs an HTTP server on a separate daemon thread exposing health status
    and metrics for operational visibility and orchestration integration.
    The thread keeps health checks responsive even when the main event
    loop is busy.

    Args:
        connector: CytubeConnector instance.
        nats_client: NatsClient instance.
        publisher: EventPublisher instance.
        logger: Logger for health monitoring events.
        command_subscriber: CommandSubscriber instance (optional).
        host: HTTP server bind address (default: "0.0.0.0").
        port: HTTP server port (default: 8080).

    Examples:
        >>> monitor = HealthMonitor(connector, nats, publisher, cmd_sub, logger)
        >>> monitor.start()
        >>> # Health check: curl http://localhost:8080/health
        >>> # Metrics: curl http://localhost:8080/metrics
        >>> monitor.stop()
    """

    def __init__(
        self,
        connector: CytubeConnector,
        nats_client: NatsClient,
        publisher: EventPublisher,
        logger: logging.Logger,
        command_subscriber: Optional["CommandSubscriber"] = None,
        host: str = "0.0.0.0",
        port: int = 8080,
    ):
        """Initialize health monitor.

        Args:
            connector: CytubeConnector instance.
            nats_client: NatsClient instance.
            publisher: EventPublisher instance.
            logger: Logger instance.
            command_subscriber: CommandSubscriber instance (optional).
            host: Server bind address.
            port: Server port.
        """
        self.connector = connector
        self.nats_client = nats_client
        self.publisher = publisher
        self.command_subscriber = command_subscriber
        self.logger = logger
        self.host = host
        self.port = port

        self._health_status = HealthStatus(connector, nats_client, publisher, command_subscriber)
        self._server: HTTPServer | None = None
        self._server_thread: Thread | None = None
        self._running = False

    def start(self):
        """Start health monitoring HTTP server.

        Starts server on a separate daemon thread to avoid blocking the
        event loop. Safe to call multiple times (no-op if already running).

        Raises:
            OSError: If the server socket cannot bind (e.g. port in use).

        Examples:
            >>> monitor = HealthMonitor(connector, nats, publisher, logger)
            >>> monitor.start()
            >>> assert monitor.is_running
        """
        if self._running:
            self.logger.debug("Health monitor already running")
            return

        try:
            # Create HTTP server; the handler reads these two attributes
            # off the server instance on every request.
            self._server = HTTPServer((self.host, self.port), HealthRequestHandler)
            self._server.health_status = self._health_status
            self._server.logger = self.logger

            # Daemon thread so a hung health server never blocks shutdown.
            self._server_thread = Thread(
                target=self._run_server,
                daemon=True,
                name="health-monitor"
            )
            self._server_thread.start()

            self._running = True
            self.logger.info(
                "Health monitor started",
                extra={"host": self.host, "port": self.port}
            )

        except Exception as e:
            self.logger.error(f"Failed to start health monitor: {e}", exc_info=True)
            # Bug fix: if startup failed after the server socket was
            # created (e.g. the thread could not start), close it so the
            # bound port is not leaked and a retry can succeed.
            if self._server is not None:
                self._server.server_close()
                self._server = None
            raise

    def _run_server(self):
        """Run HTTP server (called on separate thread).

        Internal method that serves requests until `shutdown()` is called.
        Exceptions are logged rather than propagated so the thread exits
        cleanly.
        """
        try:
            self.logger.debug("Health monitor server thread started")
            self._server.serve_forever()
        except Exception as e:
            self.logger.error(f"Health monitor server error: {e}", exc_info=True)
        finally:
            self.logger.debug("Health monitor server thread exiting")

    def stop(self):
        """Stop health monitoring HTTP server.

        Shuts down the server, closes its socket, and waits (bounded) for
        the thread to exit. Safe to call multiple times (no-op if not
        running).

        Examples:
            >>> monitor.stop()
            >>> assert not monitor.is_running
        """
        if not self._running:
            self.logger.debug("Health monitor not running")
            return

        try:
            self.logger.info("Stopping health monitor")

            # shutdown() unblocks serve_forever(); server_close() releases
            # the listening socket.
            if self._server:
                self._server.shutdown()
                self._server.server_close()

            # Bounded join so a stuck handler cannot hang shutdown forever.
            if self._server_thread and self._server_thread.is_alive():
                self._server_thread.join(timeout=5.0)
                if self._server_thread.is_alive():
                    self.logger.warning("Health monitor thread did not exit cleanly")

            self._running = False
            self.logger.info("Health monitor stopped")

        except Exception as e:
            self.logger.error(f"Error stopping health monitor: {e}", exc_info=True)
            raise

    @property
    def is_running(self) -> bool:
        """Check if health monitor is running.

        Returns:
            True if server is running, False otherwise.

        Examples:
            >>> monitor.start()
            >>> assert monitor.is_running
        """
        return self._running

    def __enter__(self):
        """Enter context manager (start server).

        Returns:
            Self for use in with statement.
        """
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Exit context manager (stop server).

        Returns:
            False to propagate any exception.
        """
        self.stop()
        return False