kryten-robot 0.6.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,371 @@
1
+ """Service Registry - Track and monitor Kryten microservices.
2
+
3
+ This module provides service discovery and health monitoring for the Kryten
4
+ ecosystem. It subscribes to lifecycle events from all services and maintains
5
+ an inventory of active services with their heartbeat status.
6
+ """
7
+
8
+ import asyncio
9
+ import json
10
+ import logging
11
+ from collections.abc import Callable
12
+ from dataclasses import dataclass, field
13
+ from datetime import UTC, datetime
14
+ from typing import Any
15
+
16
+ from .nats_client import NatsClient
17
+
18
+
19
+ @dataclass
20
+ class ServiceInfo:
21
+ """Information about a registered service.
22
+
23
+ Attributes:
24
+ name: Service name (e.g., "userstats", "moderator")
25
+ version: Service version string
26
+ hostname: Hostname where service is running
27
+ first_seen: Timestamp when service was first discovered
28
+ last_heartbeat: Timestamp of most recent heartbeat
29
+ last_startup: Timestamp of most recent startup event
30
+ heartbeat_count: Total number of heartbeats received
31
+ metadata: Additional service-specific metadata
32
+ """
33
+ name: str
34
+ version: str
35
+ hostname: str
36
+ first_seen: datetime
37
+ last_heartbeat: datetime
38
+ last_startup: datetime
39
+ heartbeat_count: int = 0
40
+ metadata: dict[str, Any] = field(default_factory=dict)
41
+
42
+ @property
43
+ def seconds_since_heartbeat(self) -> float:
44
+ """Calculate seconds since last heartbeat."""
45
+ return (datetime.now(UTC) - self.last_heartbeat).total_seconds()
46
+
47
+ @property
48
+ def is_stale(self) -> bool:
49
+ """Check if service appears offline (no heartbeat in 90 seconds)."""
50
+ return self.seconds_since_heartbeat > 90
51
+
52
+ def to_dict(self) -> dict[str, Any]:
53
+ """Convert to dictionary for serialization."""
54
+ return {
55
+ "name": self.name,
56
+ "version": self.version,
57
+ "hostname": self.hostname,
58
+ "first_seen": self.first_seen.isoformat(),
59
+ "last_heartbeat": self.last_heartbeat.isoformat(),
60
+ "last_startup": self.last_startup.isoformat(),
61
+ "heartbeat_count": self.heartbeat_count,
62
+ "seconds_since_heartbeat": self.seconds_since_heartbeat,
63
+ "is_stale": self.is_stale,
64
+ "metadata": self.metadata,
65
+ }
66
+
67
+
68
class ServiceRegistry:
    """Monitor and track Kryten microservices.

    Subscribes to lifecycle events from all services and maintains a registry
    of active services with their health status.

    Subscriptions:
        - kryten.lifecycle.*.startup - Service startup notifications
        - kryten.lifecycle.*.heartbeat - Service heartbeat events
        - kryten.lifecycle.*.shutdown - Service shutdown notifications

    Callbacks registered through the ``on_service_*`` methods are called
    synchronously from the event handlers. Exceptions they raise are logged
    and swallowed so that a faulty callback does not abort event handling.

    Examples:
        >>> registry = ServiceRegistry(nats_client, logger)
        >>> await registry.start()
        >>> services = registry.get_active_services()
        >>> await registry.stop()
    """

    # One row per lifecycle subscription:
    # (attribute holding the subscription, subject, name of the handler method).
    _SUBSCRIPTIONS = (
        ("_startup_sub", "kryten.lifecycle.*.startup", "_handle_startup"),
        ("_heartbeat_sub", "kryten.lifecycle.*.heartbeat", "_handle_heartbeat"),
        ("_shutdown_sub", "kryten.lifecycle.*.shutdown", "_handle_shutdown"),
    )

    def __init__(
        self,
        nats_client: NatsClient,
        logger: logging.Logger,
    ):
        """Initialize service registry.

        Args:
            nats_client: NATS client for event subscriptions
            logger: Logger for structured output
        """
        self._nats = nats_client
        self._logger = logger
        self._running = False

        # Service tracking
        self._services: dict[str, ServiceInfo] = {}
        self._lock = asyncio.Lock()

        # Subscriptions
        self._startup_sub = None
        self._heartbeat_sub = None
        self._shutdown_sub = None

        # Callbacks for service events
        self._on_service_registered: Callable[[ServiceInfo], None] | None = None
        self._on_service_heartbeat: Callable[[ServiceInfo], None] | None = None
        self._on_service_shutdown: Callable[[str], None] | None = None

    @property
    def is_running(self) -> bool:
        """Check if registry is running."""
        return self._running

    @property
    def service_count(self) -> int:
        """Get count of registered services."""
        return len(self._services)

    def on_service_registered(self, callback: Callable[[ServiceInfo], None]) -> None:
        """Register callback for when new service is discovered.

        Args:
            callback: Function to call with ServiceInfo when service registers
        """
        self._on_service_registered = callback

    def on_service_heartbeat(self, callback: Callable[[ServiceInfo], None]) -> None:
        """Register callback for service heartbeat events.

        Args:
            callback: Function to call with ServiceInfo on each heartbeat
        """
        self._on_service_heartbeat = callback

    def on_service_shutdown(self, callback: Callable[[str], None]) -> None:
        """Register callback for service shutdown events.

        Args:
            callback: Function to call with service name on shutdown
        """
        self._on_service_shutdown = callback

    async def start(self) -> None:
        """Start service registry and subscribe to lifecycle events.

        Does nothing except log a warning when the registry is already
        running.

        Raises:
            Exception: Whatever the NATS client raised while subscribing. In
                that case every subscription created so far is released
                again and the registry is left stopped.
        """
        if self._running:
            self._logger.warning("Service registry already running")
            return

        self._running = True

        try:
            for attr, subject, handler_name in self._SUBSCRIPTIONS:
                subscription = await self._nats.subscribe_request_reply(
                    subject,
                    callback=getattr(self, handler_name),
                )
                setattr(self, attr, subscription)
                self._logger.info(f"Subscribed to {subject}")

            self._logger.info("Service registry started")

        except Exception as e:
            self._logger.error(f"Failed to start service registry: {e}", exc_info=True)
            self._running = False
            # stop() returns early once _running is False, so subscriptions
            # made before the failure must be released here or they would
            # stay active with nobody left to unsubscribe them.
            await self._unsubscribe_all()
            raise

    async def stop(self) -> None:
        """Stop service registry and unsubscribe from events.

        Does nothing when the registry is not running. Errors raised while
        unsubscribing are logged as warnings and do not propagate.
        """
        if not self._running:
            return

        self._running = False
        await self._unsubscribe_all()
        self._logger.info("Service registry stopped")

    async def _unsubscribe_all(self) -> None:
        """Unsubscribe every held subscription and clear its attribute."""
        for attr, _subject, _handler_name in self._SUBSCRIPTIONS:
            sub = getattr(self, attr)
            setattr(self, attr, None)
            if not sub:
                continue
            try:
                await sub.unsubscribe()
            except Exception as e:
                self._logger.warning(f"Error unsubscribing: {e}")

    def _decode_event(self, msg, kind: str) -> dict[str, Any] | None:
        """Decode ``msg.data`` as a UTF-8 JSON object.

        Args:
            msg: Incoming message; only its ``data`` bytes are read.
            kind: Event kind used in log messages ("startup", ...).

        Returns:
            The decoded dict, or None (after logging an error) when the
            payload is not valid JSON or is not a JSON object.
        """
        try:
            data = json.loads(msg.data.decode('utf-8'))
        except json.JSONDecodeError as e:
            self._logger.error(f"Invalid {kind} event JSON: {e}")
            return None

        if not isinstance(data, dict):
            # The handlers rely on dict.get(); reject lists, strings, numbers.
            self._logger.error(
                f"Invalid {kind} event JSON: expected an object, "
                f"got {type(data).__name__}"
            )
            return None
        return data

    def _parse_timestamp(self, raw: Any) -> datetime:
        """Turn an event's ``timestamp`` value into an aware datetime.

        Args:
            raw: Value of the event's "timestamp" key, or None when absent.

        Returns:
            The parsed timestamp. The current UTC time is used when the value
            is missing, null or not parseable by ``datetime.fromisoformat``,
            so a liveness signal is not discarded over a bad timestamp.
        """
        if raw is None:
            return datetime.now(UTC)

        try:
            parsed = datetime.fromisoformat(raw)
        except (TypeError, ValueError):
            self._logger.warning(
                f"Unparseable event timestamp {raw!r}; using receive time"
            )
            return datetime.now(UTC)

        if parsed.utcoffset() is None:
            # Staleness checks subtract from an aware UTC "now"; a naive value
            # would make that subtraction raise TypeError.
            # NOTE(review): naive timestamps are assumed to be UTC - confirm
            # against what the publishing services actually send.
            parsed = parsed.replace(tzinfo=UTC)
        return parsed

    def _invoke_callback(self, callback, argument, description: str) -> None:
        """Call ``callback(argument)`` if one is set, logging any exception."""
        if not callback:
            return
        try:
            callback(argument)
        except Exception as e:
            self._logger.error(f"Error in {description} callback: {e}", exc_info=True)

    async def _handle_startup(self, msg) -> None:
        """Handle service startup event.

        Registers the service when its name is not yet known, otherwise
        refreshes the stored version, hostname, timestamps and metadata.
        Events without a "service" value are ignored. The registered callback
        fires only for newly discovered services.
        """
        try:
            data = self._decode_event(msg, "startup")
            if data is None:
                return

            service_name = data.get("service")
            if not service_name:
                return

            # Extract service information
            version = data.get("version", "unknown")
            hostname = data.get("hostname", "unknown")
            timestamp = self._parse_timestamp(data.get("timestamp"))

            async with self._lock:
                service_info = self._services.get(service_name)

                if service_info is None:
                    # New service discovered
                    service_info = ServiceInfo(
                        name=service_name,
                        version=version,
                        hostname=hostname,
                        first_seen=timestamp,
                        last_heartbeat=timestamp,
                        last_startup=timestamp,
                        metadata=data,
                    )
                    self._services[service_name] = service_info
                    self._logger.info(
                        f"Service registered: {service_name} v{version} on {hostname}"
                    )

                    # Trigger callback
                    self._invoke_callback(
                        self._on_service_registered, service_info, "service registered"
                    )
                else:
                    # Service restarted
                    service_info.version = version
                    service_info.hostname = hostname
                    service_info.last_startup = timestamp
                    service_info.last_heartbeat = timestamp
                    service_info.metadata = data
                    self._logger.info(
                        f"Service restarted: {service_name} v{version} on {hostname}"
                    )

        except Exception as e:
            self._logger.error(f"Error handling startup event: {e}", exc_info=True)

    async def _handle_heartbeat(self, msg) -> None:
        """Handle service heartbeat event.

        Updates ``last_heartbeat`` and increments ``heartbeat_count`` for a
        registered service, then fires the heartbeat callback. A heartbeat
        from a service that is not registered is only logged as a warning.
        """
        try:
            data = self._decode_event(msg, "heartbeat")
            if data is None:
                return

            service_name = data.get("service")
            if not service_name:
                return

            timestamp = self._parse_timestamp(data.get("timestamp"))

            async with self._lock:
                service_info = self._services.get(service_name)

                if service_info is not None:
                    service_info.last_heartbeat = timestamp
                    service_info.heartbeat_count += 1

                    self._logger.debug(
                        f"Heartbeat from {service_name} "
                        f"(count: {service_info.heartbeat_count})"
                    )

                    # Trigger callback
                    self._invoke_callback(
                        self._on_service_heartbeat, service_info, "heartbeat"
                    )
                else:
                    # Heartbeat from unknown service - log warning
                    self._logger.warning(
                        f"Heartbeat from unregistered service: {service_name} "
                        "(may have missed startup event)"
                    )

        except Exception as e:
            self._logger.error(f"Error handling heartbeat event: {e}", exc_info=True)

    async def _handle_shutdown(self, msg) -> None:
        """Handle service shutdown event.

        Removes the named service from the registry and fires the shutdown
        callback with its name. Events for unknown services, or without a
        "service" value, are ignored.
        """
        try:
            data = self._decode_event(msg, "shutdown")
            if data is None:
                return

            service_name = data.get("service")
            reason = data.get("reason", "Unknown")

            if not service_name:
                return

            async with self._lock:
                if service_name in self._services:
                    del self._services[service_name]
                    self._logger.info(f"Service shutdown: {service_name} ({reason})")

                    # Trigger callback
                    self._invoke_callback(
                        self._on_service_shutdown, service_name, "shutdown"
                    )

        except Exception as e:
            self._logger.error(f"Error handling shutdown event: {e}", exc_info=True)

    def get_service(self, name: str) -> ServiceInfo | None:
        """Get information about a specific service.

        Args:
            name: Service name

        Returns:
            ServiceInfo if service is registered, None otherwise
        """
        return self._services.get(name)

    def get_all_services(self) -> list[ServiceInfo]:
        """Get information about all registered services.

        Returns:
            List of ServiceInfo objects for all services
        """
        return list(self._services.values())

    def get_active_services(self) -> list[ServiceInfo]:
        """Get only active services (not stale).

        Returns:
            List of ServiceInfo objects for services with recent heartbeats
        """
        return [s for s in self._services.values() if not s.is_stale]

    def get_stale_services(self) -> list[ServiceInfo]:
        """Get services that appear offline (stale heartbeats).

        Returns:
            List of ServiceInfo objects for services with stale heartbeats
        """
        return [s for s in self._services.values() if s.is_stale]
369
+
370
+
371
# Names exported by `from <module> import *`.
__all__ = [
    "ServiceRegistry",
    "ServiceInfo",
]