puda-comms 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- puda_comms/__init__.py +5 -0
- puda_comms/command_service.py +635 -0
- puda_comms/execution_state.py +89 -0
- puda_comms/machine_client.py +771 -0
- puda_comms/models.py +88 -0
- puda_comms-0.0.2.dist-info/METADATA +310 -0
- puda_comms-0.0.2.dist-info/RECORD +8 -0
- puda_comms-0.0.2.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,771 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Basic default NATS Client for Generic Machines
|
|
3
|
+
Handles commands, telemetry, and events following the puda.{machine_id}.{category}.{sub_category} pattern
|
|
4
|
+
Specific methods to a single machine should be implemented in the machine-edge client
|
|
5
|
+
"""
|
|
6
|
+
import asyncio
|
|
7
|
+
from contextlib import asynccontextmanager
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Dict, Any, Optional, Callable, Awaitable
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
import nats
|
|
13
|
+
from puda_comms.models import (
|
|
14
|
+
CommandResponseStatus,
|
|
15
|
+
CommandResponse,
|
|
16
|
+
CommandResponseCode,
|
|
17
|
+
NATSMessage,
|
|
18
|
+
CommandRequest,
|
|
19
|
+
MessageType,
|
|
20
|
+
ImmediateCommand,
|
|
21
|
+
)
|
|
22
|
+
from nats.js.client import JetStreamContext
|
|
23
|
+
from nats.js.api import StreamConfig
|
|
24
|
+
from nats.js.errors import NotFoundError
|
|
25
|
+
from nats.aio.msg import Msg
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class MachineClient:
    """
    NATS client for machines.

    Subject pattern: puda.{machine_id}.{category}.{sub_category}
    - Telemetry: core NATS (no JetStream)
    - Commands: JetStream with exactly-once delivery
        - Queue commands: COMMAND_QUEUE stream (WorkQueue retention)
        - Immediate commands: COMMAND_IMMEDIATE stream (WorkQueue retention)
    - Command responses: JetStream streams (Interest retention)
        - Queue responses: RESPONSE_QUEUE stream (Interest retention)
        - Immediate responses: RESPONSE_IMMEDIATE stream (Interest retention)
    - Events: Core NATS (fire-and-forget, no JetStream)
    """

    # Constants
    NAMESPACE = "puda"  # first token of every subject published by this client
    KEEP_ALIVE_INTERVAL = 25  # seconds between msg.in_progress() keep-alive resets
    # JetStream stream names shared across all machines (subjects use a
    # wildcard machine-id segment, see the _ensure_*_stream helpers).
    STREAM_COMMAND_QUEUE = "COMMAND_QUEUE"
    STREAM_COMMAND_IMMEDIATE = "COMMAND_IMMEDIATE"
    STREAM_RESPONSE_QUEUE = "RESPONSE_QUEUE"
    STREAM_RESPONSE_IMMEDIATE = "RESPONSE_IMMEDIATE"
|
|
52
|
+
|
|
53
|
+
def __init__(self, servers: list[str], machine_id: str):
    """
    Initialize NATS client for machine.

    Args:
        servers: List of NATS server URLs (e.g., ["nats://localhost:4222"])
        machine_id: Machine identifier (e.g., "opentron")
    """
    self.servers = servers
    self.machine_id = machine_id
    # Connection and JetStream context; populated by connect().
    self.nc: Optional[nats.NATS] = None
    self.js: Optional[JetStreamContext] = None
    # KV bucket handle for machine state; stays None if the bucket is unavailable.
    self.kv = None

    # Generate subject and stream names
    self._init_subjects()

    # Default subscriptions (created by subscribe_queue / subscribe_immediate)
    self._cmd_queue_sub = None
    self._cmd_immediate_sub = None

    # Connection state
    self._is_connected = False
    # Handlers re-registered after a reconnect; entries are
    # {'type': 'queue'|'immediate', 'handler': callable} dicts.
    self._reconnect_handlers = []

    # Queue control state
    self._pause_lock = asyncio.Lock()
    self._is_paused = False
    # run_ids whose queued commands should be skipped on delivery.
    # NOTE(review): entries are never removed, so this set grows without
    # bound over the process lifetime — confirm the intended lifecycle.
    self._cancelled_run_ids = set()
|
|
82
|
+
|
|
83
|
+
def _init_subjects(self):
|
|
84
|
+
"""Initialize all subject and stream names."""
|
|
85
|
+
namespace = self.NAMESPACE
|
|
86
|
+
machine_id_safe = self.machine_id.replace('.', '-')
|
|
87
|
+
|
|
88
|
+
# Telemetry subjects (core NATS, no JetStream)
|
|
89
|
+
self.tlm_heartbeat = f"{namespace}.{machine_id_safe}.tlm.heartbeat"
|
|
90
|
+
self.tlm_pos = f"{namespace}.{machine_id_safe}.tlm.pos"
|
|
91
|
+
self.tlm_health = f"{namespace}.{machine_id_safe}.tlm.health"
|
|
92
|
+
|
|
93
|
+
# Command subjects (JetStream, exactly-once)
|
|
94
|
+
self.cmd_queue = f"{namespace}.{machine_id_safe}.cmd.queue" # should be pull consumer
|
|
95
|
+
self.cmd_immediate = f"{namespace}.{machine_id_safe}.cmd.immediate" # push consumer
|
|
96
|
+
|
|
97
|
+
# Response subjects (JetStream streams)
|
|
98
|
+
self.response_queue = f"{namespace}.{machine_id_safe}.cmd.response.queue"
|
|
99
|
+
self.response_immediate = f"{namespace}.{machine_id_safe}.cmd.response.immediate"
|
|
100
|
+
|
|
101
|
+
# Event subjects (Core NATS, no JetStream)
|
|
102
|
+
self.evt_log = f"{namespace}.{machine_id_safe}.evt.log"
|
|
103
|
+
self.evt_alert = f"{namespace}.{machine_id_safe}.evt.alert"
|
|
104
|
+
self.evt_media = f"{namespace}.{machine_id_safe}.evt.media"
|
|
105
|
+
|
|
106
|
+
# KV bucket name for status
|
|
107
|
+
self.kv_bucket_name = f"MACHINE_STATE_{machine_id_safe}"
|
|
108
|
+
|
|
109
|
+
# ==================== HELPER METHODS ====================
|
|
110
|
+
|
|
111
|
+
@staticmethod
|
|
112
|
+
def _format_timestamp() -> str:
|
|
113
|
+
"""Format current timestamp as ISO 8601 UTC string."""
|
|
114
|
+
return datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
|
|
115
|
+
|
|
116
|
+
async def _publish_telemetry(self, subject: str, data: Dict[str, Any]) -> bool:
|
|
117
|
+
"""Publish telemetry message to core NATS."""
|
|
118
|
+
if not self.nc:
|
|
119
|
+
logger.warning("NATS not connected, skipping %s", subject)
|
|
120
|
+
return False
|
|
121
|
+
|
|
122
|
+
try:
|
|
123
|
+
message = {'timestamp': self._format_timestamp(), **data}
|
|
124
|
+
await self.nc.publish(subject=subject, payload=json.dumps(message).encode())
|
|
125
|
+
logger.debug("Published to %s", subject)
|
|
126
|
+
return True
|
|
127
|
+
except Exception as e:
|
|
128
|
+
logger.error("Error publishing to %s: %s", subject, e)
|
|
129
|
+
return False
|
|
130
|
+
|
|
131
|
+
async def _publish_event(self, subject: str, data: Dict[str, Any]) -> bool:
|
|
132
|
+
"""Publish event message to Core NATS (fire-and-forget)."""
|
|
133
|
+
if not self.nc:
|
|
134
|
+
logger.warning("NATS not connected, skipping %s", subject)
|
|
135
|
+
return False
|
|
136
|
+
|
|
137
|
+
try:
|
|
138
|
+
message = {'timestamp': self._format_timestamp(), **data}
|
|
139
|
+
await self.nc.publish(subject=subject, payload=json.dumps(message).encode())
|
|
140
|
+
logger.debug("Published to %s", subject)
|
|
141
|
+
return True
|
|
142
|
+
except Exception as e:
|
|
143
|
+
logger.error("Error publishing to %s: %s", subject, e)
|
|
144
|
+
return False
|
|
145
|
+
|
|
146
|
+
async def _ensure_stream(self, stream_name: str, subject_pattern: str, retention: str = 'workqueue'):
    """
    Ensure a stream exists with the specified retention policy.

    Args:
        stream_name: Name of the stream (e.g., STREAM_COMMAND_QUEUE)
        subject_pattern: Subject pattern for the stream (e.g., "puda.*.cmd.queue")
        retention: Retention policy ('workqueue', 'interest', or 'limits'). Defaults to 'workqueue'

    Raises:
        Re-raises any JetStream error other than "stream not found" (which is
        handled by creating the stream).
    """
    # No JetStream context (not connected yet) -> nothing to do.
    if not self.js:
        return

    try:
        # Try to get existing stream
        stream_info = await self.js.stream_info(stream_name)
        # Check if it has the correct pattern and retention.
        # NOTE(review): config.retention comes back as a nats.js.api
        # RetentionPolicy enum; comparing it to the plain string `retention`
        # assumes the enum compares equal to its string value — confirm with
        # the nats-py version in use, otherwise this updates on every call.
        config = stream_info.config
        if subject_pattern not in config.subjects or getattr(config, 'retention', None) != retention:
            logger.info("Updating %s stream: subject=%s, retention=%s", stream_name, subject_pattern, retention)
            updated_config = StreamConfig(
                name=stream_name,
                subjects=[subject_pattern],
                retention=retention
            )
            await self.js.update_stream(config=updated_config)
            logger.info("Successfully updated %s stream", stream_name)
    except NotFoundError:
        # Stream doesn't exist, create it
        logger.info("Creating %s stream: subject=%s, retention=%s", stream_name, subject_pattern, retention)
        await self.js.add_stream(
            StreamConfig(
                name=stream_name,
                subjects=[subject_pattern],
                retention=retention
            )
        )
        logger.info("Successfully created %s stream", stream_name)
    except Exception as e:
        logger.error("Error ensuring %s stream: %s", stream_name, e, exc_info=True)
        raise
|
|
186
|
+
|
|
187
|
+
async def _ensure_command_queue_stream(self):
|
|
188
|
+
"""Ensure COMMAND_QUEUE stream exists with WorkQueue retention policy."""
|
|
189
|
+
await self._ensure_stream(
|
|
190
|
+
self.STREAM_COMMAND_QUEUE,
|
|
191
|
+
f"{self.NAMESPACE}.*.cmd.queue"
|
|
192
|
+
)
|
|
193
|
+
|
|
194
|
+
async def _ensure_command_immediate_stream(self):
|
|
195
|
+
"""Ensure COMMAND_IMMEDIATE stream exists with WorkQueue retention policy."""
|
|
196
|
+
await self._ensure_stream(
|
|
197
|
+
self.STREAM_COMMAND_IMMEDIATE,
|
|
198
|
+
f"{self.NAMESPACE}.*.cmd.immediate"
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
async def _ensure_response_queue_stream(self):
|
|
202
|
+
"""Ensure RESPONSE_QUEUE stream exists with Interest retention policy."""
|
|
203
|
+
await self._ensure_stream(
|
|
204
|
+
self.STREAM_RESPONSE_QUEUE,
|
|
205
|
+
f"{self.NAMESPACE}.*.cmd.response.queue",
|
|
206
|
+
retention='interest'
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
async def _ensure_response_immediate_stream(self):
|
|
210
|
+
"""Ensure RESPONSE_IMMEDIATE stream exists with Interest retention policy."""
|
|
211
|
+
await self._ensure_stream(
|
|
212
|
+
self.STREAM_RESPONSE_IMMEDIATE,
|
|
213
|
+
f"{self.NAMESPACE}.*.cmd.response.immediate",
|
|
214
|
+
retention='interest'
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
async def _get_or_create_kv_bucket(self):
|
|
218
|
+
"""Get or create KV bucket, handling errors gracefully."""
|
|
219
|
+
if not self.js:
|
|
220
|
+
return None
|
|
221
|
+
|
|
222
|
+
try:
|
|
223
|
+
return await self.js.create_key_value(bucket=self.kv_bucket_name)
|
|
224
|
+
except Exception:
|
|
225
|
+
try:
|
|
226
|
+
return await self.js.key_value(self.kv_bucket_name)
|
|
227
|
+
except Exception as e:
|
|
228
|
+
logger.warning("Could not create or access KV bucket: %s", e)
|
|
229
|
+
return None
|
|
230
|
+
|
|
231
|
+
async def _cleanup_subscriptions(self):
|
|
232
|
+
"""Unsubscribe from all subscriptions."""
|
|
233
|
+
# Clean up subscriptions
|
|
234
|
+
if self._cmd_queue_sub:
|
|
235
|
+
try:
|
|
236
|
+
await self._cmd_queue_sub.unsubscribe()
|
|
237
|
+
except Exception:
|
|
238
|
+
pass
|
|
239
|
+
self._cmd_queue_sub = None
|
|
240
|
+
|
|
241
|
+
if self._cmd_immediate_sub:
|
|
242
|
+
try:
|
|
243
|
+
await self._cmd_immediate_sub.unsubscribe()
|
|
244
|
+
except Exception:
|
|
245
|
+
pass
|
|
246
|
+
self._cmd_immediate_sub = None
|
|
247
|
+
|
|
248
|
+
def _reset_connection_state(self):
|
|
249
|
+
"""Reset connection-related state."""
|
|
250
|
+
self._is_connected = False
|
|
251
|
+
self.js = None
|
|
252
|
+
self.kv = None
|
|
253
|
+
# Subscriptions will be recreated on reconnection
|
|
254
|
+
self._cmd_queue_sub = None
|
|
255
|
+
self._cmd_immediate_sub = None
|
|
256
|
+
|
|
257
|
+
# ==================== CONNECTION MANAGEMENT ====================
|
|
258
|
+
|
|
259
|
+
async def connect(self) -> bool:
    """
    Connect to NATS server and initialize JetStream with auto-reconnection.

    Reconnection is delegated to nats-py (unlimited attempts, 2s backoff);
    the *_callback coroutines keep this client's state in sync.

    Returns:
        True on success; False if any step failed (state is reset first).
    """
    try:
        self.nc = await nats.connect(
            servers=self.servers,
            reconnect_time_wait=2,
            max_reconnect_attempts=-1,  # retry forever
            error_cb=self._error_callback,
            disconnected_cb=self._disconnected_callback,
            reconnected_cb=self._reconnected_callback,
            closed_cb=self._closed_callback
        )
        self.js = self.nc.jetstream()
        # Provision all four streams up front so publishes/subscriptions
        # cannot race stream creation.
        await self._ensure_command_queue_stream()
        await self._ensure_command_immediate_stream()
        await self._ensure_response_queue_stream()
        await self._ensure_response_immediate_stream()
        # KV bucket is optional: None here simply disables publish_state().
        self.kv = await self._get_or_create_kv_bucket()
        self._is_connected = True
        logger.info("Connected to NATS servers: %s", self.servers)
        return True
    except Exception as e:
        logger.error("Failed to connect to NATS: %s", e)
        self._reset_connection_state()
        return False
|
|
284
|
+
|
|
285
|
+
async def _error_callback(self, error: Exception):
|
|
286
|
+
"""Callback for NATS errors."""
|
|
287
|
+
logger.error("NATS error: %s", error)
|
|
288
|
+
|
|
289
|
+
async def _disconnected_callback(self):
|
|
290
|
+
"""Callback when disconnected from NATS."""
|
|
291
|
+
logger.warning("Disconnected from NATS servers")
|
|
292
|
+
self._reset_connection_state()
|
|
293
|
+
|
|
294
|
+
async def _reconnected_callback(self):
    """
    nats-py callback after an automatic reconnect.

    Rebuilds the JetStream context, re-provisions streams and the KV bucket,
    and re-subscribes every handler registered in _reconnect_handlers.
    """
    logger.info("Reconnected to NATS servers")
    self._is_connected = True

    if self.nc:
        # The old JetStream context and subscriptions died with the previous
        # connection; rebuild everything from scratch.
        self.js = self.nc.jetstream()
        await self._ensure_command_queue_stream()
        await self._ensure_command_immediate_stream()
        await self._ensure_response_queue_stream()
        await self._ensure_response_immediate_stream()
        self.kv = await self._get_or_create_kv_bucket()
        await self._resubscribe_handlers()
|
|
307
|
+
|
|
308
|
+
async def _resubscribe_handlers(self):
|
|
309
|
+
"""Re-subscribe to all handlers after reconnection."""
|
|
310
|
+
subscribe_methods = {
|
|
311
|
+
'queue': self.subscribe_queue,
|
|
312
|
+
'immediate': self.subscribe_immediate,
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
for handler_info in self._reconnect_handlers:
|
|
316
|
+
try:
|
|
317
|
+
handler_type = handler_info['type']
|
|
318
|
+
handler = handler_info['handler']
|
|
319
|
+
subscribe_method = subscribe_methods.get(handler_type)
|
|
320
|
+
|
|
321
|
+
if subscribe_method:
|
|
322
|
+
await subscribe_method(handler)
|
|
323
|
+
else:
|
|
324
|
+
logger.warning("Unknown handler type: %s", handler_type)
|
|
325
|
+
except Exception as e:
|
|
326
|
+
logger.error("Failed to re-subscribe %s: %s", handler_type, e)
|
|
327
|
+
|
|
328
|
+
async def _closed_callback(self):
|
|
329
|
+
"""Callback when connection is closed."""
|
|
330
|
+
logger.info("NATS connection closed")
|
|
331
|
+
self._reset_connection_state()
|
|
332
|
+
|
|
333
|
+
async def disconnect(self):
    """Close the NATS connection and drop all client-side state."""
    await self._cleanup_subscriptions()
    if self.nc:
        await self.nc.close()
    self._reset_connection_state()
    logger.info("Disconnected from NATS")
|
|
340
|
+
|
|
341
|
+
# ==================== TELEMETRY (Core NATS, no JetStream) ====================
|
|
342
|
+
|
|
343
|
+
async def publish_heartbeat(self):
    """Emit a heartbeat tick; the payload carries only the injected timestamp."""
    subject = self.tlm_heartbeat
    await self._publish_telemetry(subject, {})
|
|
346
|
+
|
|
347
|
+
async def publish_position(self, coords: Dict[str, float]):
    """Stream the machine's current position coordinates over telemetry."""
    subject = self.tlm_pos
    await self._publish_telemetry(subject, coords)
|
|
350
|
+
|
|
351
|
+
async def publish_health(self, vitals: Dict[str, Any]):
    """Stream system health vitals (CPU, memory, temperature, etc.)."""
    subject = self.tlm_health
    await self._publish_telemetry(subject, vitals)
|
|
354
|
+
|
|
355
|
+
async def publish_state(self, data: Dict[str, Any]):
    """
    Overwrite this machine's state entry in the NATS KV store.

    Args:
        data: state fields; a UTC timestamp is injected automatically.
    """
    if not self.kv:
        logger.warning("KV store not available, skipping state update")
        return

    try:
        entry = {'timestamp': self._format_timestamp(), **data}
        await self.kv.put(self.machine_id, json.dumps(entry).encode())
        logger.info("Updated state in KV store: %s", entry)
    except Exception as e:
        logger.error("Error updating status in KV store: %s", e)
|
|
372
|
+
|
|
373
|
+
# ==================== COMMANDS (JetStream, exactly-once with run_id) ====================
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
@asynccontextmanager
async def _keep_message_alive(self, msg: Msg, interval: int = KEEP_ALIVE_INTERVAL):
    """
    Context manager that maintains a background task to reset the
    redelivery timer (in_progress) while the block/machine is executing.

    Spawns a task calling msg.in_progress() every `interval` seconds so a
    long-running handler does not trigger ack-wait redelivery. The task is
    cancelled and awaited when the block exits, even on error.
    """
    async def _heartbeat():
        while True:
            await asyncio.sleep(interval)
            try:
                await msg.in_progress()
                logger.debug("Reset redelivery timer via keep-alive")
            except Exception:
                # Likely the message was already acked/termed or the
                # connection dropped; stop quietly.
                break

    task = asyncio.create_task(_heartbeat())
    try:
        yield
    finally:
        # Always tear down the keep-alive task, swallowing its cancellation.
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
|
|
400
|
+
|
|
401
|
+
async def _publish_command_response(
    self,
    msg: Msg,
    response: CommandResponse,
    subject: str
):
    """
    Publish command response message to JetStream response stream.

    The original command envelope is re-parsed from `msg` so the response
    carries the same header fields (e.g. run_id), with message_type switched
    to RESPONSE and a fresh timestamp.

    Args:
        msg: NATS message
        response: CommandResponse object containing status, code, message, and completed_at
        subject: NATS subject to publish the response to
    """
    # Without a JetStream context there is nowhere to publish; drop silently.
    if not self.js:
        return

    try:
        original_message = NATSMessage.model_validate_json(msg.data)

        # Create response message with RESPONSE type
        response_header = original_message.header.model_copy(
            update={
                'message_type': MessageType.RESPONSE,
                'timestamp': self._format_timestamp()
            }
        )
        response_message = original_message.model_copy(
            update={'header': response_header, 'response': response}
        )

        # Publish to JetStream response stream
        await self.js.publish(subject=subject, payload=response_message.model_dump_json().encode())
        logger.info("Published command response to JetStream: %s", response_message.model_dump_json())
    except Exception as e:
        # Best-effort: a lost response is logged, never raised to the caller.
        logger.error("Error publishing command response: %s", e)
|
|
437
|
+
|
|
438
|
+
async def process_queue_cmd(
    self,
    msg: Msg,
    handler: Callable[[CommandRequest], Awaitable[CommandResponse]]
) -> None:
    """
    Handle the lifecycle of a single message: Parse -> Handle -> Ack/Nak/Term.

    Args:
        msg: NATS message carrying a serialized NATSMessage
        handler: Handler function that processes the message and returns CommandResponse
    """

    async def _respond_cancelled():
        # Shared path for every cancellation exit: ack the message so it is
        # not redelivered, then publish a COMMAND_CANCELLED error response.
        await msg.ack()
        await self._publish_command_response(
            msg=msg,
            response=CommandResponse(
                status=CommandResponseStatus.ERROR,
                code=CommandResponseCode.COMMAND_CANCELLED,
                message='Command cancelled'
            ),
            subject=self.response_queue
        )

    # Pre-bind so the except blocks below can always reference these, even
    # when parsing fails before they are assigned. (Previously the generic
    # exception handler raised UnboundLocalError on `run_id` whenever
    # model_validate_json itself failed.)
    run_id = None
    step_number = None
    command = None

    try:
        # Parse message
        message = NATSMessage.model_validate_json(msg.data)
        run_id = message.header.run_id
        step_number = message.command.step_number
        command = message.command.name

        # Check if cancelled
        if run_id and run_id in self._cancelled_run_ids:
            logger.info("Skipping cancelled command: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
            await _respond_cancelled()
            # Note: Final state update should be published by the handler with machine-specific data
            return

        # Check if paused (for queue messages)
        async with self._pause_lock:
            if self._is_paused:
                await self._publish_command_response(
                    msg,
                    CommandResponse(
                        status=CommandResponseStatus.ERROR,
                        code=CommandResponseCode.MACHINE_PAUSED,
                        message='Machine paused'
                    ),
                    subject=self.response_queue
                )
                # Deliberately not acked/termed: JetStream will redeliver
                # the command after the ack wait elapses.
                return
        # NOTE(review): the paused case above returns, and nothing awaits
        # between the lock release and this check, so on a single event loop
        # this wait loop appears unreachable — confirm intended behavior.
        while self._is_paused:
            await msg.in_progress()
            await asyncio.sleep(1)
            # Re-check cancelled state in case it was cancelled while paused
            if run_id and run_id in self._cancelled_run_ids:
                logger.info("Command cancelled while paused: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
                await _respond_cancelled()
                return

        # Execute handler with auto-heartbeat (task might take a while for machine to complete)
        # The handler should be defined in the machine-specific edge module.
        async with self._keep_message_alive(msg):
            response: CommandResponse = await handler(message)

        # Finalize message state based on response
        if response.status == CommandResponseStatus.SUCCESS:
            await msg.ack()
        else:
            # Terminate so a failed command is not redelivered forever.
            await msg.term()

        await self._publish_command_response(
            msg=msg,
            response=response,
            subject=self.response_queue
        )
        # Note: Final state update should be published by the handler with machine-specific data

    except asyncio.CancelledError:
        # Handler was cancelled (e.g., via task cancellation)
        logger.info("Handler execution cancelled: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
        await _respond_cancelled()
        # Note: Final state update should be published by the handler with machine-specific data

    except json.JSONDecodeError as e:
        # NOTE(review): pydantic v2's model_validate_json raises
        # pydantic.ValidationError, not json.JSONDecodeError, so this branch
        # may be unreachable — confirm against the pydantic version in use.
        logger.error("JSON Decode Error. Terminating message.")
        await msg.term()
        await self._publish_command_response(
            msg=msg,
            response=CommandResponse(
                status=CommandResponseStatus.ERROR,
                code=CommandResponseCode.JSON_DECODE_ERROR,
                message=f'JSON decode error: {e}'
            ),
            subject=self.response_queue
        )

    except Exception as e:
        # Check if cancelled before sending error response
        if run_id and run_id in self._cancelled_run_ids:
            logger.info("Command cancelled during execution (exception occurred): run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
            await _respond_cancelled()
        else:
            # Terminate all errors to prevent infinite redelivery loops
            logger.error("Handler failed (terminating message): %s", e)
            await msg.term()
            await self._publish_command_response(
                msg=msg,
                response=CommandResponse(
                    status=CommandResponseStatus.ERROR,
                    code=CommandResponseCode.EXECUTION_ERROR,
                    message=str(e)
                ),
                subject=self.response_queue
            )
        # Note: Final state update should be published by the handler with machine-specific data
|
|
583
|
+
|
|
584
|
+
async def process_immediate_cmd(self, msg: Msg, handler: Callable[[CommandRequest], Awaitable[CommandResponse]]) -> None:
    """
    Process immediate commands (pause, cancel, resume, etc.).

    PAUSE/RESUME/CANCEL first update this client's queue-control state, then
    delegate to `handler` for the machine-specific part; any other command
    goes straight to `handler`. The handler's CommandResponse is published
    on the immediate response stream.

    Args:
        msg: NATS message carrying a serialized NATSMessage
        handler: machine-specific handler returning a CommandResponse
    """
    try:
        message = NATSMessage.model_validate_json(msg.data)
        # Ack immediately after successful parse: immediate commands should
        # not be redelivered even if handling fails below.
        await msg.ack()

        # Handle built-in commands
        if message.command is None:
            logger.error("Received message with no command")
            return

        command_name = message.command.name.lower()

        # NOTE(review): matching a lowercased *string* against ImmediateCommand
        # members only works if ImmediateCommand is a str-based enum whose
        # values are lowercase — confirm in puda_comms.models.
        match command_name:
            case ImmediateCommand.PAUSE:
                async with self._pause_lock:
                    if not self._is_paused:
                        self._is_paused = True
                        logger.info("Queue paused")
                        await self.publish_state({'state': 'paused', 'run_id': message.header.run_id})
                # Call handler and use its response
                response: CommandResponse = await handler(message)

            case ImmediateCommand.RESUME:
                async with self._pause_lock:
                    if self._is_paused:
                        self._is_paused = False
                        logger.info("Queue resumed")
                        await self.publish_state({'state': 'idle', 'run_id': None})
                # Call handler and use its response
                response: CommandResponse = await handler(message)

            case ImmediateCommand.CANCEL:
                if message.header.run_id:
                    self._cancelled_run_ids.add(message.header.run_id)
                    logger.info("Cancelling all commands with run_id: %s", message.header.run_id)
                    await self.publish_state({'state': 'idle', 'run_id': None})
                # Call handler and use its response
                response: CommandResponse = await handler(message)

            case _:
                # For other immediate commands, call the user-provided handler
                response: CommandResponse = await handler(message)

        await self._publish_command_response(
            msg=msg,
            response=response,
            subject=self.response_immediate
        )

    except json.JSONDecodeError as e:
        # NOTE(review): two issues to confirm here — (1) pydantic v2 raises
        # ValidationError, not json.JSONDecodeError, so this branch may never
        # fire; (2) if parsing raised, msg.ack() was NOT reached, so contrary
        # to the comment below the message will be redelivered.
        logger.error("JSON Decode Error in immediate command: %s", e)
        # msg.ack() was already called, so we just need to publish error response
        await self._publish_command_response(
            msg=msg,
            response=CommandResponse(
                status=CommandResponseStatus.ERROR,
                code=CommandResponseCode.JSON_DECODE_ERROR,
                message=f'JSON decode error: {e}'
            ),
            subject=self.response_immediate
        )
        await self.publish_state({'state': 'error', 'run_id': None})

    except Exception as e:
        # msg.ack() was already called, so we just publish error response
        logger.error("Error processing immediate command: %s", e)
        await self._publish_command_response(
            msg=msg,
            response=CommandResponse(
                status=CommandResponseStatus.ERROR,
                code=CommandResponseCode.EXECUTION_ERROR,
                message=str(e)
            ),
            subject=self.response_immediate
        )
        await self.publish_state({'state': 'error', 'run_id': None})
|
|
662
|
+
|
|
663
|
+
async def subscribe_queue(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
    """
    Subscribe to queue commands with default consumer.

    Creates a durable JetStream subscription on this machine's cmd.queue
    subject and registers `handler` for re-subscription after reconnects.

    Args:
        handler: Async function that processes command payloads and returns CommandResponse
    """
    if not self.js:
        logger.error("JetStream not available for queue subscription")
        return

    # Ensure stream exists before attempting to subscribe
    await self._ensure_command_queue_stream()

    try:
        async def message_handler(msg: Msg):
            """Wrapper to process queue messages."""
            await self.process_queue_cmd(msg, handler)

        # NOTE(review): the durable name uses the raw machine_id while
        # subjects use a dot-sanitized id; NATS consumer durable names may
        # not contain dots, so a machine_id like "a.b" would fail here —
        # confirm, and consider sanitizing the durable name too.
        self._cmd_queue_sub = await self.js.subscribe(
            subject=self.cmd_queue,
            stream=self.STREAM_COMMAND_QUEUE,
            durable=f"cmd_queue_{self.machine_id}",
            cb=message_handler
        )
    except NotFoundError:
        # Stream still not found after ensuring it exists - this shouldn't happen
        # but handle it gracefully with detailed diagnostics
        logger.error("Stream %s not found when subscribing to %s. This may indicate:",
                    self.STREAM_COMMAND_QUEUE, self.cmd_queue)
        logger.error("  1. Stream creation failed silently")
        logger.error("  2. Subject pattern mismatch (stream pattern: %s.*.cmd.queue, subject: %s)",
                    self.NAMESPACE, self.cmd_queue)
        logger.error("  3. NATS cluster propagation delay")
        # Try to get stream info one more time for diagnostics
        try:
            stream_info = await self.js.stream_info(self.STREAM_COMMAND_QUEUE)
            logger.error("  Stream actually exists with subjects: %s", stream_info.config.subjects)
        except Exception as stream_check_error:
            logger.error("  Stream verification failed: %s", stream_check_error)
        raise

    # Register handler for reconnection (only once per type)
    if not any(h['type'] == 'queue' for h in self._reconnect_handlers):
        self._reconnect_handlers.append({'type': 'queue', 'handler': handler})
    logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s)",
               self.cmd_queue, self.machine_id, self.STREAM_COMMAND_QUEUE)
|
|
710
|
+
|
|
711
|
+
async def subscribe_immediate(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
    """
    Subscribe to immediate commands with default consumer.

    Creates a durable JetStream subscription on this machine's cmd.immediate
    subject and registers `handler` for re-subscription after reconnects.

    Args:
        handler: Async function that processes command payloads and returns CommandResponse
    """
    if not self.js:
        logger.error("JetStream not available for immediate subscription")
        return

    async def message_handler(msg: Msg):
        """Wrapper to process immediate messages."""
        await self.process_immediate_cmd(msg, handler)

    # Ensure stream exists before attempting to subscribe
    await self._ensure_command_immediate_stream()

    try:
        # NOTE(review): same caveat as subscribe_queue — the durable name
        # uses the raw machine_id, which would contain dots that NATS
        # consumer names disallow; confirm and consider sanitizing.
        self._cmd_immediate_sub = await self.js.subscribe(
            subject=self.cmd_immediate,
            stream=self.STREAM_COMMAND_IMMEDIATE,
            durable=f"cmd_immed_{self.machine_id}",
            cb=message_handler
        )
    except NotFoundError:
        # Stream still not found after ensuring it exists - this shouldn't happen
        # but handle it gracefully
        logger.error("Stream %s not found even after creation attempt. Check NATS server configuration.",
                    self.STREAM_COMMAND_IMMEDIATE)
        raise

    # Register handler for reconnection (only once per type)
    if not any(h['type'] == 'immediate' for h in self._reconnect_handlers):
        self._reconnect_handlers.append({'type': 'immediate', 'handler': handler})
    logger.info("Subscribed to immediate commands: %s (durable: cmd_immed_%s, stream: %s)",
               self.cmd_immediate, self.machine_id, self.STREAM_COMMAND_IMMEDIATE)
|
|
748
|
+
|
|
749
|
+
|
|
750
|
+
# ==================== EVENTS (Core NATS, no JetStream) ====================
|
|
751
|
+
|
|
752
|
+
async def publish_log(self, log_level: str, msg: str, **kwargs):
    """Fire-and-forget log event over core NATS."""
    payload = {'log_level': log_level, 'msg': msg, **kwargs}
    await self._publish_event(self.evt_log, payload)
|
|
758
|
+
|
|
759
|
+
async def publish_alert(self, alert_type: str, severity: str, **kwargs):
    """Fire-and-forget alert event for critical issues over core NATS."""
    payload = {'type': alert_type, 'severity': severity, **kwargs}
    await self._publish_event(self.evt_alert, payload)
|
|
765
|
+
|
|
766
|
+
async def publish_media(self, media_url: str, media_type: str = "image", **kwargs):
    """Fire-and-forget media event (emitted after an object-storage upload)."""
    payload = {'media_url': media_url, 'media_type': media_type, **kwargs}
    await self._publish_event(self.evt_media, payload)
|