puda-comms 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,771 @@
+ """
+ Basic default NATS Client for Generic Machines
+ Handles commands, telemetry, and events following the puda.{machine_id}.{category}.{sub_category} pattern
+ Methods specific to a single machine should be implemented in the machine-edge client
+ """
+ import asyncio
+ from contextlib import asynccontextmanager
+ import json
+ import logging
+ from typing import Dict, Any, Optional, Callable, Awaitable
+ from datetime import datetime, timezone
+ import nats
+ from puda_comms.models import (
+     CommandResponseStatus,
+     CommandResponse,
+     CommandResponseCode,
+     NATSMessage,
+     CommandRequest,
+     MessageType,
+     ImmediateCommand,
+ )
+ from nats.js.client import JetStreamContext
+ from nats.js.api import StreamConfig
+ from nats.js.errors import NotFoundError
+ from nats.aio.msg import Msg
+
+ logger = logging.getLogger(__name__)
+
+
+ class MachineClient:
+     """
+     NATS client for machines.
+
+     Subject pattern: puda.{machine_id}.{category}.{sub_category}
+     - Telemetry: core NATS (no JetStream)
+     - Commands: JetStream with exactly-once delivery
+       - Queue commands: COMMAND_QUEUE stream (WorkQueue retention)
+       - Immediate commands: COMMAND_IMMEDIATE stream (WorkQueue retention)
+     - Command responses: JetStream streams (Interest retention)
+       - Queue responses: RESPONSE_QUEUE stream (Interest retention)
+       - Immediate responses: RESPONSE_IMMEDIATE stream (Interest retention)
+     - Events: Core NATS (fire-and-forget, no JetStream)
+     """
+
+     # Constants
+     NAMESPACE = "puda"
+     KEEP_ALIVE_INTERVAL = 25  # seconds
+     STREAM_COMMAND_QUEUE = "COMMAND_QUEUE"
+     STREAM_COMMAND_IMMEDIATE = "COMMAND_IMMEDIATE"
+     STREAM_RESPONSE_QUEUE = "RESPONSE_QUEUE"
+     STREAM_RESPONSE_IMMEDIATE = "RESPONSE_IMMEDIATE"
+
+     def __init__(self, servers: list[str], machine_id: str):
+         """
+         Initialize NATS client for machine.
+
+         Args:
+             servers: List of NATS server URLs (e.g., ["nats://localhost:4222"])
+             machine_id: Machine identifier (e.g., "opentron")
+         """
+         self.servers = servers
+         self.machine_id = machine_id
+         self.nc: Optional[nats.NATS] = None
+         self.js: Optional[JetStreamContext] = None
+         self.kv = None
+
+         # Generate subject and stream names
+         self._init_subjects()
+
+         # Default subscriptions
+         self._cmd_queue_sub = None
+         self._cmd_immediate_sub = None
+
+         # Connection state
+         self._is_connected = False
+         self._reconnect_handlers = []
+
+         # Queue control state
+         self._pause_lock = asyncio.Lock()
+         self._is_paused = False
+         self._cancelled_run_ids = set()
+
+     def _init_subjects(self):
+         """Initialize all subject and stream names."""
+         namespace = self.NAMESPACE
+         machine_id_safe = self.machine_id.replace('.', '-')
+
+         # Telemetry subjects (core NATS, no JetStream)
+         self.tlm_heartbeat = f"{namespace}.{machine_id_safe}.tlm.heartbeat"
+         self.tlm_pos = f"{namespace}.{machine_id_safe}.tlm.pos"
+         self.tlm_health = f"{namespace}.{machine_id_safe}.tlm.health"
+
+         # Command subjects (JetStream, exactly-once)
+         self.cmd_queue = f"{namespace}.{machine_id_safe}.cmd.queue"  # should be pull consumer
+         self.cmd_immediate = f"{namespace}.{machine_id_safe}.cmd.immediate"  # push consumer
+
+         # Response subjects (JetStream streams)
+         self.response_queue = f"{namespace}.{machine_id_safe}.cmd.response.queue"
+         self.response_immediate = f"{namespace}.{machine_id_safe}.cmd.response.immediate"
+
+         # Event subjects (Core NATS, no JetStream)
+         self.evt_log = f"{namespace}.{machine_id_safe}.evt.log"
+         self.evt_alert = f"{namespace}.{machine_id_safe}.evt.alert"
+         self.evt_media = f"{namespace}.{machine_id_safe}.evt.media"
+
+         # KV bucket name for status
+         self.kv_bucket_name = f"MACHINE_STATE_{machine_id_safe}"
+
+     # ==================== HELPER METHODS ====================
+
+     @staticmethod
+     def _format_timestamp() -> str:
+         """Format current timestamp as ISO 8601 UTC string."""
+         return datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
+
+     async def _publish_telemetry(self, subject: str, data: Dict[str, Any]) -> bool:
+         """Publish telemetry message to core NATS."""
+         if not self.nc:
+             logger.warning("NATS not connected, skipping %s", subject)
+             return False
+
+         try:
+             message = {'timestamp': self._format_timestamp(), **data}
+             await self.nc.publish(subject=subject, payload=json.dumps(message).encode())
+             logger.debug("Published to %s", subject)
+             return True
+         except Exception as e:
+             logger.error("Error publishing to %s: %s", subject, e)
+             return False
+
+     async def _publish_event(self, subject: str, data: Dict[str, Any]) -> bool:
+         """Publish event message to Core NATS (fire-and-forget)."""
+         if not self.nc:
+             logger.warning("NATS not connected, skipping %s", subject)
+             return False
+
+         try:
+             message = {'timestamp': self._format_timestamp(), **data}
+             await self.nc.publish(subject=subject, payload=json.dumps(message).encode())
+             logger.debug("Published to %s", subject)
+             return True
+         except Exception as e:
+             logger.error("Error publishing to %s: %s", subject, e)
+             return False
+
+     async def _ensure_stream(self, stream_name: str, subject_pattern: str, retention: str = 'workqueue'):
+         """
+         Ensure a stream exists with the specified retention policy.
+
+         Args:
+             stream_name: Name of the stream (e.g., STREAM_COMMAND_QUEUE)
+             subject_pattern: Subject pattern for the stream (e.g., "puda.*.cmd.queue")
+             retention: Retention policy ('workqueue', 'interest', or 'limits'). Defaults to 'workqueue'
+         """
+         if not self.js:
+             return
+
+         try:
+             # Try to get existing stream
+             stream_info = await self.js.stream_info(stream_name)
+             # Check if it has the correct pattern and retention
+             config = stream_info.config
+             if subject_pattern not in config.subjects or getattr(config, 'retention', None) != retention:
+                 logger.info("Updating %s stream: subject=%s, retention=%s", stream_name, subject_pattern, retention)
+                 updated_config = StreamConfig(
+                     name=stream_name,
+                     subjects=[subject_pattern],
+                     retention=retention
+                 )
+                 await self.js.update_stream(config=updated_config)
+                 logger.info("Successfully updated %s stream", stream_name)
+         except NotFoundError:
+             # Stream doesn't exist, create it
+             logger.info("Creating %s stream: subject=%s, retention=%s", stream_name, subject_pattern, retention)
+             await self.js.add_stream(
+                 StreamConfig(
+                     name=stream_name,
+                     subjects=[subject_pattern],
+                     retention=retention
+                 )
+             )
+             logger.info("Successfully created %s stream", stream_name)
+         except Exception as e:
+             logger.error("Error ensuring %s stream: %s", stream_name, e, exc_info=True)
+             raise
+
+     async def _ensure_command_queue_stream(self):
+         """Ensure COMMAND_QUEUE stream exists with WorkQueue retention policy."""
+         await self._ensure_stream(
+             self.STREAM_COMMAND_QUEUE,
+             f"{self.NAMESPACE}.*.cmd.queue"
+         )
+
+     async def _ensure_command_immediate_stream(self):
+         """Ensure COMMAND_IMMEDIATE stream exists with WorkQueue retention policy."""
+         await self._ensure_stream(
+             self.STREAM_COMMAND_IMMEDIATE,
+             f"{self.NAMESPACE}.*.cmd.immediate"
+         )
+
+     async def _ensure_response_queue_stream(self):
+         """Ensure RESPONSE_QUEUE stream exists with Interest retention policy."""
+         await self._ensure_stream(
+             self.STREAM_RESPONSE_QUEUE,
+             f"{self.NAMESPACE}.*.cmd.response.queue",
+             retention='interest'
+         )
+
+     async def _ensure_response_immediate_stream(self):
+         """Ensure RESPONSE_IMMEDIATE stream exists with Interest retention policy."""
+         await self._ensure_stream(
+             self.STREAM_RESPONSE_IMMEDIATE,
+             f"{self.NAMESPACE}.*.cmd.response.immediate",
+             retention='interest'
+         )
+
+     async def _get_or_create_kv_bucket(self):
+         """Get or create KV bucket, handling errors gracefully."""
+         if not self.js:
+             return None
+
+         try:
+             return await self.js.create_key_value(bucket=self.kv_bucket_name)
+         except Exception:
+             try:
+                 return await self.js.key_value(self.kv_bucket_name)
+             except Exception as e:
+                 logger.warning("Could not create or access KV bucket: %s", e)
+                 return None
+
+     async def _cleanup_subscriptions(self):
+         """Unsubscribe from all subscriptions."""
+         # Clean up subscriptions
+         if self._cmd_queue_sub:
+             try:
+                 await self._cmd_queue_sub.unsubscribe()
+             except Exception:
+                 pass
+             self._cmd_queue_sub = None
+
+         if self._cmd_immediate_sub:
+             try:
+                 await self._cmd_immediate_sub.unsubscribe()
+             except Exception:
+                 pass
+             self._cmd_immediate_sub = None
+
+     def _reset_connection_state(self):
+         """Reset connection-related state."""
+         self._is_connected = False
+         self.js = None
+         self.kv = None
+         # Subscriptions will be recreated on reconnection
+         self._cmd_queue_sub = None
+         self._cmd_immediate_sub = None
+
+     # ==================== CONNECTION MANAGEMENT ====================
+
+     async def connect(self) -> bool:
+         """Connect to NATS server and initialize JetStream with auto-reconnection."""
+         try:
+             self.nc = await nats.connect(
+                 servers=self.servers,
+                 reconnect_time_wait=2,
+                 max_reconnect_attempts=-1,
+                 error_cb=self._error_callback,
+                 disconnected_cb=self._disconnected_callback,
+                 reconnected_cb=self._reconnected_callback,
+                 closed_cb=self._closed_callback
+             )
+             self.js = self.nc.jetstream()
+             await self._ensure_command_queue_stream()
+             await self._ensure_command_immediate_stream()
+             await self._ensure_response_queue_stream()
+             await self._ensure_response_immediate_stream()
+             self.kv = await self._get_or_create_kv_bucket()
+             self._is_connected = True
+             logger.info("Connected to NATS servers: %s", self.servers)
+             return True
+         except Exception as e:
+             logger.error("Failed to connect to NATS: %s", e)
+             self._reset_connection_state()
+             return False
+
+     async def _error_callback(self, error: Exception):
+         """Callback for NATS errors."""
+         logger.error("NATS error: %s", error)
+
+     async def _disconnected_callback(self):
+         """Callback when disconnected from NATS."""
+         logger.warning("Disconnected from NATS servers")
+         self._reset_connection_state()
+
+     async def _reconnected_callback(self):
+         """Callback when reconnected to NATS."""
+         logger.info("Reconnected to NATS servers")
+         self._is_connected = True
+
+         if self.nc:
+             self.js = self.nc.jetstream()
+             await self._ensure_command_queue_stream()
+             await self._ensure_command_immediate_stream()
+             await self._ensure_response_queue_stream()
+             await self._ensure_response_immediate_stream()
+             self.kv = await self._get_or_create_kv_bucket()
+             await self._resubscribe_handlers()
+
+     async def _resubscribe_handlers(self):
+         """Re-subscribe to all handlers after reconnection."""
+         subscribe_methods = {
+             'queue': self.subscribe_queue,
+             'immediate': self.subscribe_immediate,
+         }
+
+         for handler_info in self._reconnect_handlers:
+             try:
+                 handler_type = handler_info['type']
+                 handler = handler_info['handler']
+                 subscribe_method = subscribe_methods.get(handler_type)
+
+                 if subscribe_method:
+                     await subscribe_method(handler)
+                 else:
+                     logger.warning("Unknown handler type: %s", handler_type)
+             except Exception as e:
+                 logger.error("Failed to re-subscribe %s: %s", handler_type, e)
+
+     async def _closed_callback(self):
+         """Callback when connection is closed."""
+         logger.info("NATS connection closed")
+         self._reset_connection_state()
+
+     async def disconnect(self):
+         """Disconnect from NATS server."""
+         await self._cleanup_subscriptions()
+         if self.nc:
+             await self.nc.close()
+         self._reset_connection_state()
+         logger.info("Disconnected from NATS")
+
+     # ==================== TELEMETRY (Core NATS, no JetStream) ====================
+
+     async def publish_heartbeat(self):
+         """Publish heartbeat telemetry (timestamp only)."""
+         await self._publish_telemetry(self.tlm_heartbeat, {})
+
+     async def publish_position(self, coords: Dict[str, float]):
+         """Publish real-time position coordinates."""
+         await self._publish_telemetry(self.tlm_pos, coords)
+
+     async def publish_health(self, vitals: Dict[str, Any]):
+         """Publish system health vitals (CPU, memory, temperature, etc.)."""
+         await self._publish_telemetry(self.tlm_health, vitals)
+
+     async def publish_state(self, data: Dict[str, Any]):
+         """
+         Overwrite the machine state in the NATS KV store.
+
+         Args:
+             data: Dictionary with state data
+         """
+         if not self.kv:
+             logger.warning("KV store not available, skipping state update")
+             return
+
+         try:
+             message = {'timestamp': self._format_timestamp(), **data}
+             await self.kv.put(self.machine_id, json.dumps(message).encode())
+             logger.info("Updated state in KV store: %s", message)
+         except Exception as e:
+             logger.error("Error updating state in KV store: %s", e)
+
+     # ==================== COMMANDS (JetStream, exactly-once with run_id) ====================
+
+
+     @asynccontextmanager
+     async def _keep_message_alive(self, msg: Msg, interval: int = KEEP_ALIVE_INTERVAL):
+         """
+         Context manager that maintains a background task to reset the
+         redelivery timer (in_progress) while the block/machine is executing.
+         """
+         async def _heartbeat():
+             while True:
+                 await asyncio.sleep(interval)
+                 try:
+                     await msg.in_progress()
+                     logger.debug("Reset redelivery timer via keep-alive")
+                 except Exception:
+                     break
+
+         task = asyncio.create_task(_heartbeat())
+         try:
+             yield
+         finally:
+             task.cancel()
+             try:
+                 await task
+             except asyncio.CancelledError:
+                 pass
+
+     async def _publish_command_response(
+         self,
+         msg: Msg,
+         response: CommandResponse,
+         subject: str
+     ):
+         """
+         Publish command response message to JetStream response stream.
+
+         Args:
+             msg: NATS message
+             response: CommandResponse object containing status, code, message, and completed_at
+             subject: NATS subject to publish the response to
+         """
+         if not self.js:
+             return
+
+         try:
+             original_message = NATSMessage.model_validate_json(msg.data)
+
+             # Create response message with RESPONSE type
+             response_header = original_message.header.model_copy(
+                 update={
+                     'message_type': MessageType.RESPONSE,
+                     'timestamp': self._format_timestamp()
+                 }
+             )
+             response_message = original_message.model_copy(
+                 update={'header': response_header, 'response': response}
+             )
+
+             # Publish to JetStream response stream
+             await self.js.publish(subject=subject, payload=response_message.model_dump_json().encode())
+             logger.info("Published command response to JetStream: %s", response_message.model_dump_json())
+         except Exception as e:
+             logger.error("Error publishing command response: %s", e)
+
+     async def process_queue_cmd(
+         self,
+         msg: Msg,
+         handler: Callable[[CommandRequest], Awaitable[CommandResponse]]
+     ) -> None:
+         """
+         Handle the lifecycle of a single message: Parse -> Handle -> Ack/Nak/Term.
+
+         Args:
+             msg: NATS message
+             handler: Handler function that processes the message and returns CommandResponse
+         """
+         try:
+             # Parse message
+             message = NATSMessage.model_validate_json(msg.data)
+             run_id = message.header.run_id
+             step_number = message.command.step_number
+             command = message.command.name
+
+             # Check if cancelled
+             if run_id and run_id in self._cancelled_run_ids:
+                 logger.info("Skipping cancelled command: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
+                 await msg.ack()
+                 await self._publish_command_response(
+                     msg=msg,
+                     response=CommandResponse(
+                         status=CommandResponseStatus.ERROR,
+                         code=CommandResponseCode.COMMAND_CANCELLED,
+                         message='Command cancelled'
+                     ),
+                     subject=self.response_queue
+                 )
+                 # Note: Final state update should be published by the handler with machine-specific data
+                 return
+
+             # Check if paused (for queue messages)
+             async with self._pause_lock:
+                 if self._is_paused:
+                     await self._publish_command_response(
+                         msg,
+                         CommandResponse(
+                             status=CommandResponseStatus.ERROR,
+                             code=CommandResponseCode.MACHINE_PAUSED,
+                             message='Machine paused'
+                         ),
+                         subject=self.response_queue
+                     )
+                     return
+             while self._is_paused:
+                 await msg.in_progress()
+                 await asyncio.sleep(1)
+             # Re-check cancelled state in case it was cancelled while paused
+             if run_id and run_id in self._cancelled_run_ids:
+                 logger.info("Command cancelled while paused: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
+                 await msg.ack()
+                 await self._publish_command_response(
+                     msg=msg,
+                     response=CommandResponse(
+                         status=CommandResponseStatus.ERROR,
+                         code=CommandResponseCode.COMMAND_CANCELLED,
+                         message='Command cancelled'
+                     ),
+                     subject=self.response_queue
+                 )
+                 # Note: Final state update should be published by the handler with machine-specific data
+                 return
+
+             # Execute handler with auto-heartbeat (task might take a while for machine to complete)
+             # The handler should be defined in the machine-specific edge module.
+             async with self._keep_message_alive(msg):
+                 response: CommandResponse = await handler(message)
+
+             # Finalize message state based on response
+             if response.status == CommandResponseStatus.SUCCESS:
+                 await msg.ack()
+             else:
+                 await msg.term()
+
+             await self._publish_command_response(
+                 msg=msg,
+                 response=response,
+                 subject=self.response_queue
+             )
+             # Note: Final state update should be published by the handler with machine-specific data
+
+         except asyncio.CancelledError:
+             # Handler was cancelled (e.g., via task cancellation)
+             logger.info("Handler execution cancelled: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
+             await msg.ack()
+             await self._publish_command_response(
+                 msg=msg,
+                 response=CommandResponse(
+                     status=CommandResponseStatus.ERROR,
+                     code=CommandResponseCode.COMMAND_CANCELLED,
+                     message='Command cancelled'
+                 ),
+                 subject=self.response_queue
+             )
+             # Note: Final state update should be published by the handler with machine-specific data
+
+         except json.JSONDecodeError as e:
+             logger.error("JSON Decode Error. Terminating message.")
+             await msg.term()
+             await self._publish_command_response(
+                 msg=msg,
+                 response=CommandResponse(
+                     status=CommandResponseStatus.ERROR,
+                     code=CommandResponseCode.JSON_DECODE_ERROR,
+                     message=f'JSON decode error: {e}'
+                 ),
+                 subject=self.response_queue
+             )
+             # Note: Final state update should be published by the handler with machine-specific data
+             # For JSON decode errors, handler wasn't called, so we can't rely on it
+             # This is a rare case - consider if handler should be called with None payload
+
+         except Exception as e:
+             # Check if cancelled before sending error response
+             if run_id and run_id in self._cancelled_run_ids:
+                 logger.info("Command cancelled during execution (exception occurred): run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
+                 await msg.ack()
+                 await self._publish_command_response(
+                     msg=msg,
+                     response=CommandResponse(
+                         status=CommandResponseStatus.ERROR,
+                         code=CommandResponseCode.COMMAND_CANCELLED,
+                         message='Command cancelled'
+                     ),
+                     subject=self.response_queue
+                 )
+                 # Note: Final state update should be published by the handler with machine-specific data
+             else:
+                 # Terminate all errors to prevent infinite redelivery loops
+                 logger.error("Handler failed (terminating message): %s", e)
+                 await msg.term()
+                 await self._publish_command_response(
+                     msg=msg,
+                     response=CommandResponse(
+                         status=CommandResponseStatus.ERROR,
+                         code=CommandResponseCode.EXECUTION_ERROR,
+                         message=str(e)
+                     ),
+                     subject=self.response_queue
+                 )
+                 # Note: Final state update should be published by the handler with machine-specific data
+
+     async def process_immediate_cmd(self, msg: Msg, handler: Callable[[CommandRequest], Awaitable[CommandResponse]]) -> None:
+         """Process immediate commands (pause, cancel, resume, etc.)."""
+         try:
+             message = NATSMessage.model_validate_json(msg.data)
+             # Ack immediately after successful parse
+             await msg.ack()
+
+             # Handle built-in commands
+             if message.command is None:
+                 logger.error("Received message with no command")
+                 return
+
+             command_name = message.command.name.lower()
+
+             match command_name:
+                 case ImmediateCommand.PAUSE:
+                     async with self._pause_lock:
+                         if not self._is_paused:
+                             self._is_paused = True
+                             logger.info("Queue paused")
+                             await self.publish_state({'state': 'paused', 'run_id': message.header.run_id})
+                     # Call handler and use its response
+                     response: CommandResponse = await handler(message)
+
+                 case ImmediateCommand.RESUME:
+                     async with self._pause_lock:
+                         if self._is_paused:
+                             self._is_paused = False
+                             logger.info("Queue resumed")
+                             await self.publish_state({'state': 'idle', 'run_id': None})
+                     # Call handler and use its response
+                     response: CommandResponse = await handler(message)
+
+                 case ImmediateCommand.CANCEL:
+                     if message.header.run_id:
+                         self._cancelled_run_ids.add(message.header.run_id)
+                         logger.info("Cancelling all commands with run_id: %s", message.header.run_id)
+                         await self.publish_state({'state': 'idle', 'run_id': None})
+                     # Call handler and use its response
+                     response: CommandResponse = await handler(message)
+
+                 case _:
+                     # For other immediate commands, call the user-provided handler
+                     response: CommandResponse = await handler(message)
+
+             await self._publish_command_response(
+                 msg=msg,
+                 response=response,
+                 subject=self.response_immediate
+             )
+
+         except json.JSONDecodeError as e:
+             logger.error("JSON Decode Error in immediate command: %s", e)
+             # msg.ack() was already called, so we just need to publish error response
+             await self._publish_command_response(
+                 msg=msg,
+                 response=CommandResponse(
+                     status=CommandResponseStatus.ERROR,
+                     code=CommandResponseCode.JSON_DECODE_ERROR,
+                     message=f'JSON decode error: {e}'
+                 ),
+                 subject=self.response_immediate
+             )
+             await self.publish_state({'state': 'error', 'run_id': None})
+
+         except Exception as e:
+             # msg.ack() was already called, so we just publish error response
+             logger.error("Error processing immediate command: %s", e)
+             await self._publish_command_response(
+                 msg=msg,
+                 response=CommandResponse(
+                     status=CommandResponseStatus.ERROR,
+                     code=CommandResponseCode.EXECUTION_ERROR,
+                     message=str(e)
+                 ),
+                 subject=self.response_immediate
+             )
+             await self.publish_state({'state': 'error', 'run_id': None})
+
+     async def subscribe_queue(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
+         """
+         Subscribe to queue commands with default consumer.
+
+         Args:
+             handler: Async function that processes command payloads and returns CommandResponse
+         """
+         if not self.js:
+             logger.error("JetStream not available for queue subscription")
+             return
+
+         # Ensure stream exists before attempting to subscribe
+         await self._ensure_command_queue_stream()
+
+         try:
+             async def message_handler(msg: Msg):
+                 """Wrapper to process queue messages."""
+                 await self.process_queue_cmd(msg, handler)
+
+             self._cmd_queue_sub = await self.js.subscribe(
+                 subject=self.cmd_queue,
+                 stream=self.STREAM_COMMAND_QUEUE,
+                 durable=f"cmd_queue_{self.machine_id}",
+                 cb=message_handler
+             )
+         except NotFoundError:
+             # Stream still not found after ensuring it exists - this shouldn't happen
+             # but handle it gracefully with detailed diagnostics
+             logger.error("Stream %s not found when subscribing to %s. This may indicate:",
+                          self.STREAM_COMMAND_QUEUE, self.cmd_queue)
+             logger.error(" 1. Stream creation failed silently")
+             logger.error(" 2. Subject pattern mismatch (stream pattern: %s.*.cmd.queue, subject: %s)",
+                          self.NAMESPACE, self.cmd_queue)
+             logger.error(" 3. NATS cluster propagation delay")
+             # Try to get stream info one more time for diagnostics
+             try:
+                 stream_info = await self.js.stream_info(self.STREAM_COMMAND_QUEUE)
+                 logger.error(" Stream actually exists with subjects: %s", stream_info.config.subjects)
+             except Exception as stream_check_error:
+                 logger.error(" Stream verification failed: %s", stream_check_error)
+             raise
+
+         # Register handler for reconnection
+         if not any(h['type'] == 'queue' for h in self._reconnect_handlers):
+             self._reconnect_handlers.append({'type': 'queue', 'handler': handler})
+         logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s)",
+                     self.cmd_queue, self.machine_id, self.STREAM_COMMAND_QUEUE)
+
+     async def subscribe_immediate(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
+         """
+         Subscribe to immediate commands with default consumer.
+
+         Args:
+             handler: Async function that processes command payloads and returns CommandResponse
+         """
+         if not self.js:
+             logger.error("JetStream not available for immediate subscription")
+             return
+
+         async def message_handler(msg: Msg):
+             """Wrapper to process immediate messages."""
+             await self.process_immediate_cmd(msg, handler)
+
+         # Ensure stream exists before attempting to subscribe
+         await self._ensure_command_immediate_stream()
+
+         try:
+             self._cmd_immediate_sub = await self.js.subscribe(
+                 subject=self.cmd_immediate,
+                 stream=self.STREAM_COMMAND_IMMEDIATE,
+                 durable=f"cmd_immed_{self.machine_id}",
+                 cb=message_handler
+             )
+         except NotFoundError:
+             # Stream still not found after ensuring it exists - this shouldn't happen
+             # but handle it gracefully
+             logger.error("Stream %s not found even after creation attempt. Check NATS server configuration.",
+                          self.STREAM_COMMAND_IMMEDIATE)
+             raise
+
+         # Register handler for reconnection
+         if not any(h['type'] == 'immediate' for h in self._reconnect_handlers):
+             self._reconnect_handlers.append({'type': 'immediate', 'handler': handler})
+         logger.info("Subscribed to immediate commands: %s (durable: cmd_immed_%s, stream: %s)",
+                     self.cmd_immediate, self.machine_id, self.STREAM_COMMAND_IMMEDIATE)
+
+
+     # ==================== EVENTS (Core NATS, no JetStream) ====================
+
+     async def publish_log(self, log_level: str, msg: str, **kwargs):
+         """Publish log event (Core NATS, fire-and-forget)."""
+         await self._publish_event(
+             self.evt_log,
+             {'log_level': log_level, 'msg': msg, **kwargs}
+         )
+
+     async def publish_alert(self, alert_type: str, severity: str, **kwargs):
+         """Publish alert event for critical issues (Core NATS, fire-and-forget)."""
+         await self._publish_event(
+             self.evt_alert,
+             {'type': alert_type, 'severity': severity, **kwargs}
+         )
+
+     async def publish_media(self, media_url: str, media_type: str = "image", **kwargs):
+         """Publish media event after uploading to object storage (Core NATS, fire-and-forget)."""
+         await self._publish_event(
+             self.evt_media,
+             {'media_url': media_url, 'media_type': media_type, **kwargs}
+         )
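
For orientation, here is a minimal, hypothetical machine-edge usage sketch of the client above; it is not part of the package. It assumes the module is importable as puda_comms.machine_client and that CommandResponseCode has a success member, neither of which is shown in this diff, so adjust the import path and the code value to match puda_comms.models.

import asyncio

from puda_comms.machine_client import MachineClient  # import path assumed, not shown in this diff
from puda_comms.models import CommandResponse, CommandResponseStatus, NATSMessage


async def handle_command(message: NATSMessage) -> CommandResponse:
    # Machine-specific execution belongs in the edge client (move axis, run protocol, ...)
    return CommandResponse(
        status=CommandResponseStatus.SUCCESS,
        code=...,  # pick the appropriate CommandResponseCode member; a success code is not shown in this file
        message='Command completed',
    )


async def main():
    client = MachineClient(servers=["nats://localhost:4222"], machine_id="opentron")
    if not await client.connect():
        return

    # Queue commands are long-running steps; immediate commands cover pause/resume/cancel
    await client.subscribe_queue(handle_command)
    await client.subscribe_immediate(handle_command)

    try:
        while True:
            await client.publish_heartbeat()
            await asyncio.sleep(5)
    finally:
        await client.disconnect()


if __name__ == "__main__":
    asyncio.run(main())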