puda-comms 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,14 +10,13 @@ This service handles:
10
10
  import asyncio
11
11
  import json
12
12
  import logging
13
- import os
14
13
  import signal
15
14
  from datetime import datetime, timezone
16
- from typing import Dict, Any, Optional, Tuple
15
+ from typing import Dict, Any, Optional
17
16
  import nats
18
17
  from nats.js.client import JetStreamContext
19
18
  from nats.aio.msg import Msg
20
- from puda_comms.models import CommandRequest, CommandResponse, CommandResponseStatus, NATSMessage, MessageHeader, MessageType
19
+ from puda_comms.models import CommandRequest, CommandResponseStatus, NATSMessage, MessageHeader, MessageType
21
20
 
22
21
  logger = logging.getLogger(__name__)
23
22
 
@@ -38,7 +37,7 @@ class ResponseHandler:
38
37
  def __init__(self, js: JetStreamContext, machine_id: str):
39
38
  self.js = js
40
39
  self.machine_id = machine_id
41
- self._pending_responses: Dict[str, Tuple[asyncio.Event, CommandResponse]] = {}
40
+ self._pending_responses: Dict[str, Dict[str, Any]] = {} # {'event': asyncio.Event, 'response': Optional[NATSMessage]}
42
41
  self._queue_consumer = None
43
42
  self._immediate_consumer = None
44
43
  self._initialized = False
@@ -103,8 +102,8 @@ class ResponseHandler:
103
102
 
104
103
  # Get the pending response
105
104
  pending = self._pending_responses[key]
106
- # Store the full NATSMessage JSON structure
107
- pending['response'] = message.model_dump()
105
+ # Store the NATSMessage directly
106
+ pending['response'] = message
108
107
  # Signal that response was received
109
108
  # Don't delete here - let get_response() delete it after retrieval
110
109
  pending['event'].set()
@@ -153,7 +152,7 @@ class ResponseHandler:
153
152
  }
154
153
  return event
155
154
 
156
- def get_response(self, run_id: str, step_number: int) -> Optional[Dict[str, Any]]:
155
+ def get_response(self, run_id: str, step_number: int) -> Optional[NATSMessage]:
157
156
  """
158
157
  Get the response for a pending command.
159
158
 
@@ -162,7 +161,7 @@ class ResponseHandler:
162
161
  step_number: Step number for the command
163
162
 
164
163
  Returns:
165
- The NATSMessage dict structure if available, None otherwise
164
+ The NATSMessage if available, None otherwise
166
165
  """
167
166
  key = f"{run_id}:{str(step_number)}"
168
167
  if key in self._pending_responses:
@@ -178,8 +177,16 @@ class ResponseHandler:
178
177
  if key in self._pending_responses:
179
178
  del self._pending_responses[key]
180
179
 
180
+ def cancel_all_pending(self):
181
+ """Cancel all pending responses by setting their events. This wakes up any waiting tasks immediately."""
182
+ for pending in self._pending_responses.values():
183
+ pending['event'].set()
184
+
181
185
  async def cleanup(self):
182
186
  """Clean up subscriptions."""
187
+ # Cancel all pending responses first to wake up waiting tasks
188
+ self.cancel_all_pending()
189
+
183
190
  if self._queue_consumer:
184
191
  try:
185
192
  await self._queue_consumer.unsubscribe()
@@ -200,11 +207,6 @@ class CommandService:
200
207
  Handles connection management, command parsing, and response handling.
201
208
  Can send commands to multiple machines.
202
209
 
203
- Supports async context manager usage for automatic cleanup:
204
- async with CommandService() as service:
205
- await service.send_queue_command(...)
206
- # Automatically disconnects on exit
207
-
208
210
  Automatically registers signal handlers (SIGTERM, SIGINT) for graceful shutdown.
209
211
  """
210
212
 
@@ -212,20 +214,19 @@ class CommandService:
212
214
 
213
215
  def __init__(
214
216
  self,
215
- servers: Optional[list[str]] = None
217
+ servers: list[str]
216
218
  ):
217
219
  """
218
220
  Initialize NATS service.
219
221
 
220
222
  Args:
221
- servers: List of NATS server URLs. If None, reads from NATS_SERVERS env var.
223
+ servers: List of NATS server URLs. Must be a non-empty list.
224
+
225
+ Raises:
226
+ ValueError: If servers is None or empty.
222
227
  """
223
- if servers is None:
224
- nats_servers_env = os.getenv(
225
- "NATS_SERVERS",
226
- "nats://192.168.50.201:4222,nats://192.168.50.201:4223,nats://192.168.50.201:4224"
227
- )
228
- servers = [s.strip() for s in nats_servers_env.split(",")]
228
+ if servers is None or len(servers) == 0:
229
+ raise ValueError("Please provide a non-empty list of NATS server URLs")
229
230
 
230
231
  self.servers = servers
231
232
  self.nc: Optional[nats.NATS] = None
@@ -254,24 +255,50 @@ class CommandService:
254
255
  """
255
256
  Connect to NATS servers.
256
257
 
258
+ Limits connection attempts to 3. After 3 failed attempts, gives up and logs error.
259
+
257
260
  Returns:
258
261
  True if connected successfully, False otherwise
259
262
  """
260
263
  if self._connected:
261
264
  return True
262
265
 
263
- try:
264
- self.nc = await nats.connect(servers=self.servers)
265
- self.js = self.nc.jetstream()
266
-
267
- self._connected = True
268
- logger.info("Connected to NATS servers: %s", self.servers)
269
- return True
270
-
271
- except Exception as e:
272
- logger.error("Failed to connect to NATS: %s", e)
273
- self._connected = False
274
- return False
266
+ max_attempts = 3
267
+ connect_timeout = 3 # 3 seconds timeout per connection attempt
268
+
269
+ for attempt in range(1, max_attempts + 1):
270
+ try:
271
+ logger.info("Connection attempt %d/%d to NATS servers: %s", attempt, max_attempts, self.servers)
272
+ self.nc = await asyncio.wait_for(
273
+ nats.connect(
274
+ servers=self.servers,
275
+ connect_timeout=connect_timeout,
276
+ reconnect_time_wait=2,
277
+ max_reconnect_attempts=0 # No reconnection during initial connection
278
+ ),
279
+ timeout=connect_timeout + 1 # Slightly longer timeout for the wait_for
280
+ )
281
+ self.js = self.nc.jetstream()
282
+
283
+ self._connected = True
284
+ logger.info("Connected to NATS servers")
285
+ return True
286
+
287
+ except asyncio.TimeoutError:
288
+ logger.warning("Connection attempt %d/%d timed out after %d seconds", attempt, max_attempts, connect_timeout)
289
+ if attempt < max_attempts:
290
+ logger.info("Retrying connection...")
291
+ else:
292
+ logger.error("Failed to connect after %d attempts. Giving up.", max_attempts)
293
+ except Exception as e:
294
+ logger.warning("Connection attempt %d/%d failed: %s", attempt, max_attempts, e)
295
+ if attempt < max_attempts:
296
+ logger.info("Retrying connection...")
297
+ else:
298
+ logger.error("Failed to connect after %d attempts. Giving up.", max_attempts)
299
+
300
+ self._connected = False
301
+ return False
275
302
 
276
303
  async def _get_response_handler(self, machine_id: str) -> ResponseHandler:
277
304
  """
@@ -316,6 +343,8 @@ class CommandService:
316
343
  request: CommandRequest,
317
344
  machine_id: str,
318
345
  run_id: str,
346
+ user_id: str,
347
+ username: str,
319
348
  timeout: int = 120
320
349
  ) -> Optional[NATSMessage]:
321
350
  """
@@ -325,6 +354,8 @@ class CommandService:
325
354
  request: CommandRequest model containing command details
326
355
  machine_id: Machine ID to send the command to
327
356
  run_id: Run ID for the command
357
+ user_id: User ID who initiated the command
358
+ username: Username who initiated the command
328
359
  timeout: Maximum time to wait for response in seconds
329
360
 
330
361
  Returns:
@@ -337,8 +368,8 @@ class CommandService:
337
368
  subject = f"{NAMESPACE}.{machine_id}.cmd.queue"
338
369
 
339
370
  logger.info(
340
- "Sending queue command: machine_id=%s, command=%s, run_id=%s, step_number=%s",
341
- machine_id, request.name, run_id, request.step_number
371
+ "Sending queue command: subject=%s, command=%s, run_id=%s, step_number=%s",
372
+ subject, request.name, run_id, request.step_number
342
373
  )
343
374
 
344
375
  # Get or create response handler for this machine
@@ -347,7 +378,7 @@ class CommandService:
347
378
  response_event = response_handler.register_pending(run_id, request.step_number)
348
379
 
349
380
  # Build payload
350
- payload = self._build_command_payload(request, machine_id, run_id)
381
+ payload = self._build_command_payload(request, machine_id, run_id, user_id, username)
351
382
 
352
383
  try:
353
384
  # Publish to JetStream
@@ -370,11 +401,7 @@ class CommandService:
370
401
  await asyncio.sleep(0.1)
371
402
 
372
403
  # Get the response
373
- response_data = response_handler.get_response(run_id, request.step_number)
374
- if response_data is None:
375
- return None
376
-
377
- return NATSMessage.model_validate(response_data)
404
+ return response_handler.get_response(run_id, request.step_number)
378
405
 
379
406
  except Exception as e:
380
407
  logger.error("Error sending queue command: %s", e)
@@ -387,6 +414,8 @@ class CommandService:
387
414
  requests: list[CommandRequest],
388
415
  machine_id: str,
389
416
  run_id: str,
417
+ user_id: str,
418
+ username: str,
390
419
  timeout: int = 120
391
420
  ) -> Optional[NATSMessage]:
392
421
  """
@@ -400,6 +429,8 @@ class CommandService:
400
429
  requests: List of CommandRequest models to send sequentially
401
430
  machine_id: Machine ID to send the commands to
402
431
  run_id: Run ID for all commands
432
+ user_id: User ID who initiated the commands
433
+ username: Username who initiated the commands
403
434
  timeout: Maximum time to wait for each response in seconds
404
435
 
405
436
  Returns:
@@ -435,6 +466,8 @@ class CommandService:
435
466
  request=request,
436
467
  machine_id=machine_id,
437
468
  run_id=run_id,
469
+ user_id=user_id,
470
+ username=username,
438
471
  timeout=timeout
439
472
  )
440
473
 
@@ -495,6 +528,8 @@ class CommandService:
495
528
  request: CommandRequest,
496
529
  machine_id: str,
497
530
  run_id: str,
531
+ user_id: str,
532
+ username: str,
498
533
  timeout: int = 120
499
534
  ) -> Optional[NATSMessage]:
500
535
  """
@@ -504,6 +539,8 @@ class CommandService:
504
539
  request: CommandRequest model containing command details
505
540
  machine_id: Machine ID to send the command to
506
541
  run_id: Run ID for the command
542
+ user_id: User ID who initiated the command
543
+ username: Username who initiated the command
507
544
  timeout: Maximum time to wait for response in seconds
508
545
 
509
546
  Returns:
@@ -528,7 +565,7 @@ class CommandService:
528
565
  response_received = response_handler.register_pending(run_id, request.step_number)
529
566
 
530
567
  # Build payload
531
- payload = self._build_command_payload(request, machine_id, run_id)
568
+ payload = self._build_command_payload(request, machine_id, run_id, user_id, username)
532
569
 
533
570
  try:
534
571
  # Publish to JetStream
@@ -551,11 +588,7 @@ class CommandService:
551
588
  await asyncio.sleep(0.1)
552
589
 
553
590
  # Get the response
554
- response_data = response_handler.get_response(run_id, request.step_number)
555
- if response_data is None:
556
- return None
557
-
558
- return NATSMessage.model_validate(response_data)
591
+ return response_handler.get_response(run_id, request.step_number)
559
592
 
560
593
  except Exception as e:
561
594
  logger.error("Error sending immediate command: %s", e)
@@ -608,7 +641,9 @@ class CommandService:
608
641
  self,
609
642
  command_request: CommandRequest,
610
643
  machine_id: str,
611
- run_id: str
644
+ run_id: str,
645
+ user_id: str,
646
+ username: str
612
647
  ) -> NATSMessage:
613
648
  """
614
649
  Build a command payload in the expected format.
@@ -617,6 +652,8 @@ class CommandService:
617
652
  command_request: CommandRequest model containing command details
618
653
  machine_id: Machine ID for the command
619
654
  run_id: Run ID for the command
655
+ user_id: User ID who initiated the command
656
+ username: Username who initiated the command
620
657
 
621
658
  Returns:
622
659
  NATSMessage object ready for NATS transmission
@@ -625,6 +662,8 @@ class CommandService:
625
662
  message_type=MessageType.COMMAND,
626
663
  version="1.0",
627
664
  timestamp=datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
665
+ user_id=user_id,
666
+ username=username,
628
667
  machine_id=machine_id,
629
668
  run_id=run_id
630
669
  )
@@ -20,7 +20,7 @@ from puda_comms.models import (
20
20
  ImmediateCommand,
21
21
  )
22
22
  from nats.js.client import JetStreamContext
23
- from nats.js.api import StreamConfig
23
+ from nats.js.api import StreamConfig, ConsumerConfig
24
24
  from nats.js.errors import NotFoundError
25
25
  from nats.aio.msg import Msg
26
26
 
@@ -69,11 +69,13 @@ class MachineClient:
69
69
 
70
70
  # Default subscriptions
71
71
  self._cmd_queue_sub = None
72
+ self._cmd_queue_task = None # Background task for pull consumer
72
73
  self._cmd_immediate_sub = None
73
74
 
74
75
  # Connection state
75
76
  self._is_connected = False
76
- self._reconnect_handlers = []
77
+ self._queue_handler = None
78
+ self._immediate_handler = None
77
79
 
78
80
  # Queue control state
79
81
  self._pause_lock = asyncio.Lock()
@@ -184,30 +186,22 @@ class MachineClient:
184
186
  logger.error("Error ensuring %s stream: %s", stream_name, e, exc_info=True)
185
187
  raise
186
188
 
187
- async def _ensure_command_queue_stream(self):
188
- """Ensure COMMAND_QUEUE stream exists with WorkQueue retention policy."""
189
+ async def _ensure_all_streams(self):
190
+ """Ensure all required streams exist with correct retention policies."""
189
191
  await self._ensure_stream(
190
192
  self.STREAM_COMMAND_QUEUE,
191
- f"{self.NAMESPACE}.*.cmd.queue"
193
+ f"{self.NAMESPACE}.*.cmd.queue",
194
+ retention='workqueue'
192
195
  )
193
-
194
- async def _ensure_command_immediate_stream(self):
195
- """Ensure COMMAND_IMMEDIATE stream exists with WorkQueue retention policy."""
196
196
  await self._ensure_stream(
197
197
  self.STREAM_COMMAND_IMMEDIATE,
198
198
  f"{self.NAMESPACE}.*.cmd.immediate"
199
199
  )
200
-
201
- async def _ensure_response_queue_stream(self):
202
- """Ensure RESPONSE_QUEUE stream exists with Interest retention policy."""
203
200
  await self._ensure_stream(
204
201
  self.STREAM_RESPONSE_QUEUE,
205
202
  f"{self.NAMESPACE}.*.cmd.response.queue",
206
203
  retention='interest'
207
204
  )
208
-
209
- async def _ensure_response_immediate_stream(self):
210
- """Ensure RESPONSE_IMMEDIATE stream exists with Interest retention policy."""
211
205
  await self._ensure_stream(
212
206
  self.STREAM_RESPONSE_IMMEDIATE,
213
207
  f"{self.NAMESPACE}.*.cmd.response.immediate",
@@ -230,7 +224,17 @@ class MachineClient:
230
224
 
231
225
  async def _cleanup_subscriptions(self):
232
226
  """Unsubscribe from all subscriptions."""
233
- # Clean up subscriptions
227
+ # Clean up queue subscription (pull consumer)
228
+ if self._cmd_queue_task:
229
+ try:
230
+ self._cmd_queue_task.cancel()
231
+ await self._cmd_queue_task
232
+ except asyncio.CancelledError:
233
+ pass
234
+ except Exception:
235
+ pass
236
+ self._cmd_queue_task = None
237
+
234
238
  if self._cmd_queue_sub:
235
239
  try:
236
240
  await self._cmd_queue_sub.unsubscribe()
@@ -252,6 +256,7 @@ class MachineClient:
252
256
  self.kv = None
253
257
  # Subscriptions will be recreated on reconnection
254
258
  self._cmd_queue_sub = None
259
+ self._cmd_queue_task = None
255
260
  self._cmd_immediate_sub = None
256
261
 
257
262
  # ==================== CONNECTION MANAGEMENT ====================
@@ -261,6 +266,7 @@ class MachineClient:
261
266
  try:
262
267
  self.nc = await nats.connect(
263
268
  servers=self.servers,
269
+ connect_timeout=10, # 10 seconds timeout for initial connection
264
270
  reconnect_time_wait=2,
265
271
  max_reconnect_attempts=-1,
266
272
  error_cb=self._error_callback,
@@ -269,10 +275,7 @@ class MachineClient:
269
275
  closed_cb=self._closed_callback
270
276
  )
271
277
  self.js = self.nc.jetstream()
272
- await self._ensure_command_queue_stream()
273
- await self._ensure_command_immediate_stream()
274
- await self._ensure_response_queue_stream()
275
- await self._ensure_response_immediate_stream()
278
+ await self._ensure_all_streams()
276
279
  self.kv = await self._get_or_create_kv_bucket()
277
280
  self._is_connected = True
278
281
  logger.info("Connected to NATS servers: %s", self.servers)
@@ -298,32 +301,16 @@ class MachineClient:
298
301
 
299
302
  if self.nc:
300
303
  self.js = self.nc.jetstream()
301
- await self._ensure_command_queue_stream()
302
- await self._ensure_command_immediate_stream()
303
- await self._ensure_response_queue_stream()
304
- await self._ensure_response_immediate_stream()
304
+ await self._ensure_all_streams()
305
305
  self.kv = await self._get_or_create_kv_bucket()
306
306
  await self._resubscribe_handlers()
307
307
 
308
308
  async def _resubscribe_handlers(self):
309
309
  """Re-subscribe to all handlers after reconnection."""
310
- subscribe_methods = {
311
- 'queue': self.subscribe_queue,
312
- 'immediate': self.subscribe_immediate,
313
- }
314
-
315
- for handler_info in self._reconnect_handlers:
316
- try:
317
- handler_type = handler_info['type']
318
- handler = handler_info['handler']
319
- subscribe_method = subscribe_methods.get(handler_type)
320
-
321
- if subscribe_method:
322
- await subscribe_method(handler)
323
- else:
324
- logger.warning("Unknown handler type: %s", handler_type)
325
- except Exception as e:
326
- logger.error("Failed to re-subscribe %s: %s", handler_type, e)
310
+ if self._queue_handler:
311
+ await self.subscribe_queue(self._queue_handler)
312
+ if self._immediate_handler:
313
+ await self.subscribe_immediate(self._immediate_handler)
327
314
 
328
315
  async def _closed_callback(self):
329
316
  """Callback when connection is closed."""
@@ -438,7 +425,7 @@ class MachineClient:
438
425
  async def process_queue_cmd(
439
426
  self,
440
427
  msg: Msg,
441
- handler: Callable[[CommandRequest], Awaitable[CommandResponse]]
428
+ handler: Callable[[NATSMessage], Awaitable[CommandResponse]]
442
429
  ) -> None:
443
430
  """
444
431
  Handle the lifecycle of a single message: Parse -> Handle -> Ack/Nak/Term.
@@ -660,9 +647,54 @@ class MachineClient:
660
647
  )
661
648
  await self.publish_state({'state': 'error', 'run_id': None})
662
649
 
650
+ async def _verify_or_recreate_consumer(self, durable_name: str):
651
+ """
652
+ Check if consumer exists and verify/update its configuration.
653
+ Deletes and recreates the consumer if configuration doesn't match.
654
+
655
+ Args:
656
+ durable_name: Name of the durable consumer to verify
657
+ """
658
+ # Check if consumer exists and verify/update its configuration
659
+ try:
660
+ consumer_info = await self.js.consumer_info(self.STREAM_COMMAND_QUEUE, durable_name)
661
+ logger.debug("Durable consumer %s already exists", durable_name)
662
+
663
+ # Check if consumer config matches what we need
664
+ config = consumer_info.config
665
+ needs_recreate = False
666
+ if getattr(config, 'filter_subject', None) != self.cmd_queue:
667
+ logger.warning("Consumer filter_subject mismatch: expected %s, got %s",
668
+ self.cmd_queue, getattr(config, 'filter_subject', None))
669
+ needs_recreate = True
670
+ if getattr(config, 'ack_policy', None) != 'explicit':
671
+ logger.warning("Consumer ack_policy mismatch: expected explicit, got %s",
672
+ getattr(config, 'ack_policy', None))
673
+ needs_recreate = True
674
+ if getattr(config, 'deliver_policy', None) != 'all':
675
+ logger.warning("Consumer deliver_policy mismatch: expected all, got %s",
676
+ getattr(config, 'deliver_policy', None))
677
+ needs_recreate = True
678
+
679
+ if needs_recreate:
680
+ # Consumer exists but config doesn't match - delete and recreate
681
+ logger.info("Consumer config mismatch, deleting and recreating: %s", durable_name)
682
+ try:
683
+ await self.js.delete_consumer(self.STREAM_COMMAND_QUEUE, durable_name)
684
+ except Exception as e:
685
+ logger.warning("Error deleting consumer: %s", e)
686
+ else:
687
+ # Log consumer state for diagnostics
688
+ logger.info("Consumer exists with correct config - pending: %d, delivered: %d, ack_pending: %d",
689
+ consumer_info.num_pending, consumer_info.delivered.consumer_seq,
690
+ consumer_info.num_ack_pending)
691
+ except NotFoundError:
692
+ # Consumer doesn't exist, will be created by pull_subscribe
693
+ logger.debug("Durable consumer %s does not exist, will be created", durable_name)
694
+
663
695
  async def subscribe_queue(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
664
696
  """
665
- Subscribe to queue commands with default consumer.
697
+ Subscribe to queue commands with pull consumer.
666
698
 
667
699
  Args:
668
700
  handler: Async function that processes command payloads and returns CommandResponse
@@ -672,19 +704,65 @@ class MachineClient:
672
704
  return
673
705
 
674
706
  # Ensure stream exists before attempting to subscribe
675
- await self._ensure_command_queue_stream()
707
+ await self._ensure_all_streams()
676
708
 
677
709
  try:
678
- async def message_handler(msg: Msg):
679
- """Wrapper to process queue messages."""
680
- await self.process_queue_cmd(msg, handler)
681
-
682
- self._cmd_queue_sub = await self.js.subscribe(
710
+ durable_name = f"cmd_queue_{self.machine_id}"
711
+
712
+ await self._verify_or_recreate_consumer(durable_name)
713
+
714
+ # Create pull subscription - this will create the consumer if it doesn't exist
715
+ # Pass config directly to ensure correct consumer configuration
716
+ consumer_config = ConsumerConfig(
717
+ durable_name=durable_name,
718
+ filter_subject=self.cmd_queue,
719
+ ack_policy="explicit",
720
+ deliver_policy="all", # Required for WorkQueue: deliver all messages from the beginning
721
+ )
722
+
723
+ self._cmd_queue_sub = await self.js.pull_subscribe(
683
724
  subject=self.cmd_queue,
725
+ durable=durable_name,
684
726
  stream=self.STREAM_COMMAND_QUEUE,
685
- durable=f"cmd_queue_{self.machine_id}",
686
- cb=message_handler
727
+ config=consumer_config
687
728
  )
729
+
730
+ # Log final consumer info for diagnostics
731
+ try:
732
+ consumer_info = await self.js.consumer_info(self.STREAM_COMMAND_QUEUE, durable_name)
733
+ logger.info("Pull subscription created - subject: %s, durable: %s, stream: %s, pending: %d, ack_pending: %d",
734
+ self.cmd_queue, durable_name, self.STREAM_COMMAND_QUEUE,
735
+ consumer_info.num_pending, consumer_info.num_ack_pending)
736
+ except Exception as e:
737
+ logger.warning("Could not get consumer info after subscription: %s", e)
738
+ logger.info("Pull subscription created - subject: %s, durable: %s, stream: %s",
739
+ self.cmd_queue, durable_name, self.STREAM_COMMAND_QUEUE)
740
+
741
+ # Start background task to pull and process messages
742
+ async def pull_messages():
743
+ """Continuously pull messages from the queue."""
744
+ try:
745
+ while True:
746
+ try:
747
+ # Fetch messages (batch of 1, timeout 1 second)
748
+ msgs = await self._cmd_queue_sub.fetch(batch=1, timeout=1.0)
749
+ if msgs:
750
+ logger.debug("Pulled %d message(s) from queue", len(msgs))
751
+ for msg in msgs:
752
+ await self.process_queue_cmd(msg, handler)
753
+ except asyncio.TimeoutError:
754
+ # Timeout is expected when no messages are available
755
+ continue
756
+ except Exception as e:
757
+ logger.error("Error pulling queue messages: %s", e, exc_info=True)
758
+ await asyncio.sleep(1) # Wait before retrying
759
+ except asyncio.CancelledError:
760
+ logger.debug("Queue pull task cancelled")
761
+ raise
762
+
763
+ self._cmd_queue_task = asyncio.create_task(pull_messages())
764
+ logger.info("Started background task for pulling queue messages")
765
+
688
766
  except NotFoundError:
689
767
  # Stream still not found after ensuring it exists - this shouldn't happen
690
768
  # but handle it gracefully with detailed diagnostics
@@ -702,10 +780,9 @@ class MachineClient:
702
780
  logger.error(" Stream verification failed: %s", stream_check_error)
703
781
  raise
704
782
 
705
- # Register handler for reconnection
706
- if not any(h['type'] == 'queue' for h in self._reconnect_handlers):
707
- self._reconnect_handlers.append({'type': 'queue', 'handler': handler})
708
- logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s)",
783
+ # Store handler for reconnection
784
+ self._queue_handler = handler
785
+ logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s, pull consumer)",
709
786
  self.cmd_queue, self.machine_id, self.STREAM_COMMAND_QUEUE)
710
787
 
711
788
  async def subscribe_immediate(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
@@ -719,19 +796,26 @@ class MachineClient:
719
796
  logger.error("JetStream not available for immediate subscription")
720
797
  return
721
798
 
799
+ # Store handler for use in callback and reconnection
800
+ self._immediate_handler = handler
801
+
722
802
  async def message_handler(msg: Msg):
723
- """Wrapper to process immediate messages."""
724
- await self.process_immediate_cmd(msg, handler)
803
+ """Process immediate messages using stored handler."""
804
+ await self.process_immediate_cmd(msg, self._immediate_handler)
725
805
 
726
806
  # Ensure stream exists before attempting to subscribe
727
- await self._ensure_command_immediate_stream()
807
+ await self._ensure_stream(
808
+ self.STREAM_COMMAND_IMMEDIATE,
809
+ f"{self.NAMESPACE}.*.cmd.immediate",
810
+ retention='workqueue'
811
+ )
728
812
 
729
813
  try:
730
814
  self._cmd_immediate_sub = await self.js.subscribe(
731
815
  subject=self.cmd_immediate,
732
816
  stream=self.STREAM_COMMAND_IMMEDIATE,
733
817
  durable=f"cmd_immed_{self.machine_id}",
734
- cb=message_handler
818
+ cb=message_handler # required for push consumer to handle messages
735
819
  )
736
820
  except NotFoundError:
737
821
  # Stream still not found after ensuring it exists - this shouldn't happen
@@ -740,9 +824,6 @@ class MachineClient:
740
824
  self.STREAM_COMMAND_IMMEDIATE)
741
825
  raise
742
826
 
743
- # Register handler for reconnection
744
- if not any(h['type'] == 'immediate' for h in self._reconnect_handlers):
745
- self._reconnect_handlers.append({'type': 'immediate', 'handler': handler})
746
827
  logger.info("Subscribed to immediate commands: %s (durable: cmd_immed_%s, stream: %s)",
747
828
  self.cmd_immediate, self.machine_id, self.STREAM_COMMAND_IMMEDIATE)
748
829
 
puda_comms/models.py CHANGED
@@ -68,18 +68,19 @@ class CommandResponse(BaseModel):
68
68
 
69
69
  class MessageHeader(BaseModel):
70
70
  """Header for NATS messages."""
71
- message_type: MessageType = Field(description="Type of message")
72
71
  version: str = Field(default="1.0", description="Message version")
73
- timestamp: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
72
+ message_type: MessageType = Field(description="Type of message")
73
+ user_id: str = Field(description="User ID")
74
+ username: str = Field(description="User name")
74
75
  machine_id: str = Field(description="Machine ID")
75
76
  run_id: Optional[str] = Field(default=None, description="Unique identifier (uuid) for the run/workflow")
76
-
77
+ timestamp: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
77
78
  class NATSMessage(BaseModel):
78
79
  """
79
80
  Complete NATS message structure.
80
81
 
81
82
  Structure:
82
- - header: MessageHeader with message_type, version, timestamp, machine_id, run_id
83
+ - header: MessageHeader with message_type, version, timestamp, user_id, username, machine_id, run_id
83
84
  - command: Optional CommandRequest (for command messages)
84
85
  - response: Optional CommandResponse data (for response messages)
85
86
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: puda-comms
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: Communication library for the PUDA platform.
5
5
  Author: zhao
6
6
  Author-email: zhao <20024592+agentzhao@users.noreply.github.com>
@@ -121,6 +121,8 @@ Header metadata for NATS messages.
121
121
  - `message_type` (MessageType): Type of message (COMMAND, RESPONSE, LOG, etc.)
122
122
  - `version` (str): Message version (default: "1.0")
123
123
  - `timestamp` (str): ISO 8601 UTC timestamp (auto-generated)
124
+ - `user_id` (str): User ID who initiated the command
125
+ - `username` (str): Username who initiated the command
124
126
  - `machine_id` (str): Identifier for the target machine
125
127
  - `run_id` (Optional[str]): Unique identifier (UUID) for the run/workflow
126
128
 
@@ -130,6 +132,8 @@ header = MessageHeader(
130
132
  message_type=MessageType.RESPONSE,
131
133
  version="1.0",
132
134
  timestamp="2026-01-20T02:00:46Z",
135
+ user_id="user123",
136
+ username="John Doe",
133
137
  machine_id="first",
134
138
  run_id="092073e6-13d0-4756-8d99-eff1612a5a72"
135
139
  )
@@ -154,6 +158,8 @@ Complete NATS message structure combining header with optional command or respon
154
158
  "message_type": "response",
155
159
  "version": "1.0",
156
160
  "timestamp": "2026-01-20T02:00:46Z",
161
+ "user_id": "user123",
162
+ "username": "John Doe",
157
163
  "machine_id": "first",
158
164
  "run_id": "092073e6-13d0-4756-8d99-eff1612a5a72"
159
165
  },
@@ -229,6 +235,8 @@ reply = await service.send_queue_command(
229
235
  request=request,
230
236
  machine_id="first",
231
237
  run_id=run_id,
238
+ user_id="user123",
239
+ username="John Doe",
232
240
  timeout=60 # Wait up to 60 seconds
233
241
  )
234
242
 
@@ -237,6 +245,8 @@ reply = await service.send_queue_commands(
237
245
  requests=commands,
238
246
  machine_id="first",
239
247
  run_id=run_id,
248
+ user_id="user123",
249
+ username="John Doe",
240
250
  timeout=60 # Wait up to 60 seconds per command
241
251
  )
242
252
  ```
@@ -274,7 +284,9 @@ Always check the response status and handle errors appropriately:
274
284
  reply: NATSMessage = await service.send_queue_command(
275
285
  request=request,
276
286
  machine_id="first",
277
- run_id=run_id
287
+ run_id=run_id,
288
+ user_id="user123",
289
+ username="John Doe"
278
290
  )
279
291
 
280
292
  if reply is None:
@@ -292,14 +304,30 @@ else:
292
304
 
293
305
  ### Configuration
294
306
 
295
- The `CommandService` reads NATS server URLs from the `NATS_SERVERS` environment variable, or defaults to:
307
+ #### NATS Server Configuration
308
+
309
+ The `CommandService` requires NATS server URLs to be specified explicitly. There are no default values. You must provide servers in one of two ways:
310
+
311
+ **Option 1: Via environment variable (comma-separated string)**
312
+
313
+ Set the `NATS_SERVERS` environment variable with comma-separated server URLs:
314
+
315
+ ```bash
316
+ export NATS_SERVERS="nats://192.168.50.201:4222,nats://192.168.50.201:4223,nats://192.168.50.201:4224"
296
317
  ```
297
- nats://192.168.50.201:4222,nats://192.168.50.201:4223,nats://192.168.50.201:4224
318
+
319
+ Then parse it when creating a `CommandService`:
320
+ ```python
321
+ import os
322
+ nats_servers = [s.strip() for s in os.getenv("NATS_SERVERS", "").split(",") if s.strip()]
323
+ service = CommandService(servers=nats_servers)
298
324
  ```
299
325
 
300
- You can also specify servers explicitly:
326
+ **Option 2: Directly as a list**
327
+
328
+ Specify servers directly when creating a `CommandService`:
301
329
  ```python
302
- service = CommandService(servers=["nats://localhost:4222"])
330
+ service = CommandService(servers=["nats://192.168.50.201:4222", "nats://192.168.50.201:4223", "nats://192.168.50.201:4224"])
303
331
  ```
304
332
  ## Validation
305
333
 
@@ -0,0 +1,8 @@
1
+ puda_comms/__init__.py,sha256=lntvVFJJez_rv5lZy5mYj4_43B9Y3NRNzxWfBuSAQ1M,194
2
+ puda_comms/command_service.py,sha256=KFremcEGfsTeUVQMIhyk1knYmUCvRYQ12vS_jy_14wA,25193
3
+ puda_comms/execution_state.py,sha256=aTaejCnJgg1y_FP-ymIC1GQzqC81FIWo0RZ18XzAQnA,2881
4
+ puda_comms/machine_client.py,sha256=wj6t_QHGs7l1Oc8JQ6hq2hqBd5C14TCPA_dTU9qOLzw,37430
5
+ puda_comms/models.py,sha256=9ZGX0PR7SgMBOL5zVLrPuSUhZqutQU96PubyjyQLhf8,3617
6
+ puda_comms-0.0.4.dist-info/WHEEL,sha256=ZyFSCYkV2BrxH6-HRVRg3R9Fo7MALzer9KiPYqNxSbo,79
7
+ puda_comms-0.0.4.dist-info/METADATA,sha256=0cMHDub_3NZt7Cj5U1jzrQXI8atQqpMM-i3vSMrT5lo,11512
8
+ puda_comms-0.0.4.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- puda_comms/__init__.py,sha256=lntvVFJJez_rv5lZy5mYj4_43B9Y3NRNzxWfBuSAQ1M,194
2
- puda_comms/command_service.py,sha256=B4fKiQNF0slvGS1fXVoh5UZax_-xk4IS-KT96teSRfg,23272
3
- puda_comms/execution_state.py,sha256=aTaejCnJgg1y_FP-ymIC1GQzqC81FIWo0RZ18XzAQnA,2881
4
- puda_comms/machine_client.py,sha256=F2i0BYBuOLjKAnfZAblNrb3Lzs0yhEO1d4XA-k_dkIU,33039
5
- puda_comms/models.py,sha256=cVH5uKzyLmjzPeBcm3RIJMTkoynmxqe_P26GtZwlIN8,3500
6
- puda_comms-0.0.2.dist-info/WHEEL,sha256=ZyFSCYkV2BrxH6-HRVRg3R9Fo7MALzer9KiPYqNxSbo,79
7
- puda_comms-0.0.2.dist-info/METADATA,sha256=jHHcSSmdWOykobTsieX2bqDeRtqSaqdUd-xZeeWxJZ8,10585
8
- puda_comms-0.0.2.dist-info/RECORD,,