puda-comms 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,11 +12,17 @@ import json
12
12
  import logging
13
13
  import signal
14
14
  from datetime import datetime, timezone
15
- from typing import Dict, Any, Optional, Tuple
15
+ from typing import Dict, Any, Optional
16
16
  import nats
17
17
  from nats.js.client import JetStreamContext
18
18
  from nats.aio.msg import Msg
19
- from puda_comms.models import CommandRequest, CommandResponse, CommandResponseStatus, NATSMessage, MessageHeader, MessageType
19
+ from puda_comms.models import (
20
+ CommandRequest,
21
+ CommandResponseStatus,
22
+ NATSMessage,
23
+ MessageHeader,
24
+ MessageType,
25
+ )
20
26
 
21
27
  logger = logging.getLogger(__name__)
22
28
 
@@ -37,7 +43,7 @@ class ResponseHandler:
37
43
  def __init__(self, js: JetStreamContext, machine_id: str):
38
44
  self.js = js
39
45
  self.machine_id = machine_id
40
- self._pending_responses: Dict[str, Tuple[asyncio.Event, CommandResponse]] = {}
46
+ self._pending_responses: Dict[str, Dict[str, Any]] = {} # {'event': asyncio.Event, 'response': Optional[NATSMessage]}
41
47
  self._queue_consumer = None
42
48
  self._immediate_consumer = None
43
49
  self._initialized = False
@@ -102,8 +108,8 @@ class ResponseHandler:
102
108
 
103
109
  # Get the pending response
104
110
  pending = self._pending_responses[key]
105
- # Store the full NATSMessage JSON structure
106
- pending['response'] = message.model_dump()
111
+ # Store the NATSMessage directly
112
+ pending['response'] = message
107
113
  # Signal that response was received
108
114
  # Don't delete here - let get_response() delete it after retrieval
109
115
  pending['event'].set()
@@ -152,7 +158,7 @@ class ResponseHandler:
152
158
  }
153
159
  return event
154
160
 
155
- def get_response(self, run_id: str, step_number: int) -> Optional[Dict[str, Any]]:
161
+ def get_response(self, run_id: str, step_number: int) -> Optional[NATSMessage]:
156
162
  """
157
163
  Get the response for a pending command.
158
164
 
@@ -161,7 +167,7 @@ class ResponseHandler:
161
167
  step_number: Step number for the command
162
168
 
163
169
  Returns:
164
- The NATSMessage dict structure if available, None otherwise
170
+ The NATSMessage if available, None otherwise
165
171
  """
166
172
  key = f"{run_id}:{str(step_number)}"
167
173
  if key in self._pending_responses:
@@ -266,9 +272,9 @@ class CommandService:
266
272
  max_attempts = 3
267
273
  connect_timeout = 3 # 3 seconds timeout per connection attempt
268
274
 
269
- for attempt in range(1, max_attempts + 1):
275
+ for attempt in range(max_attempts):
270
276
  try:
271
- logger.info("Connection attempt %d/%d to NATS servers: %s", attempt, max_attempts, self.servers)
277
+ logger.info("Connection attempt %d/%d to NATS servers: %s", attempt + 1, max_attempts, self.servers)
272
278
  self.nc = await asyncio.wait_for(
273
279
  nats.connect(
274
280
  servers=self.servers,
@@ -285,14 +291,14 @@ class CommandService:
285
291
  return True
286
292
 
287
293
  except asyncio.TimeoutError:
288
- logger.warning("Connection attempt %d/%d timed out after %d seconds", attempt, max_attempts, connect_timeout)
289
- if attempt < max_attempts:
294
+ logger.warning("Connection attempt %d/%d timed out after %d seconds", attempt + 1, max_attempts, connect_timeout)
295
+ if attempt < max_attempts - 1:
290
296
  logger.info("Retrying connection...")
291
297
  else:
292
298
  logger.error("Failed to connect after %d attempts. Giving up.", max_attempts)
293
299
  except Exception as e:
294
- logger.warning("Connection attempt %d/%d failed: %s", attempt, max_attempts, e)
295
- if attempt < max_attempts:
300
+ logger.warning("Connection attempt %d/%d failed: %s", attempt + 1, max_attempts, e)
301
+ if attempt < max_attempts - 1:
296
302
  logger.info("Retrying connection...")
297
303
  else:
298
304
  logger.error("Failed to connect after %d attempts. Giving up.", max_attempts)
@@ -343,6 +349,8 @@ class CommandService:
343
349
  request: CommandRequest,
344
350
  machine_id: str,
345
351
  run_id: str,
352
+ user_id: str,
353
+ username: str,
346
354
  timeout: int = 120
347
355
  ) -> Optional[NATSMessage]:
348
356
  """
@@ -352,6 +360,8 @@ class CommandService:
352
360
  request: CommandRequest model containing command details
353
361
  machine_id: Machine ID to send the command to
354
362
  run_id: Run ID for the command
363
+ user_id: User ID who initiated the command
364
+ username: Username who initiated the command
355
365
  timeout: Maximum time to wait for response in seconds
356
366
 
357
367
  Returns:
@@ -364,8 +374,8 @@ class CommandService:
364
374
  subject = f"{NAMESPACE}.{machine_id}.cmd.queue"
365
375
 
366
376
  logger.info(
367
- "Sending queue command: machine_id=%s, command=%s, run_id=%s, step_number=%s",
368
- machine_id, request.name, run_id, request.step_number
377
+ "Sending queue command: subject=%s, command=%s, run_id=%s, step_number=%s",
378
+ subject, request.name, run_id, request.step_number
369
379
  )
370
380
 
371
381
  # Get or create response handler for this machine
@@ -374,7 +384,7 @@ class CommandService:
374
384
  response_event = response_handler.register_pending(run_id, request.step_number)
375
385
 
376
386
  # Build payload
377
- payload = self._build_command_payload(request, machine_id, run_id)
387
+ payload = self._build_command_payload(request, machine_id, run_id, user_id, username)
378
388
 
379
389
  try:
380
390
  # Publish to JetStream
@@ -397,36 +407,107 @@ class CommandService:
397
407
  await asyncio.sleep(0.1)
398
408
 
399
409
  # Get the response
400
- response_data = response_handler.get_response(run_id, request.step_number)
401
- if response_data is None:
402
- return None
403
-
404
- return NATSMessage.model_validate(response_data)
410
+ return response_handler.get_response(run_id, request.step_number)
405
411
 
406
412
  except Exception as e:
407
413
  logger.error("Error sending queue command: %s", e)
408
414
  response_handler.remove_pending(run_id, request.step_number)
409
415
  return None
410
416
 
417
+ async def start_run(
418
+ self,
419
+ machine_id: str,
420
+ run_id: str,
421
+ user_id: str,
422
+ username: str,
423
+ timeout: int = 120
424
+ ) -> Optional[NATSMessage]:
425
+ """
426
+ Send START immediate command to begin a run.
427
+
428
+ Args:
429
+ machine_id: Machine ID to send the command to
430
+ run_id: Run ID for the command
431
+ user_id: User ID who initiated the command
432
+ username: Username who initiated the command
433
+ timeout: Maximum time to wait for response in seconds
434
+
435
+ Returns:
436
+ NATSMessage if successful, None if failed or timeout
437
+ """
438
+ request = CommandRequest(
439
+ name="start",
440
+ params={},
441
+ step_number=0
442
+ )
443
+ return await self.send_immediate_command(
444
+ request=request,
445
+ machine_id=machine_id,
446
+ run_id=run_id,
447
+ user_id=user_id,
448
+ username=username,
449
+ timeout=timeout
450
+ )
451
+
452
+ async def complete_run(
453
+ self,
454
+ machine_id: str,
455
+ run_id: str,
456
+ user_id: str,
457
+ username: str,
458
+ timeout: int = 120
459
+ ) -> Optional[NATSMessage]:
460
+ """
461
+ Send COMPLETE immediate command to end a run.
462
+
463
+ Args:
464
+ machine_id: Machine ID to send the command to
465
+ run_id: Run ID for the command
466
+ user_id: User ID who initiated the command
467
+ username: Username who initiated the command
468
+ timeout: Maximum time to wait for response in seconds
469
+
470
+ Returns:
471
+ NATSMessage if successful, None if failed or timeout
472
+ """
473
+ request = CommandRequest(
474
+ name="complete",
475
+ params={},
476
+ step_number=0
477
+ )
478
+ return await self.send_immediate_command(
479
+ request=request,
480
+ machine_id=machine_id,
481
+ run_id=run_id,
482
+ user_id=user_id,
483
+ username=username,
484
+ timeout=timeout
485
+ )
486
+
411
487
  async def send_queue_commands(
412
488
  self,
413
489
  *,
414
490
  requests: list[CommandRequest],
415
491
  machine_id: str,
416
492
  run_id: str,
493
+ user_id: str,
494
+ username: str,
417
495
  timeout: int = 120
418
496
  ) -> Optional[NATSMessage]:
419
497
  """
420
498
  Send multiple queue commands sequentially and wait for responses.
421
499
 
422
- Sends commands one by one, waiting for each response before sending the next.
423
- If any command fails or times out, stops immediately and returns the error response.
424
- If all commands succeed, returns the last command's response.
500
+ Automatically sends START command before the sequence and COMPLETE command after
501
+ successful completion. Sends commands one by one, waiting for each response before
502
+ sending the next. If any command fails or times out, stops immediately and returns
503
+ the error response. If all commands succeed, returns the last command's response.
425
504
 
426
505
  Args:
427
506
  requests: List of CommandRequest models to send sequentially
428
507
  machine_id: Machine ID to send the commands to
429
508
  run_id: Run ID for all commands
509
+ user_id: User ID who initiated the commands
510
+ username: Username who initiated the commands
430
511
  timeout: Maximum time to wait for each response in seconds
431
512
 
432
513
  Returns:
@@ -447,74 +528,131 @@ class CommandService:
447
528
  run_id
448
529
  )
449
530
 
531
+ # Always send START command before sequence
532
+ logger.info("Sending START command before sequence")
533
+ start_response = await self.start_run(
534
+ machine_id=machine_id,
535
+ run_id=run_id,
536
+ user_id=user_id,
537
+ username=username,
538
+ timeout=timeout
539
+ )
540
+ if start_response is None:
541
+ logger.error("START command timed out")
542
+ return None
543
+ if start_response.response and start_response.response.status == CommandResponseStatus.ERROR:
544
+ logger.error("START command failed: %s", start_response.response.message)
545
+ return start_response
546
+
450
547
  last_response: Optional[NATSMessage] = None
451
548
 
452
- for idx, request in enumerate(requests, start=1):
453
- logger.info(
454
- "Sending command %d/%d: %s (step %s)",
455
- idx,
456
- len(requests),
457
- request.name,
458
- request.step_number
459
- )
460
-
461
- response = await self.send_queue_command(
462
- request=request,
463
- machine_id=machine_id,
464
- run_id=run_id,
465
- timeout=timeout
466
- )
467
-
468
- # Check if command failed (None means timeout or exception)
469
- if response is None:
470
- logger.error(
471
- "Command %d/%d failed or timed out: %s (step %s)",
549
+ try:
550
+ for idx, request in enumerate(requests, start=1):
551
+ # Validate request - convert dict to CommandRequest if needed
552
+ if isinstance(request, dict):
553
+ request = CommandRequest.model_validate(request)
554
+ elif not isinstance(request, CommandRequest):
555
+ raise ValueError(f"Request {idx} must be a CommandRequest or dict, got {type(request)}")
556
+
557
+ logger.info(
558
+ "Sending command %d/%d: %s (step %s)",
472
559
  idx,
473
560
  len(requests),
474
561
  request.name,
475
562
  request.step_number
476
563
  )
477
- return None
478
564
 
479
- # Check if command returned an error status
480
- if response.response is not None:
481
- if response.response.status == CommandResponseStatus.ERROR:
565
+ response = await self.send_queue_command(
566
+ request=request,
567
+ machine_id=machine_id,
568
+ run_id=run_id,
569
+ user_id=user_id,
570
+ username=username,
571
+ timeout=timeout
572
+ )
573
+
574
+ # Check if command failed (None means timeout or exception)
575
+ if response is None:
482
576
  logger.error(
483
- "Command %d/%d failed with error: %s (step %s) - code: %s, message: %s",
577
+ "Command %d/%d failed or timed out: %s (step %s)",
578
+ idx,
579
+ len(requests),
580
+ request.name,
581
+ request.step_number
582
+ )
583
+ return None
584
+
585
+ # Check if command returned an error status
586
+ if response.response is not None:
587
+ if response.response.status == CommandResponseStatus.ERROR:
588
+ logger.error(
589
+ "Command %d/%d failed with error: %s (step %s) - code: %s, message: %s",
590
+ idx,
591
+ len(requests),
592
+ request.name,
593
+ request.step_number,
594
+ response.response.code,
595
+ response.response.message
596
+ )
597
+ return response
598
+
599
+ # Command succeeded, store as last response
600
+ last_response = response
601
+ logger.info(
602
+ "Command %d/%d succeeded: %s (step %s)",
484
603
  idx,
485
604
  len(requests),
486
605
  request.name,
487
- request.step_number,
488
- response.response.code,
489
- response.response.message
606
+ request.step_number
607
+ )
608
+ else:
609
+ # Response exists but has no response data (shouldn't happen, but handle it)
610
+ logger.warning(
611
+ "Command %d/%d returned response with no response data: %s (step %s)",
612
+ idx,
613
+ len(requests),
614
+ request.name,
615
+ request.step_number
490
616
  )
491
617
  return response
492
-
493
- # Command succeeded, store as last response
494
- last_response = response
495
- logger.info(
496
- "Command %d/%d succeeded: %s (step %s)",
497
- idx,
498
- len(requests),
499
- request.name,
500
- request.step_number
501
- )
502
- else:
503
- # Response exists but has no response data (shouldn't happen, but handle it)
504
- logger.warning(
505
- "Command %d/%d returned response with no response data: %s (step %s)",
506
- idx,
507
- len(requests),
508
- request.name,
509
- request.step_number
618
+
619
+ logger.info(
620
+ "All %d commands completed successfully",
621
+ len(requests)
622
+ )
623
+
624
+ # Always send COMPLETE command after successful sequence
625
+ logger.info("Sending COMPLETE command after successful sequence")
626
+ complete_response = await self.complete_run(
627
+ machine_id=machine_id,
628
+ run_id=run_id,
629
+ user_id=user_id,
630
+ username=username,
631
+ timeout=timeout
632
+ )
633
+ if complete_response is None:
634
+ logger.error("COMPLETE command timed out")
635
+ return None
636
+ if complete_response.response and complete_response.response.status == CommandResponseStatus.ERROR:
637
+ logger.error("COMPLETE command failed: %s", complete_response.response.message)
638
+ return complete_response
639
+ # Return the last command response, not the COMPLETE response
640
+ return last_response
641
+ except Exception as e:
642
+ # If any error occurs during command execution, try to complete the run
643
+ # to clean up state (but don't fail if this also fails)
644
+ logger.warning("Error during command sequence, attempting to complete run: %s", e)
645
+ try:
646
+ await self.complete_run(
647
+ machine_id=machine_id,
648
+ run_id=run_id,
649
+ user_id=user_id,
650
+ username=username,
651
+ timeout=timeout
510
652
  )
511
- return response
512
-
513
- logger.info(
514
- "All %d commands completed successfully",
515
- len(requests)
516
- )
517
- return last_response
653
+ except Exception as cleanup_error:
654
+ logger.error("Failed to complete run during error cleanup: %s", cleanup_error)
655
+ raise
518
656
 
519
657
  async def send_immediate_command(
520
658
  self,
@@ -522,6 +660,8 @@ class CommandService:
522
660
  request: CommandRequest,
523
661
  machine_id: str,
524
662
  run_id: str,
663
+ user_id: str,
664
+ username: str,
525
665
  timeout: int = 120
526
666
  ) -> Optional[NATSMessage]:
527
667
  """
@@ -531,6 +671,8 @@ class CommandService:
531
671
  request: CommandRequest model containing command details
532
672
  machine_id: Machine ID to send the command to
533
673
  run_id: Run ID for the command
674
+ user_id: User ID who initiated the command
675
+ username: Username who initiated the command
534
676
  timeout: Maximum time to wait for response in seconds
535
677
 
536
678
  Returns:
@@ -555,7 +697,7 @@ class CommandService:
555
697
  response_received = response_handler.register_pending(run_id, request.step_number)
556
698
 
557
699
  # Build payload
558
- payload = self._build_command_payload(request, machine_id, run_id)
700
+ payload = self._build_command_payload(request, machine_id, run_id, user_id, username)
559
701
 
560
702
  try:
561
703
  # Publish to JetStream
@@ -578,11 +720,7 @@ class CommandService:
578
720
  await asyncio.sleep(0.1)
579
721
 
580
722
  # Get the response
581
- response_data = response_handler.get_response(run_id, request.step_number)
582
- if response_data is None:
583
- return None
584
-
585
- return NATSMessage.model_validate(response_data)
723
+ return response_handler.get_response(run_id, request.step_number)
586
724
 
587
725
  except Exception as e:
588
726
  logger.error("Error sending immediate command: %s", e)
@@ -635,7 +773,9 @@ class CommandService:
635
773
  self,
636
774
  command_request: CommandRequest,
637
775
  machine_id: str,
638
- run_id: str
776
+ run_id: str,
777
+ user_id: str,
778
+ username: str
639
779
  ) -> NATSMessage:
640
780
  """
641
781
  Build a command payload in the expected format.
@@ -643,17 +783,24 @@ class CommandService:
643
783
  Args:
644
784
  command_request: CommandRequest model containing command details
645
785
  machine_id: Machine ID for the command
646
- run_id: Run ID for the command
786
+ run_id: Run ID for the command (empty string will be converted to None)
787
+ user_id: User ID who initiated the command
788
+ username: Username who initiated the command
647
789
 
648
790
  Returns:
649
791
  NATSMessage object ready for NATS transmission
650
792
  """
793
+ # Convert empty string to None for run_id
794
+ run_id_value = run_id if run_id else None
795
+
651
796
  header = MessageHeader(
652
797
  message_type=MessageType.COMMAND,
653
798
  version="1.0",
654
799
  timestamp=datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
800
+ user_id=user_id,
801
+ username=username,
655
802
  machine_id=machine_id,
656
- run_id=run_id
803
+ run_id=run_id_value
657
804
  )
658
805
 
659
806
  return NATSMessage(
@@ -19,8 +19,9 @@ from puda_comms.models import (
19
19
  MessageType,
20
20
  ImmediateCommand,
21
21
  )
22
+ from puda_comms.run_manager import RunManager
22
23
  from nats.js.client import JetStreamContext
23
- from nats.js.api import StreamConfig
24
+ from nats.js.api import StreamConfig, ConsumerConfig
24
25
  from nats.js.errors import NotFoundError
25
26
  from nats.aio.msg import Msg
26
27
 
@@ -69,16 +70,20 @@ class MachineClient:
69
70
 
70
71
  # Default subscriptions
71
72
  self._cmd_queue_sub = None
73
+ self._cmd_queue_task = None # Background task for pull consumer
72
74
  self._cmd_immediate_sub = None
73
75
 
74
76
  # Connection state
75
77
  self._is_connected = False
76
- self._reconnect_handlers = []
78
+ self._queue_handler = None
79
+ self._immediate_handler = None
77
80
 
78
81
  # Queue control state
79
82
  self._pause_lock = asyncio.Lock()
80
83
  self._is_paused = False
81
- self._cancelled_run_ids = set()
84
+
85
+ # Run state management
86
+ self.run_manager = RunManager(machine_id=machine_id)
82
87
 
83
88
  def _init_subjects(self):
84
89
  """Initialize all subject and stream names."""
@@ -184,30 +189,22 @@ class MachineClient:
184
189
  logger.error("Error ensuring %s stream: %s", stream_name, e, exc_info=True)
185
190
  raise
186
191
 
187
- async def _ensure_command_queue_stream(self):
188
- """Ensure COMMAND_QUEUE stream exists with WorkQueue retention policy."""
192
+ async def _ensure_all_streams(self):
193
+ """Ensure all required streams exist with correct retention policies."""
189
194
  await self._ensure_stream(
190
195
  self.STREAM_COMMAND_QUEUE,
191
- f"{self.NAMESPACE}.*.cmd.queue"
196
+ f"{self.NAMESPACE}.*.cmd.queue",
197
+ retention='workqueue'
192
198
  )
193
-
194
- async def _ensure_command_immediate_stream(self):
195
- """Ensure COMMAND_IMMEDIATE stream exists with WorkQueue retention policy."""
196
199
  await self._ensure_stream(
197
200
  self.STREAM_COMMAND_IMMEDIATE,
198
201
  f"{self.NAMESPACE}.*.cmd.immediate"
199
202
  )
200
-
201
- async def _ensure_response_queue_stream(self):
202
- """Ensure RESPONSE_QUEUE stream exists with Interest retention policy."""
203
203
  await self._ensure_stream(
204
204
  self.STREAM_RESPONSE_QUEUE,
205
205
  f"{self.NAMESPACE}.*.cmd.response.queue",
206
206
  retention='interest'
207
207
  )
208
-
209
- async def _ensure_response_immediate_stream(self):
210
- """Ensure RESPONSE_IMMEDIATE stream exists with Interest retention policy."""
211
208
  await self._ensure_stream(
212
209
  self.STREAM_RESPONSE_IMMEDIATE,
213
210
  f"{self.NAMESPACE}.*.cmd.response.immediate",
@@ -230,7 +227,17 @@ class MachineClient:
230
227
 
231
228
  async def _cleanup_subscriptions(self):
232
229
  """Unsubscribe from all subscriptions."""
233
- # Clean up subscriptions
230
+ # Clean up queue subscription (pull consumer)
231
+ if self._cmd_queue_task:
232
+ try:
233
+ self._cmd_queue_task.cancel()
234
+ await self._cmd_queue_task
235
+ except asyncio.CancelledError:
236
+ pass
237
+ except Exception:
238
+ pass
239
+ self._cmd_queue_task = None
240
+
234
241
  if self._cmd_queue_sub:
235
242
  try:
236
243
  await self._cmd_queue_sub.unsubscribe()
@@ -252,6 +259,7 @@ class MachineClient:
252
259
  self.kv = None
253
260
  # Subscriptions will be recreated on reconnection
254
261
  self._cmd_queue_sub = None
262
+ self._cmd_queue_task = None
255
263
  self._cmd_immediate_sub = None
256
264
 
257
265
  # ==================== CONNECTION MANAGEMENT ====================
@@ -270,10 +278,7 @@ class MachineClient:
270
278
  closed_cb=self._closed_callback
271
279
  )
272
280
  self.js = self.nc.jetstream()
273
- await self._ensure_command_queue_stream()
274
- await self._ensure_command_immediate_stream()
275
- await self._ensure_response_queue_stream()
276
- await self._ensure_response_immediate_stream()
281
+ await self._ensure_all_streams()
277
282
  self.kv = await self._get_or_create_kv_bucket()
278
283
  self._is_connected = True
279
284
  logger.info("Connected to NATS servers: %s", self.servers)
@@ -299,32 +304,16 @@ class MachineClient:
299
304
 
300
305
  if self.nc:
301
306
  self.js = self.nc.jetstream()
302
- await self._ensure_command_queue_stream()
303
- await self._ensure_command_immediate_stream()
304
- await self._ensure_response_queue_stream()
305
- await self._ensure_response_immediate_stream()
307
+ await self._ensure_all_streams()
306
308
  self.kv = await self._get_or_create_kv_bucket()
307
309
  await self._resubscribe_handlers()
308
310
 
309
311
  async def _resubscribe_handlers(self):
310
312
  """Re-subscribe to all handlers after reconnection."""
311
- subscribe_methods = {
312
- 'queue': self.subscribe_queue,
313
- 'immediate': self.subscribe_immediate,
314
- }
315
-
316
- for handler_info in self._reconnect_handlers:
317
- try:
318
- handler_type = handler_info['type']
319
- handler = handler_info['handler']
320
- subscribe_method = subscribe_methods.get(handler_type)
321
-
322
- if subscribe_method:
323
- await subscribe_method(handler)
324
- else:
325
- logger.warning("Unknown handler type: %s", handler_type)
326
- except Exception as e:
327
- logger.error("Failed to re-subscribe %s: %s", handler_type, e)
313
+ if self._queue_handler:
314
+ await self.subscribe_queue(self._queue_handler)
315
+ if self._immediate_handler:
316
+ await self.subscribe_immediate(self._immediate_handler)
328
317
 
329
318
  async def _closed_callback(self):
330
319
  """Callback when connection is closed."""
@@ -437,41 +426,35 @@ class MachineClient:
437
426
  logger.error("Error publishing command response: %s", e)
438
427
 
439
428
  async def process_queue_cmd(
440
- self,
429
+ self,
441
430
  msg: Msg,
442
- handler: Callable[[CommandRequest], Awaitable[CommandResponse]]
431
+ handler: Callable[[NATSMessage], Awaitable[CommandResponse]]
443
432
  ) -> None:
444
433
  """
445
434
  Handle the lifecycle of a single message: Parse -> Handle -> Ack/Nak/Term.
446
435
 
447
436
  Args:
448
437
  msg: NATS message
449
- handler: Handler function that processes the message and returns CommandResponse
438
+ handler: Handler function that processes the message and returns a CommandResponse object
450
439
  """
440
+ # Initialize variables for exception handlers
441
+ run_id = None
442
+ step_number = None
443
+ command = None
444
+
451
445
  try:
452
446
  # Parse message
453
447
  message = NATSMessage.model_validate_json(msg.data)
454
448
  run_id = message.header.run_id
455
- step_number = message.command.step_number
456
- command = message.command.name
449
+ step_number = message.command.step_number if message.command else None
450
+ command = message.command.name if message.command else None
457
451
 
458
- # Check if cancelled
459
- if run_id and run_id in self._cancelled_run_ids:
460
- logger.info("Skipping cancelled command: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
461
- await msg.ack()
462
- await self._publish_command_response(
463
- msg=msg,
464
- response=CommandResponse(
465
- status=CommandResponseStatus.ERROR,
466
- code=CommandResponseCode.COMMAND_CANCELLED,
467
- message='Command cancelled'
468
- ),
469
- subject=self.response_queue
470
- )
471
- # Note: Final state update should be published by the handler with machine-specific data
472
- return
452
+ # For all commands, continue with normal processing:
453
+ # 1. Check if paused
454
+ # 2. Validate run_id matches active run
455
+ # 3. Execute handler
473
456
 
474
- # Check if paused (for queue messages)
457
+ # If machine is paused, publish error response and return
475
458
  async with self._pause_lock:
476
459
  if self._is_paused:
477
460
  await self._publish_command_response(
@@ -484,24 +467,42 @@ class MachineClient:
484
467
  subject=self.response_queue
485
468
  )
486
469
  return
487
- while self._is_paused:
488
- await msg.in_progress()
489
- await asyncio.sleep(1)
490
- # Re-check cancelled state in case it was cancelled while paused
491
- if run_id and run_id in self._cancelled_run_ids:
492
- logger.info("Command cancelled while paused: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
493
- await msg.ack()
494
- await self._publish_command_response(
495
- msg=msg,
496
- response=CommandResponse(
497
- status=CommandResponseStatus.ERROR,
498
- code=CommandResponseCode.COMMAND_CANCELLED,
499
- message='Command cancelled'
500
- ),
501
- subject=self.response_queue
502
- )
503
- # Note: Final state update should be published by the handler with machine-specific data
504
- return
470
+
471
+ # Wait while paused (release lock during wait so RESUME can acquire it)
472
+ while True:
473
+ async with self._pause_lock:
474
+ if not self._is_paused:
475
+ break
476
+ # Release lock before sleeping so RESUME can set _is_paused = False
477
+ await msg.in_progress()
478
+ await asyncio.sleep(1)
479
+
480
+ # Validate run_id matches active run (run_id is required)
481
+ if run_id is None:
482
+ await msg.ack()
483
+ await self._publish_command_response(
484
+ msg=msg,
485
+ response=CommandResponse(
486
+ status=CommandResponseStatus.ERROR,
487
+ code=CommandResponseCode.EXECUTION_ERROR,
488
+ message='Command requires run_id'
489
+ ),
490
+ subject=self.response_queue
491
+ )
492
+ return
493
+
494
+ if not await self.run_manager.validate_run_id(run_id):
495
+ await msg.ack()
496
+ await self._publish_command_response(
497
+ msg=msg,
498
+ response=CommandResponse(
499
+ status=CommandResponseStatus.ERROR,
500
+ code=CommandResponseCode.RUN_ID_MISMATCH,
501
+ message=f'Run ID mismatch: expected active run, got {run_id}'
502
+ ),
503
+ subject=self.response_queue
504
+ )
505
+ return
505
506
 
506
507
  # Execute handler with auto-heartbeat (task might take a while for machine to complete)
507
508
  # The handler should be defined in the machine-specific edge module.
@@ -553,34 +554,19 @@ class MachineClient:
553
554
  # This is a rare case - consider if handler should be called with None payload
554
555
 
555
556
  except Exception as e:
556
- # Check if cancelled before sending error response
557
- if run_id and run_id in self._cancelled_run_ids:
558
- logger.info("Command cancelled during execution (exception occurred): run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
559
- await msg.ack()
560
- await self._publish_command_response(
561
- msg=msg,
562
- response=CommandResponse(
563
- status=CommandResponseStatus.ERROR,
564
- code=CommandResponseCode.COMMAND_CANCELLED,
565
- message='Command cancelled'
566
- ),
567
- subject=self.response_queue
568
- )
569
- # Note: Final state update should be published by the handler with machine-specific data
570
- else:
571
- # Terminate all errors to prevent infinite redelivery loops
572
- logger.error("Handler failed (terminating message): %s", e)
573
- await msg.term()
574
- await self._publish_command_response(
575
- msg=msg,
576
- response=CommandResponse(
577
- status=CommandResponseStatus.ERROR,
578
- code=CommandResponseCode.EXECUTION_ERROR,
579
- message=str(e)
580
- ),
581
- subject=self.response_queue
582
- )
583
- # Note: Final state update should be published by the handler with machine-specific data
557
+ # Terminate all errors to prevent infinite redelivery loops
558
+ logger.error("Handler failed (terminating message): %s", e)
559
+ await msg.term()
560
+ await self._publish_command_response(
561
+ msg=msg,
562
+ response=CommandResponse(
563
+ status=CommandResponseStatus.ERROR,
564
+ code=CommandResponseCode.EXECUTION_ERROR,
565
+ message=str(e)
566
+ ),
567
+ subject=self.response_queue
568
+ )
569
+ # Note: Final state update should be published by the handler with machine-specific data
584
570
 
585
571
  async def process_immediate_cmd(self, msg: Msg, handler: Callable[[CommandRequest], Awaitable[CommandResponse]]) -> None:
586
572
  """Process immediate commands (pause, cancel, resume, etc.)."""
@@ -595,8 +581,49 @@ class MachineClient:
595
581
  return
596
582
 
597
583
  command_name = message.command.name.lower()
584
+ run_id = message.header.run_id
585
+ response: CommandResponse
598
586
 
599
587
  match command_name:
588
+ case ImmediateCommand.START:
589
+ if run_id:
590
+ success = await self.run_manager.start_run(run_id)
591
+ if not success:
592
+ # Run already active
593
+ response = CommandResponse(
594
+ status=CommandResponseStatus.ERROR,
595
+ code=CommandResponseCode.RUN_ID_MISMATCH,
596
+ message='cannot start, another run is currently running'
597
+ )
598
+ else:
599
+ await self.publish_state({'state': 'active', 'run_id': run_id})
600
+ response = CommandResponse(status=CommandResponseStatus.SUCCESS)
601
+ else:
602
+ response = CommandResponse(
603
+ status=CommandResponseStatus.ERROR,
604
+ code=CommandResponseCode.MISSING_RUN_ID,
605
+ message='START command requires RUN_ID'
606
+ )
607
+
608
+ case ImmediateCommand.COMPLETE:
609
+ if not run_id:
610
+ response = CommandResponse(
611
+ status=CommandResponseStatus.ERROR,
612
+ code=CommandResponseCode.MISSING_RUN_ID,
613
+ message='COMPLETE command requires RUN_ID'
614
+ )
615
+ else:
616
+ success = await self.run_manager.complete_run(run_id)
617
+ if success:
618
+ await self.publish_state({'state': 'idle', 'run_id': None})
619
+ response = CommandResponse(status=CommandResponseStatus.SUCCESS)
620
+ else:
621
+ response = CommandResponse(
622
+ status=CommandResponseStatus.ERROR,
623
+ code=CommandResponseCode.RUN_ID_MISMATCH,
624
+ message=f'Run {run_id} not active'
625
+ )
626
+
600
627
  case ImmediateCommand.PAUSE:
601
628
  async with self._pause_lock:
602
629
  if not self._is_paused:
@@ -604,7 +631,7 @@ class MachineClient:
604
631
  logger.info("Queue paused")
605
632
  await self.publish_state({'state': 'paused', 'run_id': message.header.run_id})
606
633
  # Call handler and use its response
607
- response: CommandResponse = await handler(message)
634
+ response = await handler(message)
608
635
 
609
636
  case ImmediateCommand.RESUME:
610
637
  async with self._pause_lock:
@@ -613,19 +640,30 @@ class MachineClient:
613
640
  logger.info("Queue resumed")
614
641
  await self.publish_state({'state': 'idle', 'run_id': None})
615
642
  # Call handler and use its response
616
- response: CommandResponse = await handler(message)
643
+ response = await handler(message)
617
644
 
618
645
  case ImmediateCommand.CANCEL:
619
- if message.header.run_id:
620
- self._cancelled_run_ids.add(message.header.run_id)
621
- logger.info("Cancelling all commands with run_id: %s", message.header.run_id)
646
+ if not run_id:
647
+ response = CommandResponse(
648
+ status=CommandResponseStatus.ERROR,
649
+ code=CommandResponseCode.MISSING_RUN_ID,
650
+ message='CANCEL command requires RUN_ID'
651
+ )
652
+ else:
653
+ logger.info("Cancelling all commands with run_id: %s", run_id)
654
+ # Clear the active run_id when cancelling (try to complete, but clear anyway)
655
+ await self.run_manager.complete_run(run_id)
622
656
  await self.publish_state({'state': 'idle', 'run_id': None})
623
- # Call handler and use its response
624
- response: CommandResponse = await handler(message)
657
+ # Call handler and use its response
658
+ response = await handler(message)
625
659
 
626
660
  case _:
627
- # For other immediate commands, call the user-provided handler
628
- response: CommandResponse = await handler(message)
661
+ # Unknown immediate command
662
+ response = CommandResponse(
663
+ status=CommandResponseStatus.ERROR,
664
+ code=CommandResponseCode.UNKNOWN_COMMAND,
665
+ message=f'Unknown immediate command: {command_name}'
666
+ )
629
667
 
630
668
  await self._publish_command_response(
631
669
  msg=msg,
@@ -661,9 +699,54 @@ class MachineClient:
661
699
  )
662
700
  await self.publish_state({'state': 'error', 'run_id': None})
663
701
 
702
async def _verify_or_recreate_consumer(self, durable_name: str):
    """
    Inspect an existing durable consumer and delete it if its config is stale.

    Looks the consumer up on the command-queue stream and compares its
    filter subject, ack policy and deliver policy with the values this
    client expects. On any mismatch the consumer is deleted (best effort);
    this method does not create a consumer itself.

    Args:
        durable_name: Name of the durable consumer to verify
    """
    stream = self.STREAM_COMMAND_QUEUE

    # Only the lookup can report a missing consumer, so keep the try narrow.
    try:
        info = await self.js.consumer_info(stream, durable_name)
    except NotFoundError:
        # Nothing to verify; the consumer is expected to be created later.
        logger.debug("Durable consumer %s does not exist, will be created", durable_name)
        return

    logger.debug("Durable consumer %s already exists", durable_name)

    # Read each setting once; a missing attribute counts as a mismatch.
    cfg = info.config
    actual_filter = getattr(cfg, 'filter_subject', None)
    actual_ack = getattr(cfg, 'ack_policy', None)
    actual_deliver = getattr(cfg, 'deliver_policy', None)

    mismatched = False
    if actual_filter != self.cmd_queue:
        logger.warning("Consumer filter_subject mismatch: expected %s, got %s",
                       self.cmd_queue, actual_filter)
        mismatched = True
    if actual_ack != 'explicit':
        logger.warning("Consumer ack_policy mismatch: expected explicit, got %s",
                       actual_ack)
        mismatched = True
    if actual_deliver != 'all':
        logger.warning("Consumer deliver_policy mismatch: expected all, got %s",
                       actual_deliver)
        mismatched = True

    if not mismatched:
        # Config is fine: just record the consumer's backlog for diagnostics.
        logger.info("Consumer exists with correct config - pending: %d, delivered: %d, ack_pending: %d",
                    info.num_pending, info.delivered.consumer_seq,
                    info.num_ack_pending)
        return

    # Stale config: remove the consumer. A failed delete is logged, not raised.
    logger.info("Consumer config mismatch, deleting and recreating: %s", durable_name)
    try:
        await self.js.delete_consumer(stream, durable_name)
    except Exception as e:
        logger.warning("Error deleting consumer: %s", e)
746
+
664
747
  async def subscribe_queue(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
665
748
  """
666
- Subscribe to queue commands with default consumer.
749
+ Subscribe to queue commands with pull consumer.
667
750
 
668
751
  Args:
669
752
  handler: Async function that processes command payloads and returns CommandResponse
@@ -671,21 +754,69 @@ class MachineClient:
671
754
  if not self.js:
672
755
  logger.error("JetStream not available for queue subscription")
673
756
  return
757
+
758
+ # Store handler for reconnection
759
+ self._queue_handler = handler
674
760
 
675
761
  # Ensure stream exists before attempting to subscribe
676
- await self._ensure_command_queue_stream()
762
+ await self._ensure_all_streams()
677
763
 
678
764
  try:
679
- async def message_handler(msg: Msg):
680
- """Wrapper to process queue messages."""
681
- await self.process_queue_cmd(msg, handler)
682
-
683
- self._cmd_queue_sub = await self.js.subscribe(
765
+ durable_name = f"cmd_queue_{self.machine_id}"
766
+
767
+ await self._verify_or_recreate_consumer(durable_name)
768
+
769
+ # Create pull subscription - this will create the consumer if it doesn't exist
770
+ # Pass config directly to ensure correct consumer configuration
771
+ consumer_config = ConsumerConfig(
772
+ durable_name=durable_name,
773
+ filter_subject=self.cmd_queue,
774
+ ack_policy="explicit",
775
+ deliver_policy="all", # Required for WorkQueue: deliver all messages from the beginning
776
+ )
777
+
778
+ self._cmd_queue_sub = await self.js.pull_subscribe(
684
779
  subject=self.cmd_queue,
780
+ durable=durable_name,
685
781
  stream=self.STREAM_COMMAND_QUEUE,
686
- durable=f"cmd_queue_{self.machine_id}",
687
- cb=message_handler
782
+ config=consumer_config
688
783
  )
784
+
785
+ # Log final consumer info for diagnostics
786
+ try:
787
+ consumer_info = await self.js.consumer_info(self.STREAM_COMMAND_QUEUE, durable_name)
788
+ logger.info("Pull subscription created - subject: %s, durable: %s, stream: %s, pending: %d, ack_pending: %d",
789
+ self.cmd_queue, durable_name, self.STREAM_COMMAND_QUEUE,
790
+ consumer_info.num_pending, consumer_info.num_ack_pending)
791
+ except Exception as e:
792
+ logger.warning("Could not get consumer info after subscription: %s", e)
793
+ logger.info("Pull subscription created - subject: %s, durable: %s, stream: %s",
794
+ self.cmd_queue, durable_name, self.STREAM_COMMAND_QUEUE)
795
+
796
+ # Start background task to pull and process messages
797
+ async def pull_messages():
798
+ """Continuously pull messages from the queue."""
799
+ try:
800
+ while True:
801
+ try:
802
+ # Fetch one message (timeout 1 second)
803
+ msgs = await self._cmd_queue_sub.fetch(batch=1, timeout=1.0)
804
+ if msgs:
805
+ logger.debug("Pulled message from queue")
806
+ await self.process_queue_cmd(msgs[0], handler)
807
+ except asyncio.TimeoutError:
808
+ # Timeout is expected when no messages are available
809
+ continue
810
+ except Exception as e:
811
+ logger.error("Error pulling queue messages: %s", e, exc_info=True)
812
+ await asyncio.sleep(1) # Wait before retrying
813
+ except asyncio.CancelledError:
814
+ logger.debug("Queue pull task cancelled")
815
+ raise
816
+
817
+ self._cmd_queue_task = asyncio.create_task(pull_messages())
818
+ logger.info("Started background task for pulling queue messages")
819
+
689
820
  except NotFoundError:
690
821
  # Stream still not found after ensuring it exists - this shouldn't happen
691
822
  # but handle it gracefully with detailed diagnostics
@@ -703,10 +834,7 @@ class MachineClient:
703
834
  logger.error(" Stream verification failed: %s", stream_check_error)
704
835
  raise
705
836
 
706
- # Register handler for reconnection
707
- if not any(h['type'] == 'queue' for h in self._reconnect_handlers):
708
- self._reconnect_handlers.append({'type': 'queue', 'handler': handler})
709
- logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s)",
837
+ logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s, pull consumer)",
710
838
  self.cmd_queue, self.machine_id, self.STREAM_COMMAND_QUEUE)
711
839
 
712
840
  async def subscribe_immediate(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
@@ -720,19 +848,26 @@ class MachineClient:
720
848
  logger.error("JetStream not available for immediate subscription")
721
849
  return
722
850
 
851
+ # Store handler for use in callback and reconnection
852
+ self._immediate_handler = handler
853
+
723
854
  async def message_handler(msg: Msg):
724
- """Wrapper to process immediate messages."""
725
- await self.process_immediate_cmd(msg, handler)
855
+ """Process immediate messages using stored handler."""
856
+ await self.process_immediate_cmd(msg, self._immediate_handler)
726
857
 
727
858
  # Ensure stream exists before attempting to subscribe
728
- await self._ensure_command_immediate_stream()
859
+ await self._ensure_stream(
860
+ self.STREAM_COMMAND_IMMEDIATE,
861
+ f"{self.NAMESPACE}.*.cmd.immediate",
862
+ retention='workqueue'
863
+ )
729
864
 
730
865
  try:
731
866
  self._cmd_immediate_sub = await self.js.subscribe(
732
867
  subject=self.cmd_immediate,
733
868
  stream=self.STREAM_COMMAND_IMMEDIATE,
734
869
  durable=f"cmd_immed_{self.machine_id}",
735
- cb=message_handler
870
+ cb=message_handler # required for push consumer to handle messages
736
871
  )
737
872
  except NotFoundError:
738
873
  # Stream still not found after ensuring it exists - this shouldn't happen
@@ -741,9 +876,6 @@ class MachineClient:
741
876
  self.STREAM_COMMAND_IMMEDIATE)
742
877
  raise
743
878
 
744
- # Register handler for reconnection
745
- if not any(h['type'] == 'immediate' for h in self._reconnect_handlers):
746
- self._reconnect_handlers.append({'type': 'immediate', 'handler': handler})
747
879
  logger.info("Subscribed to immediate commands: %s (durable: cmd_immed_%s, stream: %s)",
748
880
  self.cmd_immediate, self.machine_id, self.STREAM_COMMAND_IMMEDIATE)
749
881
 
puda_comms/models.py CHANGED
@@ -25,6 +25,7 @@ class CommandResponseCode(str, Enum):
25
25
  RESUME_ERROR = 'RESUME_ERROR'
26
26
  NO_EXECUTION = 'NO_EXECUTION'
27
27
  RUN_ID_MISMATCH = 'RUN_ID_MISMATCH'
28
+ MISSING_RUN_ID = 'MISSING_RUN_ID'
28
29
  CANCEL_ERROR = 'CANCEL_ERROR'
29
30
  MACHINE_PAUSED = 'MACHINE_PAUSED'
30
31
 
@@ -40,6 +41,8 @@ class MessageType(str, Enum):
40
41
 
41
42
  class ImmediateCommand(str, Enum):
42
43
  """Command names for immediate commands."""
44
+ START = 'start'
45
+ COMPLETE = 'complete'
43
46
  PAUSE = 'pause'
44
47
  RESUME = 'resume'
45
48
  CANCEL = 'cancel'
@@ -68,18 +71,19 @@ class CommandResponse(BaseModel):
68
71
 
69
72
  class MessageHeader(BaseModel):
70
73
  """Header for NATS messages."""
71
- message_type: MessageType = Field(description="Type of message")
72
74
  version: str = Field(default="1.0", description="Message version")
73
- timestamp: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
75
+ message_type: MessageType = Field(description="Type of message")
76
+ user_id: str = Field(description="User ID")
77
+ username: str = Field(description="User name")
74
78
  machine_id: str = Field(description="Machine ID")
75
79
  run_id: Optional[str] = Field(default=None, description="Unique identifier (uuid) for the run/workflow")
76
-
80
+ timestamp: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
77
81
  class NATSMessage(BaseModel):
78
82
  """
79
83
  Complete NATS message structure.
80
84
 
81
85
  Structure:
82
- - header: MessageHeader with message_type, version, timestamp, machine_id, run_id
86
+ - header: MessageHeader with message_type, version, timestamp, user_id, username, machine_id, run_id
83
87
  - command: Optional CommandRequest (for command messages)
84
88
  - response: Optional CommandResponse data (for response messages)
85
89
  """
@@ -0,0 +1,112 @@
1
+ """
2
+ Run State Management
3
+ Provides thread-safe run state tracking and validation for machine commands.
4
+ """
5
+ import asyncio
6
+ import logging
7
+ from typing import Optional
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class RunManager:
    """
    Manages run state for a machine.

    Tracks the single active run_id and validates that commands match it.
    State changes are serialized with an ``asyncio.Lock``, which makes them
    safe across concurrent coroutines on one event loop. ``asyncio.Lock`` is
    not thread-safe, so an instance must not be shared between threads.
    """

    def __init__(self, machine_id: str):
        """
        Initialize RunManager for a machine.

        Args:
            machine_id: Machine identifier
        """
        self.machine_id = machine_id
        self._active_run_id: Optional[str] = None
        self._lock = asyncio.Lock()

    async def start_run(self, run_id: str) -> bool:
        """
        Set the active run_id.

        Args:
            run_id: Run ID to set as active (must be non-empty)

        Returns:
            True if the run was started, False if run_id is empty/None or
            another run is already active
        """
        # A falsy run_id would otherwise be stored as the "active" run while
        # the manager still looks idle (None) or holds a meaningless "" id.
        if not run_id:
            logger.warning(
                "Cannot start run: missing run_id on machine %s",
                self.machine_id
            )
            return False

        async with self._lock:
            if self._active_run_id is not None:
                logger.warning(
                    "Cannot start run %s: run %s is already active on machine %s",
                    run_id, self._active_run_id, self.machine_id
                )
                return False

            self._active_run_id = run_id
            logger.info("Started run %s on machine %s", run_id, self.machine_id)
            return True

    async def complete_run(self, run_id: str) -> bool:
        """
        Clear the active run_id if it matches.

        Args:
            run_id: Run ID to complete (must be non-empty)

        Returns:
            True if the run was completed, False if run_id is empty/None or
            does not match the active run
        """
        # Without this guard, complete_run(None) on an idle machine compares
        # None to None and reports a successful completion of "run None".
        if not run_id:
            logger.warning(
                "Cannot complete run: missing run_id on machine %s",
                self.machine_id
            )
            return False

        async with self._lock:
            if self._active_run_id != run_id:
                logger.warning(
                    "Cannot complete run %s: active run is %s on machine %s",
                    run_id, self._active_run_id, self.machine_id
                )
                return False

            self._active_run_id = None
            logger.info("Completed run %s on machine %s", run_id, self.machine_id)
            return True

    async def validate_run_id(self, run_id: str) -> bool:
        """
        Check if run_id matches the active run.

        Args:
            run_id: Run ID to validate (required)

        Returns:
            True if run_id matches the active run, False otherwise
            (including when no run is active)
        """
        async with self._lock:
            # If no active run, any run_id is invalid
            if self._active_run_id is None:
                logger.warning(
                    "Run ID validation failed: no active run, got %s on machine %s",
                    run_id, self.machine_id
                )
                return False

            # Run_id must match active run
            if self._active_run_id != run_id:
                logger.warning(
                    "Run ID validation failed: expected %s, got %s on machine %s",
                    self._active_run_id, run_id, self.machine_id
                )
                return False

            return True

    def get_active_run_id(self) -> Optional[str]:
        """
        Get current active run_id.

        Reads the value without taking the lock.

        Returns:
            Active run_id if one exists, None otherwise
        """
        return self._active_run_id
112
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: puda-comms
3
- Version: 0.0.3
3
+ Version: 0.0.5
4
4
  Summary: Communication library for the PUDA platform.
5
5
  Author: zhao
6
6
  Author-email: zhao <20024592+agentzhao@users.noreply.github.com>
@@ -121,6 +121,8 @@ Header metadata for NATS messages.
121
121
  - `message_type` (MessageType): Type of message (COMMAND, RESPONSE, LOG, etc.)
122
122
  - `version` (str): Message version (default: "1.0")
123
123
  - `timestamp` (str): ISO 8601 UTC timestamp (auto-generated)
124
+ - `user_id` (str): ID of the user who initiated the command
125
+ - `username` (str): Name of the user who initiated the command
124
126
  - `machine_id` (str): Identifier for the target machine
125
127
  - `run_id` (Optional[str]): Unique identifier (UUID) for the run/workflow
126
128
 
@@ -130,6 +132,8 @@ header = MessageHeader(
130
132
  message_type=MessageType.RESPONSE,
131
133
  version="1.0",
132
134
  timestamp="2026-01-20T02:00:46Z",
135
+ user_id="user123",
136
+ username="John Doe",
133
137
  machine_id="first",
134
138
  run_id="092073e6-13d0-4756-8d99-eff1612a5a72"
135
139
  )
@@ -154,6 +158,8 @@ Complete NATS message structure combining header with optional command or respon
154
158
  "message_type": "response",
155
159
  "version": "1.0",
156
160
  "timestamp": "2026-01-20T02:00:46Z",
161
+ "user_id": "user123",
162
+ "username": "John Doe",
157
163
  "machine_id": "first",
158
164
  "run_id": "092073e6-13d0-4756-8d99-eff1612a5a72"
159
165
  },
@@ -229,6 +235,8 @@ reply = await service.send_queue_command(
229
235
  request=request,
230
236
  machine_id="first",
231
237
  run_id=run_id,
238
+ user_id="user123",
239
+ username="John Doe",
232
240
  timeout=60 # Wait up to 60 seconds
233
241
  )
234
242
 
@@ -237,6 +245,8 @@ reply = await service.send_queue_commands(
237
245
  requests=commands,
238
246
  machine_id="first",
239
247
  run_id=run_id,
248
+ user_id="user123",
249
+ username="John Doe",
240
250
  timeout=60 # Wait up to 60 seconds per command
241
251
  )
242
252
  ```
@@ -274,7 +284,9 @@ Always check the response status and handle errors appropriately:
274
284
  reply: NATSMessage = await service.send_queue_command(
275
285
  request=request,
276
286
  machine_id="first",
277
- run_id=run_id
287
+ run_id=run_id,
288
+ user_id="user123",
289
+ username="John Doe"
278
290
  )
279
291
 
280
292
  if reply is None:
@@ -0,0 +1,9 @@
1
+ puda_comms/__init__.py,sha256=lntvVFJJez_rv5lZy5mYj4_43B9Y3NRNzxWfBuSAQ1M,194
2
+ puda_comms/command_service.py,sha256=Lxk-CUan_DwftBZlSYO3VnddxaM9fYKxxhWF8VCqABY,30423
3
+ puda_comms/execution_state.py,sha256=aTaejCnJgg1y_FP-ymIC1GQzqC81FIWo0RZ18XzAQnA,2881
4
+ puda_comms/machine_client.py,sha256=OnA8we1c62n1aEFr0NfiapklHWXR-WFzq5FXQrvuUM8,39378
5
+ puda_comms/models.py,sha256=CfXq_Wxqk5OQo5VknXR-BdLIT2SM69s8cGxGYr9T8WI,3701
6
+ puda_comms/run_manager.py,sha256=_s4VYVGwtRMcduz95_DPIObso4uWRS24n5NH7AiGgjI,3591
7
+ puda_comms-0.0.5.dist-info/WHEEL,sha256=ZyFSCYkV2BrxH6-HRVRg3R9Fo7MALzer9KiPYqNxSbo,79
8
+ puda_comms-0.0.5.dist-info/METADATA,sha256=REBvcpJsUCxiFCKihVVReP0lh6IkJcBl4I8XohjhSHE,11512
9
+ puda_comms-0.0.5.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- puda_comms/__init__.py,sha256=lntvVFJJez_rv5lZy5mYj4_43B9Y3NRNzxWfBuSAQ1M,194
2
- puda_comms/command_service.py,sha256=E5kGzl2hjkSTubxv01nxuo9XMXHY5aTEsn-k3IDJVB8,24727
3
- puda_comms/execution_state.py,sha256=aTaejCnJgg1y_FP-ymIC1GQzqC81FIWo0RZ18XzAQnA,2881
4
- puda_comms/machine_client.py,sha256=r8oSnkRoqhKykvyR94kGlA1vRrCKLq-o9uNZQftxqDU,33120
5
- puda_comms/models.py,sha256=cVH5uKzyLmjzPeBcm3RIJMTkoynmxqe_P26GtZwlIN8,3500
6
- puda_comms-0.0.3.dist-info/WHEEL,sha256=ZyFSCYkV2BrxH6-HRVRg3R9Fo7MALzer9KiPYqNxSbo,79
7
- puda_comms-0.0.3.dist-info/METADATA,sha256=Fnf_YWeOZAcefPUTY976BUT95M0w-8bSqAhjVMkmjxA,11158
8
- puda_comms-0.0.3.dist-info/RECORD,,