puda-comms 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,7 +10,7 @@ import logging
10
10
  from typing import Dict, Any, Optional, Callable, Awaitable
11
11
  from datetime import datetime, timezone
12
12
  import nats
13
- from puda_comms.models import (
13
+ from .models import (
14
14
  CommandResponseStatus,
15
15
  CommandResponse,
16
16
  CommandResponseCode,
@@ -19,9 +19,10 @@ from puda_comms.models import (
19
19
  MessageType,
20
20
  ImmediateCommand,
21
21
  )
22
+ from .run_manager import RunManager
22
23
  from nats.js.client import JetStreamContext
23
24
  from nats.js.api import StreamConfig, ConsumerConfig
24
- from nats.js.errors import NotFoundError
25
+ from nats.js.errors import NotFoundError, Error as NATSError
25
26
  from nats.aio.msg import Msg
26
27
 
27
28
  logger = logging.getLogger(__name__)
@@ -80,7 +81,9 @@ class MachineClient:
80
81
  # Queue control state
81
82
  self._pause_lock = asyncio.Lock()
82
83
  self._is_paused = False
83
- self._cancelled_run_ids = set()
84
+
85
+ # Run state management
86
+ self.run_manager = RunManager(machine_id=machine_id)
84
87
 
85
88
  def _init_subjects(self):
86
89
  """Initialize all subject and stream names."""
@@ -423,7 +426,7 @@ class MachineClient:
423
426
  logger.error("Error publishing command response: %s", e)
424
427
 
425
428
  async def process_queue_cmd(
426
- self,
429
+ self,
427
430
  msg: Msg,
428
431
  handler: Callable[[NATSMessage], Awaitable[CommandResponse]]
429
432
  ) -> None:
@@ -432,32 +435,26 @@ class MachineClient:
432
435
 
433
436
  Args:
434
437
  msg: NATS message
435
- handler: Handler function that processes the message and returns CommandResponse
438
+ handler: Handler function that processes the message and returns a CommandResponse object
436
439
  """
440
+ # Initialize variables for exception handlers
441
+ run_id = None
442
+ step_number = None
443
+ command = None
444
+
437
445
  try:
438
446
  # Parse message
439
447
  message = NATSMessage.model_validate_json(msg.data)
440
448
  run_id = message.header.run_id
441
- step_number = message.command.step_number
442
- command = message.command.name
449
+ step_number = message.command.step_number if message.command else None
450
+ command = message.command.name if message.command else None
443
451
 
444
- # Check if cancelled
445
- if run_id and run_id in self._cancelled_run_ids:
446
- logger.info("Skipping cancelled command: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
447
- await msg.ack()
448
- await self._publish_command_response(
449
- msg=msg,
450
- response=CommandResponse(
451
- status=CommandResponseStatus.ERROR,
452
- code=CommandResponseCode.COMMAND_CANCELLED,
453
- message='Command cancelled'
454
- ),
455
- subject=self.response_queue
456
- )
457
- # Note: Final state update should be published by the handler with machine-specific data
458
- return
452
+ # For all commands, continue with normal processing:
453
+ # 1. Check if paused
454
+ # 2. Validate run_id matches active run
455
+ # 3. Execute handler
459
456
 
460
- # Check if paused (for queue messages)
457
+ # If machine is paused, publish error response and return
461
458
  async with self._pause_lock:
462
459
  if self._is_paused:
463
460
  await self._publish_command_response(
@@ -470,24 +467,57 @@ class MachineClient:
470
467
  subject=self.response_queue
471
468
  )
472
469
  return
473
- while self._is_paused:
474
- await msg.in_progress()
475
- await asyncio.sleep(1)
476
- # Re-check cancelled state in case it was cancelled while paused
477
- if run_id and run_id in self._cancelled_run_ids:
478
- logger.info("Command cancelled while paused: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
479
- await msg.ack()
480
- await self._publish_command_response(
481
- msg=msg,
482
- response=CommandResponse(
483
- status=CommandResponseStatus.ERROR,
484
- code=CommandResponseCode.COMMAND_CANCELLED,
485
- message='Command cancelled'
486
- ),
487
- subject=self.response_queue
488
- )
489
- # Note: Final state update should be published by the handler with machine-specific data
490
- return
470
+
471
+ # Wait while paused (release lock during wait so RESUME can acquire it)
472
+ while True:
473
+ async with self._pause_lock:
474
+ if not self._is_paused:
475
+ break
476
+ # Release lock before sleeping so RESUME can set _is_paused = False
477
+ await msg.in_progress()
478
+ await asyncio.sleep(1)
479
+
480
+ # Validate run_id matches active run (run_id is required)
481
+ if run_id is None:
482
+ await msg.ack()
483
+ await self._publish_command_response(
484
+ msg=msg,
485
+ response=CommandResponse(
486
+ status=CommandResponseStatus.ERROR,
487
+ code=CommandResponseCode.EXECUTION_ERROR,
488
+ message='Command requires run_id'
489
+ ),
490
+ subject=self.response_queue
491
+ )
492
+ return
493
+
494
+ # If active run_id is None, return error response
495
+ if self.run_manager.get_active_run_id() is None:
496
+ await msg.ack()
497
+ await self._publish_command_response(
498
+ msg=msg,
499
+ response=CommandResponse(
500
+ status=CommandResponseStatus.ERROR,
501
+ code=CommandResponseCode.RUN_ID_MISMATCH,
502
+ message='Send START command to start a run before sending commands'
503
+ ),
504
+ subject=self.response_queue
505
+ )
506
+ return
507
+
508
+ # If run_id does not match active run_id, return error response
509
+ if not await self.run_manager.validate_run_id(run_id):
510
+ await msg.ack()
511
+ await self._publish_command_response(
512
+ msg=msg,
513
+ response=CommandResponse(
514
+ status=CommandResponseStatus.ERROR,
515
+ code=CommandResponseCode.RUN_ID_MISMATCH,
516
+ message=f'Run ID mismatch: expected active run, got {run_id}'
517
+ ),
518
+ subject=self.response_queue
519
+ )
520
+ return
491
521
 
492
522
  # Execute handler with auto-heartbeat (task might take a while for machine to complete)
493
523
  # The handler should be defined in the machine-specific edge module.
@@ -497,7 +527,9 @@ class MachineClient:
497
527
  # Finalize message state based on response
498
528
  if response.status == CommandResponseStatus.SUCCESS:
499
529
  await msg.ack()
500
- else:
530
+ elif response.status == CommandResponseStatus.ERROR:
531
+ # just complete the run if the command failed
532
+ await self.run_manager.complete_run(run_id)
501
533
  await msg.term()
502
534
 
503
535
  await self._publish_command_response(
@@ -511,6 +543,7 @@ class MachineClient:
511
543
  # Handler was cancelled (e.g., via task cancellation)
512
544
  logger.info("Handler execution cancelled: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
513
545
  await msg.ack()
546
+ await self.run_manager.complete_run(run_id)
514
547
  await self._publish_command_response(
515
548
  msg=msg,
516
549
  response=CommandResponse(
@@ -525,6 +558,7 @@ class MachineClient:
525
558
  except json.JSONDecodeError as e:
526
559
  logger.error("JSON Decode Error. Terminating message.")
527
560
  await msg.term()
561
+ await self.run_manager.complete_run(run_id)
528
562
  await self._publish_command_response(
529
563
  msg=msg,
530
564
  response=CommandResponse(
@@ -539,34 +573,20 @@ class MachineClient:
539
573
  # This is a rare case - consider if handler should be called with None payload
540
574
 
541
575
  except Exception as e:
542
- # Check if cancelled before sending error response
543
- if run_id and run_id in self._cancelled_run_ids:
544
- logger.info("Command cancelled during execution (exception occurred): run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
545
- await msg.ack()
546
- await self._publish_command_response(
547
- msg=msg,
548
- response=CommandResponse(
549
- status=CommandResponseStatus.ERROR,
550
- code=CommandResponseCode.COMMAND_CANCELLED,
551
- message='Command cancelled'
552
- ),
553
- subject=self.response_queue
554
- )
555
- # Note: Final state update should be published by the handler with machine-specific data
556
- else:
557
- # Terminate all errors to prevent infinite redelivery loops
558
- logger.error("Handler failed (terminating message): %s", e)
559
- await msg.term()
560
- await self._publish_command_response(
561
- msg=msg,
562
- response=CommandResponse(
563
- status=CommandResponseStatus.ERROR,
564
- code=CommandResponseCode.EXECUTION_ERROR,
565
- message=str(e)
566
- ),
567
- subject=self.response_queue
568
- )
569
- # Note: Final state update should be published by the handler with machine-specific data
576
+ # Terminate all errors to prevent infinite redelivery loops
577
+ logger.error("Handler failed (terminating message): %s", e)
578
+ await msg.term()
579
+ await self.run_manager.complete_run(run_id)
580
+ await self._publish_command_response(
581
+ msg=msg,
582
+ response=CommandResponse(
583
+ status=CommandResponseStatus.ERROR,
584
+ code=CommandResponseCode.EXECUTION_ERROR,
585
+ message=str(e)
586
+ ),
587
+ subject=self.response_queue
588
+ )
589
+ # Note: Final state update should be published by the handler with machine-specific data
570
590
 
571
591
  async def process_immediate_cmd(self, msg: Msg, handler: Callable[[CommandRequest], Awaitable[CommandResponse]]) -> None:
572
592
  """Process immediate commands (pause, cancel, resume, etc.)."""
@@ -581,8 +601,49 @@ class MachineClient:
581
601
  return
582
602
 
583
603
  command_name = message.command.name.lower()
604
+ run_id = message.header.run_id
605
+ response: CommandResponse
584
606
 
585
607
  match command_name:
608
+ case ImmediateCommand.START:
609
+ if run_id:
610
+ success = await self.run_manager.start_run(run_id)
611
+ if not success:
612
+ # Run already active
613
+ response = CommandResponse(
614
+ status=CommandResponseStatus.ERROR,
615
+ code=CommandResponseCode.RUN_ID_MISMATCH,
616
+ message=f'cannot start, {self.run_manager.get_active_run_id()} is currently running'
617
+ )
618
+ else:
619
+ await self.publish_state({'state': 'active', 'run_id': run_id})
620
+ response = CommandResponse(status=CommandResponseStatus.SUCCESS)
621
+ else:
622
+ response = CommandResponse(
623
+ status=CommandResponseStatus.ERROR,
624
+ code=CommandResponseCode.MISSING_RUN_ID,
625
+ message='START command requires RUN_ID'
626
+ )
627
+
628
+ case ImmediateCommand.COMPLETE:
629
+ if not run_id:
630
+ response = CommandResponse(
631
+ status=CommandResponseStatus.ERROR,
632
+ code=CommandResponseCode.MISSING_RUN_ID,
633
+ message='COMPLETE command requires RUN_ID'
634
+ )
635
+ else:
636
+ success = await self.run_manager.complete_run(run_id)
637
+ if success:
638
+ await self.publish_state({'state': 'idle', 'run_id': None})
639
+ response = CommandResponse(status=CommandResponseStatus.SUCCESS)
640
+ else:
641
+ response = CommandResponse(
642
+ status=CommandResponseStatus.ERROR,
643
+ code=CommandResponseCode.RUN_ID_MISMATCH,
644
+ message=f'Run {run_id} not active'
645
+ )
646
+
586
647
  case ImmediateCommand.PAUSE:
587
648
  async with self._pause_lock:
588
649
  if not self._is_paused:
@@ -590,7 +651,7 @@ class MachineClient:
590
651
  logger.info("Queue paused")
591
652
  await self.publish_state({'state': 'paused', 'run_id': message.header.run_id})
592
653
  # Call handler and use its response
593
- response: CommandResponse = await handler(message)
654
+ response = await handler(message)
594
655
 
595
656
  case ImmediateCommand.RESUME:
596
657
  async with self._pause_lock:
@@ -599,19 +660,30 @@ class MachineClient:
599
660
  logger.info("Queue resumed")
600
661
  await self.publish_state({'state': 'idle', 'run_id': None})
601
662
  # Call handler and use its response
602
- response: CommandResponse = await handler(message)
663
+ response = await handler(message)
603
664
 
604
665
  case ImmediateCommand.CANCEL:
605
- if message.header.run_id:
606
- self._cancelled_run_ids.add(message.header.run_id)
607
- logger.info("Cancelling all commands with run_id: %s", message.header.run_id)
666
+ if not run_id:
667
+ response = CommandResponse(
668
+ status=CommandResponseStatus.ERROR,
669
+ code=CommandResponseCode.MISSING_RUN_ID,
670
+ message='CANCEL command requires RUN_ID'
671
+ )
672
+ else:
673
+ logger.info("Cancelling all commands with run_id: %s", run_id)
674
+ # Clear the active run_id when cancelling (try to complete, but clear anyway)
675
+ await self.run_manager.complete_run(run_id)
608
676
  await self.publish_state({'state': 'idle', 'run_id': None})
609
- # Call handler and use its response
610
- response: CommandResponse = await handler(message)
677
+ # Call handler and use its response
678
+ response = await handler(message)
611
679
 
612
680
  case _:
613
- # For other immediate commands, call the user-provided handler
614
- response: CommandResponse = await handler(message)
681
+ # Unknown immediate command
682
+ response = CommandResponse(
683
+ status=CommandResponseStatus.ERROR,
684
+ code=CommandResponseCode.UNKNOWN_COMMAND,
685
+ message=f'Unknown immediate command: {command_name}'
686
+ )
615
687
 
616
688
  await self._publish_command_response(
617
689
  msg=msg,
@@ -702,6 +774,9 @@ class MachineClient:
702
774
  if not self.js:
703
775
  logger.error("JetStream not available for queue subscription")
704
776
  return
777
+
778
+ # Store handler for reconnection
779
+ self._queue_handler = handler
705
780
 
706
781
  # Ensure stream exists before attempting to subscribe
707
782
  await self._ensure_all_streams()
@@ -744,12 +819,11 @@ class MachineClient:
744
819
  try:
745
820
  while True:
746
821
  try:
747
- # Fetch messages (batch of 1, timeout 1 second)
822
+ # Fetch one message (timeout 1 second)
748
823
  msgs = await self._cmd_queue_sub.fetch(batch=1, timeout=1.0)
749
824
  if msgs:
750
- logger.debug("Pulled %d message(s) from queue", len(msgs))
751
- for msg in msgs:
752
- await self.process_queue_cmd(msg, handler)
825
+ logger.debug("Pulled message from queue")
826
+ await self.process_queue_cmd(msgs[0], handler)
753
827
  except asyncio.TimeoutError:
754
828
  # Timeout is expected when no messages are available
755
829
  continue
@@ -780,8 +854,6 @@ class MachineClient:
780
854
  logger.error(" Stream verification failed: %s", stream_check_error)
781
855
  raise
782
856
 
783
- # Store handler for reconnection
784
- self._queue_handler = handler
785
857
  logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s, pull consumer)",
786
858
  self.cmd_queue, self.machine_id, self.STREAM_COMMAND_QUEUE)
787
859
 
@@ -810,21 +882,76 @@ class MachineClient:
810
882
  retention='workqueue'
811
883
  )
812
884
 
885
+ durable_name = f"cmd_immed_{self.machine_id}"
886
+
887
+ # Try to unsubscribe from existing subscription if it exists
888
+ if self._cmd_immediate_sub:
889
+ try:
890
+ await self._cmd_immediate_sub.unsubscribe()
891
+ logger.info("Unsubscribed from existing immediate command subscription")
892
+ except Exception as e:
893
+ logger.debug("Error unsubscribing from existing subscription: %s", e)
894
+ self._cmd_immediate_sub = None
895
+
896
+ # Try to delete existing consumer if it's bound (from previous run)
897
+ try:
898
+ await self.js.delete_consumer(self.STREAM_COMMAND_IMMEDIATE, durable_name)
899
+ logger.info("Deleted existing immediate consumer: %s", durable_name)
900
+ except NotFoundError:
901
+ # Consumer doesn't exist, which is fine
902
+ logger.debug("Consumer %s does not exist, will be created", durable_name)
903
+ except Exception as e:
904
+ error_msg = str(e).lower()
905
+ if "bound" in error_msg or "in use" in error_msg:
906
+ # Consumer is bound but we can't delete it - try to unsubscribe first
907
+ logger.warning("Consumer %s is bound to a subscription. Attempting to force delete...", durable_name)
908
+ # Wait a moment for any pending operations to complete
909
+ await asyncio.sleep(0.5)
910
+ try:
911
+ await self.js.delete_consumer(self.STREAM_COMMAND_IMMEDIATE, durable_name)
912
+ logger.info("Successfully deleted bound consumer: %s", durable_name)
913
+ except Exception as delete_error:
914
+ logger.warning("Could not delete bound consumer %s: %s. Will attempt to subscribe anyway.",
915
+ durable_name, delete_error)
916
+ else:
917
+ logger.warning("Error checking/deleting consumer %s: %s", durable_name, e)
918
+
813
919
  try:
814
920
  self._cmd_immediate_sub = await self.js.subscribe(
815
921
  subject=self.cmd_immediate,
816
922
  stream=self.STREAM_COMMAND_IMMEDIATE,
817
- durable=f"cmd_immed_{self.machine_id}",
923
+ durable=durable_name,
818
924
  cb=message_handler # required for push consumer to handle messages
819
925
  )
926
+ except NATSError as e:
927
+ error_msg = str(e).lower()
928
+ if "bound" in error_msg or "already bound" in error_msg:
929
+ # Consumer is still bound - try to delete it and retry
930
+ logger.warning("Consumer %s is still bound. Attempting to delete and retry...", durable_name)
931
+ try:
932
+ await self.js.delete_consumer(self.STREAM_COMMAND_IMMEDIATE, durable_name)
933
+ await asyncio.sleep(0.5) # Brief wait for cleanup
934
+ # Retry subscription
935
+ self._cmd_immediate_sub = await self.js.subscribe(
936
+ subject=self.cmd_immediate,
937
+ stream=self.STREAM_COMMAND_IMMEDIATE,
938
+ durable=durable_name,
939
+ cb=message_handler
940
+ )
941
+ logger.info("Successfully subscribed after deleting bound consumer")
942
+ except Exception as retry_error:
943
+ logger.error("Failed to subscribe after deleting bound consumer: %s", retry_error)
944
+ raise
945
+ else:
946
+ raise
820
947
  except NotFoundError:
821
948
  # Stream still not found after ensuring it exists - this shouldn't happen
822
949
  # but handle it gracefully
823
- logger.error("Stream %s not found even after creation attempt. Check NATS server configuration.",
950
+ logger.error("Stream %s not found even after creation attempt. Check NATS server configuration.",
824
951
  self.STREAM_COMMAND_IMMEDIATE)
825
952
  raise
826
953
 
827
- logger.info("Subscribed to immediate commands: %s (durable: cmd_immed_%s, stream: %s)",
954
+ logger.info("Subscribed to immediate commands: %s (durable: cmd_immed_%s, stream: %s)",
828
955
  self.cmd_immediate, self.machine_id, self.STREAM_COMMAND_IMMEDIATE)
829
956
 
830
957
 
puda_comms/models.py CHANGED
@@ -25,6 +25,7 @@ class CommandResponseCode(str, Enum):
25
25
  RESUME_ERROR = 'RESUME_ERROR'
26
26
  NO_EXECUTION = 'NO_EXECUTION'
27
27
  RUN_ID_MISMATCH = 'RUN_ID_MISMATCH'
28
+ MISSING_RUN_ID = 'MISSING_RUN_ID'
28
29
  CANCEL_ERROR = 'CANCEL_ERROR'
29
30
  MACHINE_PAUSED = 'MACHINE_PAUSED'
30
31
 
@@ -40,6 +41,8 @@ class MessageType(str, Enum):
40
41
 
41
42
  class ImmediateCommand(str, Enum):
42
43
  """Command names for immediate commands."""
44
+ START = 'start'
45
+ COMPLETE = 'complete'
43
46
  PAUSE = 'pause'
44
47
  RESUME = 'resume'
45
48
  CANCEL = 'cancel'
@@ -54,8 +57,10 @@ class CommandRequest(BaseModel):
54
57
  """Command request data for NATS messages."""
55
58
  name: str = Field(description="The command name (string) to send to the machine.")
56
59
  params: Dict[str, Any] = Field(default_factory=dict, description="The parameters to send to the machine.")
60
+ kwargs: Dict[str, Any] = Field(default_factory=dict, description="Additional keyword arguments (e.g., channels in Biologic).")
57
61
  step_number: int = Field(description="Execution step number (integer). Used to track the progress of a command.")
58
62
  version: str = Field(default="1.0", description="Command version.")
63
+ machine_id: str = Field(description="Machine ID to send the command to.")
59
64
 
60
65
 
61
66
  class CommandResponse(BaseModel):
@@ -64,7 +69,7 @@ class CommandResponse(BaseModel):
64
69
  completed_at: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
65
70
  code: Optional[CommandResponseCode] = Field(default=None, description="Error code")
66
71
  message: Optional[str] = Field(default=None, description="Error message (human-readable description)")
67
- data: Optional[Dict[str, Any]] = Field(default=None, description="Optional output data from the command handler")
72
+ data: Optional[Dict[Any, Any]] = Field(default=None, description="Optional output data from the command handler")
68
73
 
69
74
  class MessageHeader(BaseModel):
70
75
  """Header for NATS messages."""
@@ -75,6 +80,7 @@ class MessageHeader(BaseModel):
75
80
  machine_id: str = Field(description="Machine ID")
76
81
  run_id: Optional[str] = Field(default=None, description="Unique identifier (uuid) for the run/workflow")
77
82
  timestamp: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
83
+
78
84
  class NATSMessage(BaseModel):
79
85
  """
80
86
  Complete NATS message structure.
@@ -0,0 +1,112 @@
1
+ """
2
+ Run State Management
3
+ Provides thread-safe run state tracking and validation for machine commands.
4
+ """
5
+ import asyncio
6
+ import logging
7
+ from typing import Optional
8
+
9
logger = logging.getLogger(__name__)


class RunManager:
    """
    Track the single active run of one machine.

    At most one run_id is active at a time. Commands are validated against
    it, and the run lifecycle (start -> complete) is serialized through an
    ``asyncio.Lock``.

    Concurrency note: the lock makes the state transitions safe to share
    between coroutines running on the same event loop. It does NOT make the
    class thread-safe; ``asyncio.Lock`` only coordinates coroutines.
    """

    def __init__(self, machine_id: str):
        """
        Initialize RunManager for a machine.

        Args:
            machine_id: Machine identifier (used in log messages only)
        """
        self.machine_id = machine_id
        # None means "no run is active" (idle).
        self._active_run_id: Optional[str] = None
        self._lock = asyncio.Lock()

    async def start_run(self, run_id: str) -> bool:
        """
        Mark ``run_id`` as the active run.

        Args:
            run_id: Run ID to set as active (must be a non-empty string)

        Returns:
            True if the run was started. False if ``run_id`` is empty/None
            or if another run is already active.
        """
        # A missing run_id would leave the manager idle (or holding an empty
        # id) while reporting success, so reject it up front.
        if not run_id:
            logger.warning(
                "Cannot start run on machine %s: run_id is required",
                self.machine_id
            )
            return False

        async with self._lock:
            if self._active_run_id is not None:
                logger.warning(
                    "Cannot start run %s: run %s is already active on machine %s",
                    run_id, self._active_run_id, self.machine_id
                )
                return False

            self._active_run_id = run_id
            logger.info("Started run %s on machine %s", run_id, self.machine_id)
            return True

    async def complete_run(self, run_id: Optional[str]) -> bool:
        """
        Clear the active run if ``run_id`` matches it.

        Args:
            run_id: Run ID to complete. ``None`` is tolerated because error
                paths may call this before a run_id has been parsed; it
                never matches.

        Returns:
            True if the active run was cleared. False if ``run_id`` is None
            or does not match the active run (including when idle).
        """
        async with self._lock:
            # The explicit None check matters: when idle, None == None would
            # otherwise be reported as a successfully completed run.
            if run_id is None or self._active_run_id != run_id:
                logger.warning(
                    "Cannot complete run %s: active run is %s on machine %s",
                    run_id, self._active_run_id, self.machine_id
                )
                return False

            self._active_run_id = None
            logger.info("Completed run %s on machine %s", run_id, self.machine_id)
            return True

    async def validate_run_id(self, run_id: str) -> bool:
        """
        Check whether ``run_id`` is the currently active run.

        Args:
            run_id: Run ID to validate (required)

        Returns:
            True if ``run_id`` matches the active run, False otherwise
            (including when no run is active).
        """
        async with self._lock:
            # If no active run, any run_id is invalid
            if self._active_run_id is None:
                logger.warning(
                    "Run ID validation failed: no active run, got %s on machine %s",
                    run_id, self.machine_id
                )
                return False

            # Run_id must match active run
            if self._active_run_id != run_id:
                logger.warning(
                    "Run ID validation failed: expected %s, got %s on machine %s",
                    self._active_run_id, run_id, self.machine_id
                )
                return False

            return True

    def get_active_run_id(self) -> Optional[str]:
        """
        Get current active run_id.

        Reads the attribute without taking the lock, so the value is a
        snapshot that may change as soon as another coroutine runs.

        Returns:
            Active run_id if one exists, None otherwise
        """
        return self._active_run_id
112
+