puda-comms 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- puda_comms/__init__.py +5 -1
- puda_comms/command_service.py +261 -85
- puda_comms/machine_client.py +215 -88
- puda_comms/models.py +7 -1
- puda_comms/run_manager.py +112 -0
- puda_comms/stream_subscriber.py +388 -0
- {puda_comms-0.0.4.dist-info → puda_comms-0.0.6.dist-info}/METADATA +12 -13
- puda_comms-0.0.6.dist-info/RECORD +10 -0
- puda_comms-0.0.4.dist-info/RECORD +0 -8
- {puda_comms-0.0.4.dist-info → puda_comms-0.0.6.dist-info}/WHEEL +0 -0
puda_comms/machine_client.py
CHANGED
|
@@ -10,7 +10,7 @@ import logging
|
|
|
10
10
|
from typing import Dict, Any, Optional, Callable, Awaitable
|
|
11
11
|
from datetime import datetime, timezone
|
|
12
12
|
import nats
|
|
13
|
-
from
|
|
13
|
+
from .models import (
|
|
14
14
|
CommandResponseStatus,
|
|
15
15
|
CommandResponse,
|
|
16
16
|
CommandResponseCode,
|
|
@@ -19,9 +19,10 @@ from puda_comms.models import (
|
|
|
19
19
|
MessageType,
|
|
20
20
|
ImmediateCommand,
|
|
21
21
|
)
|
|
22
|
+
from .run_manager import RunManager
|
|
22
23
|
from nats.js.client import JetStreamContext
|
|
23
24
|
from nats.js.api import StreamConfig, ConsumerConfig
|
|
24
|
-
from nats.js.errors import NotFoundError
|
|
25
|
+
from nats.js.errors import NotFoundError, Error as NATSError
|
|
25
26
|
from nats.aio.msg import Msg
|
|
26
27
|
|
|
27
28
|
logger = logging.getLogger(__name__)
|
|
@@ -80,7 +81,9 @@ class MachineClient:
|
|
|
80
81
|
# Queue control state
|
|
81
82
|
self._pause_lock = asyncio.Lock()
|
|
82
83
|
self._is_paused = False
|
|
83
|
-
|
|
84
|
+
|
|
85
|
+
# Run state management
|
|
86
|
+
self.run_manager = RunManager(machine_id=machine_id)
|
|
84
87
|
|
|
85
88
|
def _init_subjects(self):
|
|
86
89
|
"""Initialize all subject and stream names."""
|
|
@@ -423,7 +426,7 @@ class MachineClient:
|
|
|
423
426
|
logger.error("Error publishing command response: %s", e)
|
|
424
427
|
|
|
425
428
|
async def process_queue_cmd(
|
|
426
|
-
self,
|
|
429
|
+
self,
|
|
427
430
|
msg: Msg,
|
|
428
431
|
handler: Callable[[NATSMessage], Awaitable[CommandResponse]]
|
|
429
432
|
) -> None:
|
|
@@ -432,32 +435,26 @@ class MachineClient:
|
|
|
432
435
|
|
|
433
436
|
Args:
|
|
434
437
|
msg: NATS message
|
|
435
|
-
handler: Handler function that processes the message and returns CommandResponse
|
|
438
|
+
handler: Handler function that processes the message and returns a CommandResponse object
|
|
436
439
|
"""
|
|
440
|
+
# Initialize variables for exception handlers
|
|
441
|
+
run_id = None
|
|
442
|
+
step_number = None
|
|
443
|
+
command = None
|
|
444
|
+
|
|
437
445
|
try:
|
|
438
446
|
# Parse message
|
|
439
447
|
message = NATSMessage.model_validate_json(msg.data)
|
|
440
448
|
run_id = message.header.run_id
|
|
441
|
-
step_number = message.command.step_number
|
|
442
|
-
command = message.command.name
|
|
449
|
+
step_number = message.command.step_number if message.command else None
|
|
450
|
+
command = message.command.name if message.command else None
|
|
443
451
|
|
|
444
|
-
#
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
await self._publish_command_response(
|
|
449
|
-
msg=msg,
|
|
450
|
-
response=CommandResponse(
|
|
451
|
-
status=CommandResponseStatus.ERROR,
|
|
452
|
-
code=CommandResponseCode.COMMAND_CANCELLED,
|
|
453
|
-
message='Command cancelled'
|
|
454
|
-
),
|
|
455
|
-
subject=self.response_queue
|
|
456
|
-
)
|
|
457
|
-
# Note: Final state update should be published by the handler with machine-specific data
|
|
458
|
-
return
|
|
452
|
+
# For all commands, continue with normal processing:
|
|
453
|
+
# 1. Check if paused
|
|
454
|
+
# 2. Validate run_id matches active run
|
|
455
|
+
# 3. Execute handler
|
|
459
456
|
|
|
460
|
-
#
|
|
457
|
+
# If machine is paused, publish error response and return
|
|
461
458
|
async with self._pause_lock:
|
|
462
459
|
if self._is_paused:
|
|
463
460
|
await self._publish_command_response(
|
|
@@ -470,24 +467,57 @@ class MachineClient:
|
|
|
470
467
|
subject=self.response_queue
|
|
471
468
|
)
|
|
472
469
|
return
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
if
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
470
|
+
|
|
471
|
+
# Wait while paused (release lock during wait so RESUME can acquire it)
|
|
472
|
+
while True:
|
|
473
|
+
async with self._pause_lock:
|
|
474
|
+
if not self._is_paused:
|
|
475
|
+
break
|
|
476
|
+
# Release lock before sleeping so RESUME can set _is_paused = False
|
|
477
|
+
await msg.in_progress()
|
|
478
|
+
await asyncio.sleep(1)
|
|
479
|
+
|
|
480
|
+
# Validate run_id matches active run (run_id is required)
|
|
481
|
+
if run_id is None:
|
|
482
|
+
await msg.ack()
|
|
483
|
+
await self._publish_command_response(
|
|
484
|
+
msg=msg,
|
|
485
|
+
response=CommandResponse(
|
|
486
|
+
status=CommandResponseStatus.ERROR,
|
|
487
|
+
code=CommandResponseCode.EXECUTION_ERROR,
|
|
488
|
+
message='Command requires run_id'
|
|
489
|
+
),
|
|
490
|
+
subject=self.response_queue
|
|
491
|
+
)
|
|
492
|
+
return
|
|
493
|
+
|
|
494
|
+
# If active run_id is None, return error response
|
|
495
|
+
if self.run_manager.get_active_run_id() is None:
|
|
496
|
+
await msg.ack()
|
|
497
|
+
await self._publish_command_response(
|
|
498
|
+
msg=msg,
|
|
499
|
+
response=CommandResponse(
|
|
500
|
+
status=CommandResponseStatus.ERROR,
|
|
501
|
+
code=CommandResponseCode.RUN_ID_MISMATCH,
|
|
502
|
+
message='Send START command to start a run before sending commands'
|
|
503
|
+
),
|
|
504
|
+
subject=self.response_queue
|
|
505
|
+
)
|
|
506
|
+
return
|
|
507
|
+
|
|
508
|
+
# If run_id does not match active run_id, return error response
|
|
509
|
+
if not await self.run_manager.validate_run_id(run_id):
|
|
510
|
+
await msg.ack()
|
|
511
|
+
await self._publish_command_response(
|
|
512
|
+
msg=msg,
|
|
513
|
+
response=CommandResponse(
|
|
514
|
+
status=CommandResponseStatus.ERROR,
|
|
515
|
+
code=CommandResponseCode.RUN_ID_MISMATCH,
|
|
516
|
+
message=f'Run ID mismatch: expected active run, got {run_id}'
|
|
517
|
+
),
|
|
518
|
+
subject=self.response_queue
|
|
519
|
+
)
|
|
520
|
+
return
|
|
491
521
|
|
|
492
522
|
# Execute handler with auto-heartbeat (task might take a while for machine to complete)
|
|
493
523
|
# The handler should be defined in the machine-specific edge module.
|
|
@@ -497,7 +527,9 @@ class MachineClient:
|
|
|
497
527
|
# Finalize message state based on response
|
|
498
528
|
if response.status == CommandResponseStatus.SUCCESS:
|
|
499
529
|
await msg.ack()
|
|
500
|
-
|
|
530
|
+
elif response.status == CommandResponseStatus.ERROR:
|
|
531
|
+
# just complete the run if the command failed
|
|
532
|
+
await self.run_manager.complete_run(run_id)
|
|
501
533
|
await msg.term()
|
|
502
534
|
|
|
503
535
|
await self._publish_command_response(
|
|
@@ -511,6 +543,7 @@ class MachineClient:
|
|
|
511
543
|
# Handler was cancelled (e.g., via task cancellation)
|
|
512
544
|
logger.info("Handler execution cancelled: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
|
|
513
545
|
await msg.ack()
|
|
546
|
+
await self.run_manager.complete_run(run_id)
|
|
514
547
|
await self._publish_command_response(
|
|
515
548
|
msg=msg,
|
|
516
549
|
response=CommandResponse(
|
|
@@ -525,6 +558,7 @@ class MachineClient:
|
|
|
525
558
|
except json.JSONDecodeError as e:
|
|
526
559
|
logger.error("JSON Decode Error. Terminating message.")
|
|
527
560
|
await msg.term()
|
|
561
|
+
await self.run_manager.complete_run(run_id)
|
|
528
562
|
await self._publish_command_response(
|
|
529
563
|
msg=msg,
|
|
530
564
|
response=CommandResponse(
|
|
@@ -539,34 +573,20 @@ class MachineClient:
|
|
|
539
573
|
# This is a rare case - consider if handler should be called with None payload
|
|
540
574
|
|
|
541
575
|
except Exception as e:
|
|
542
|
-
#
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
else:
|
|
557
|
-
# Terminate all errors to prevent infinite redelivery loops
|
|
558
|
-
logger.error("Handler failed (terminating message): %s", e)
|
|
559
|
-
await msg.term()
|
|
560
|
-
await self._publish_command_response(
|
|
561
|
-
msg=msg,
|
|
562
|
-
response=CommandResponse(
|
|
563
|
-
status=CommandResponseStatus.ERROR,
|
|
564
|
-
code=CommandResponseCode.EXECUTION_ERROR,
|
|
565
|
-
message=str(e)
|
|
566
|
-
),
|
|
567
|
-
subject=self.response_queue
|
|
568
|
-
)
|
|
569
|
-
# Note: Final state update should be published by the handler with machine-specific data
|
|
576
|
+
# Terminate all errors to prevent infinite redelivery loops
|
|
577
|
+
logger.error("Handler failed (terminating message): %s", e)
|
|
578
|
+
await msg.term()
|
|
579
|
+
await self.run_manager.complete_run(run_id)
|
|
580
|
+
await self._publish_command_response(
|
|
581
|
+
msg=msg,
|
|
582
|
+
response=CommandResponse(
|
|
583
|
+
status=CommandResponseStatus.ERROR,
|
|
584
|
+
code=CommandResponseCode.EXECUTION_ERROR,
|
|
585
|
+
message=str(e)
|
|
586
|
+
),
|
|
587
|
+
subject=self.response_queue
|
|
588
|
+
)
|
|
589
|
+
# Note: Final state update should be published by the handler with machine-specific data
|
|
570
590
|
|
|
571
591
|
async def process_immediate_cmd(self, msg: Msg, handler: Callable[[CommandRequest], Awaitable[CommandResponse]]) -> None:
|
|
572
592
|
"""Process immediate commands (pause, cancel, resume, etc.)."""
|
|
@@ -581,8 +601,49 @@ class MachineClient:
|
|
|
581
601
|
return
|
|
582
602
|
|
|
583
603
|
command_name = message.command.name.lower()
|
|
604
|
+
run_id = message.header.run_id
|
|
605
|
+
response: CommandResponse
|
|
584
606
|
|
|
585
607
|
match command_name:
|
|
608
|
+
case ImmediateCommand.START:
|
|
609
|
+
if run_id:
|
|
610
|
+
success = await self.run_manager.start_run(run_id)
|
|
611
|
+
if not success:
|
|
612
|
+
# Run already active
|
|
613
|
+
response = CommandResponse(
|
|
614
|
+
status=CommandResponseStatus.ERROR,
|
|
615
|
+
code=CommandResponseCode.RUN_ID_MISMATCH,
|
|
616
|
+
message=f'cannot start, {self.run_manager.get_active_run_id()} is currently running'
|
|
617
|
+
)
|
|
618
|
+
else:
|
|
619
|
+
await self.publish_state({'state': 'active', 'run_id': run_id})
|
|
620
|
+
response = CommandResponse(status=CommandResponseStatus.SUCCESS)
|
|
621
|
+
else:
|
|
622
|
+
response = CommandResponse(
|
|
623
|
+
status=CommandResponseStatus.ERROR,
|
|
624
|
+
code=CommandResponseCode.MISSING_RUN_ID,
|
|
625
|
+
message='START command requires RUN_ID'
|
|
626
|
+
)
|
|
627
|
+
|
|
628
|
+
case ImmediateCommand.COMPLETE:
|
|
629
|
+
if not run_id:
|
|
630
|
+
response = CommandResponse(
|
|
631
|
+
status=CommandResponseStatus.ERROR,
|
|
632
|
+
code=CommandResponseCode.MISSING_RUN_ID,
|
|
633
|
+
message='COMPLETE command requires RUN_ID'
|
|
634
|
+
)
|
|
635
|
+
else:
|
|
636
|
+
success = await self.run_manager.complete_run(run_id)
|
|
637
|
+
if success:
|
|
638
|
+
await self.publish_state({'state': 'idle', 'run_id': None})
|
|
639
|
+
response = CommandResponse(status=CommandResponseStatus.SUCCESS)
|
|
640
|
+
else:
|
|
641
|
+
response = CommandResponse(
|
|
642
|
+
status=CommandResponseStatus.ERROR,
|
|
643
|
+
code=CommandResponseCode.RUN_ID_MISMATCH,
|
|
644
|
+
message=f'Run {run_id} not active'
|
|
645
|
+
)
|
|
646
|
+
|
|
586
647
|
case ImmediateCommand.PAUSE:
|
|
587
648
|
async with self._pause_lock:
|
|
588
649
|
if not self._is_paused:
|
|
@@ -590,7 +651,7 @@ class MachineClient:
|
|
|
590
651
|
logger.info("Queue paused")
|
|
591
652
|
await self.publish_state({'state': 'paused', 'run_id': message.header.run_id})
|
|
592
653
|
# Call handler and use its response
|
|
593
|
-
response
|
|
654
|
+
response = await handler(message)
|
|
594
655
|
|
|
595
656
|
case ImmediateCommand.RESUME:
|
|
596
657
|
async with self._pause_lock:
|
|
@@ -599,19 +660,30 @@ class MachineClient:
|
|
|
599
660
|
logger.info("Queue resumed")
|
|
600
661
|
await self.publish_state({'state': 'idle', 'run_id': None})
|
|
601
662
|
# Call handler and use its response
|
|
602
|
-
response
|
|
663
|
+
response = await handler(message)
|
|
603
664
|
|
|
604
665
|
case ImmediateCommand.CANCEL:
|
|
605
|
-
if
|
|
606
|
-
|
|
607
|
-
|
|
666
|
+
if not run_id:
|
|
667
|
+
response = CommandResponse(
|
|
668
|
+
status=CommandResponseStatus.ERROR,
|
|
669
|
+
code=CommandResponseCode.MISSING_RUN_ID,
|
|
670
|
+
message='CANCEL command requires RUN_ID'
|
|
671
|
+
)
|
|
672
|
+
else:
|
|
673
|
+
logger.info("Cancelling all commands with run_id: %s", run_id)
|
|
674
|
+
# Clear the active run_id when cancelling (try to complete, but clear anyway)
|
|
675
|
+
await self.run_manager.complete_run(run_id)
|
|
608
676
|
await self.publish_state({'state': 'idle', 'run_id': None})
|
|
609
|
-
|
|
610
|
-
|
|
677
|
+
# Call handler and use its response
|
|
678
|
+
response = await handler(message)
|
|
611
679
|
|
|
612
680
|
case _:
|
|
613
|
-
#
|
|
614
|
-
response
|
|
681
|
+
# Unknown immediate command
|
|
682
|
+
response = CommandResponse(
|
|
683
|
+
status=CommandResponseStatus.ERROR,
|
|
684
|
+
code=CommandResponseCode.UNKNOWN_COMMAND,
|
|
685
|
+
message=f'Unknown immediate command: {command_name}'
|
|
686
|
+
)
|
|
615
687
|
|
|
616
688
|
await self._publish_command_response(
|
|
617
689
|
msg=msg,
|
|
@@ -702,6 +774,9 @@ class MachineClient:
|
|
|
702
774
|
if not self.js:
|
|
703
775
|
logger.error("JetStream not available for queue subscription")
|
|
704
776
|
return
|
|
777
|
+
|
|
778
|
+
# Store handler for reconnection
|
|
779
|
+
self._queue_handler = handler
|
|
705
780
|
|
|
706
781
|
# Ensure stream exists before attempting to subscribe
|
|
707
782
|
await self._ensure_all_streams()
|
|
@@ -744,12 +819,11 @@ class MachineClient:
|
|
|
744
819
|
try:
|
|
745
820
|
while True:
|
|
746
821
|
try:
|
|
747
|
-
# Fetch
|
|
822
|
+
# Fetch one message (timeout 1 second)
|
|
748
823
|
msgs = await self._cmd_queue_sub.fetch(batch=1, timeout=1.0)
|
|
749
824
|
if msgs:
|
|
750
|
-
logger.debug("Pulled
|
|
751
|
-
|
|
752
|
-
await self.process_queue_cmd(msg, handler)
|
|
825
|
+
logger.debug("Pulled message from queue")
|
|
826
|
+
await self.process_queue_cmd(msgs[0], handler)
|
|
753
827
|
except asyncio.TimeoutError:
|
|
754
828
|
# Timeout is expected when no messages are available
|
|
755
829
|
continue
|
|
@@ -780,8 +854,6 @@ class MachineClient:
|
|
|
780
854
|
logger.error(" Stream verification failed: %s", stream_check_error)
|
|
781
855
|
raise
|
|
782
856
|
|
|
783
|
-
# Store handler for reconnection
|
|
784
|
-
self._queue_handler = handler
|
|
785
857
|
logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s, pull consumer)",
|
|
786
858
|
self.cmd_queue, self.machine_id, self.STREAM_COMMAND_QUEUE)
|
|
787
859
|
|
|
@@ -810,21 +882,76 @@ class MachineClient:
|
|
|
810
882
|
retention='workqueue'
|
|
811
883
|
)
|
|
812
884
|
|
|
885
|
+
durable_name = f"cmd_immed_{self.machine_id}"
|
|
886
|
+
|
|
887
|
+
# Try to unsubscribe from existing subscription if it exists
|
|
888
|
+
if self._cmd_immediate_sub:
|
|
889
|
+
try:
|
|
890
|
+
await self._cmd_immediate_sub.unsubscribe()
|
|
891
|
+
logger.info("Unsubscribed from existing immediate command subscription")
|
|
892
|
+
except Exception as e:
|
|
893
|
+
logger.debug("Error unsubscribing from existing subscription: %s", e)
|
|
894
|
+
self._cmd_immediate_sub = None
|
|
895
|
+
|
|
896
|
+
# Try to delete existing consumer if it's bound (from previous run)
|
|
897
|
+
try:
|
|
898
|
+
await self.js.delete_consumer(self.STREAM_COMMAND_IMMEDIATE, durable_name)
|
|
899
|
+
logger.info("Deleted existing immediate consumer: %s", durable_name)
|
|
900
|
+
except NotFoundError:
|
|
901
|
+
# Consumer doesn't exist, which is fine
|
|
902
|
+
logger.debug("Consumer %s does not exist, will be created", durable_name)
|
|
903
|
+
except Exception as e:
|
|
904
|
+
error_msg = str(e).lower()
|
|
905
|
+
if "bound" in error_msg or "in use" in error_msg:
|
|
906
|
+
# Consumer is bound but we can't delete it - try to unsubscribe first
|
|
907
|
+
logger.warning("Consumer %s is bound to a subscription. Attempting to force delete...", durable_name)
|
|
908
|
+
# Wait a moment for any pending operations to complete
|
|
909
|
+
await asyncio.sleep(0.5)
|
|
910
|
+
try:
|
|
911
|
+
await self.js.delete_consumer(self.STREAM_COMMAND_IMMEDIATE, durable_name)
|
|
912
|
+
logger.info("Successfully deleted bound consumer: %s", durable_name)
|
|
913
|
+
except Exception as delete_error:
|
|
914
|
+
logger.warning("Could not delete bound consumer %s: %s. Will attempt to subscribe anyway.",
|
|
915
|
+
durable_name, delete_error)
|
|
916
|
+
else:
|
|
917
|
+
logger.warning("Error checking/deleting consumer %s: %s", durable_name, e)
|
|
918
|
+
|
|
813
919
|
try:
|
|
814
920
|
self._cmd_immediate_sub = await self.js.subscribe(
|
|
815
921
|
subject=self.cmd_immediate,
|
|
816
922
|
stream=self.STREAM_COMMAND_IMMEDIATE,
|
|
817
|
-
durable=
|
|
923
|
+
durable=durable_name,
|
|
818
924
|
cb=message_handler # required for push consumer to handle messages
|
|
819
925
|
)
|
|
926
|
+
except NATSError as e:
|
|
927
|
+
error_msg = str(e).lower()
|
|
928
|
+
if "bound" in error_msg or "already bound" in error_msg:
|
|
929
|
+
# Consumer is still bound - try to delete it and retry
|
|
930
|
+
logger.warning("Consumer %s is still bound. Attempting to delete and retry...", durable_name)
|
|
931
|
+
try:
|
|
932
|
+
await self.js.delete_consumer(self.STREAM_COMMAND_IMMEDIATE, durable_name)
|
|
933
|
+
await asyncio.sleep(0.5) # Brief wait for cleanup
|
|
934
|
+
# Retry subscription
|
|
935
|
+
self._cmd_immediate_sub = await self.js.subscribe(
|
|
936
|
+
subject=self.cmd_immediate,
|
|
937
|
+
stream=self.STREAM_COMMAND_IMMEDIATE,
|
|
938
|
+
durable=durable_name,
|
|
939
|
+
cb=message_handler
|
|
940
|
+
)
|
|
941
|
+
logger.info("Successfully subscribed after deleting bound consumer")
|
|
942
|
+
except Exception as retry_error:
|
|
943
|
+
logger.error("Failed to subscribe after deleting bound consumer: %s", retry_error)
|
|
944
|
+
raise
|
|
945
|
+
else:
|
|
946
|
+
raise
|
|
820
947
|
except NotFoundError:
|
|
821
948
|
# Stream still not found after ensuring it exists - this shouldn't happen
|
|
822
949
|
# but handle it gracefully
|
|
823
|
-
logger.error("Stream %s not found even after creation attempt. Check NATS server configuration.",
|
|
950
|
+
logger.error("Stream %s not found even after creation attempt. Check NATS server configuration.",
|
|
824
951
|
self.STREAM_COMMAND_IMMEDIATE)
|
|
825
952
|
raise
|
|
826
953
|
|
|
827
|
-
logger.info("Subscribed to immediate commands: %s (durable: cmd_immed_%s, stream: %s)",
|
|
954
|
+
logger.info("Subscribed to immediate commands: %s (durable: cmd_immed_%s, stream: %s)",
|
|
828
955
|
self.cmd_immediate, self.machine_id, self.STREAM_COMMAND_IMMEDIATE)
|
|
829
956
|
|
|
830
957
|
|
puda_comms/models.py
CHANGED
|
@@ -25,6 +25,7 @@ class CommandResponseCode(str, Enum):
|
|
|
25
25
|
RESUME_ERROR = 'RESUME_ERROR'
|
|
26
26
|
NO_EXECUTION = 'NO_EXECUTION'
|
|
27
27
|
RUN_ID_MISMATCH = 'RUN_ID_MISMATCH'
|
|
28
|
+
MISSING_RUN_ID = 'MISSING_RUN_ID'
|
|
28
29
|
CANCEL_ERROR = 'CANCEL_ERROR'
|
|
29
30
|
MACHINE_PAUSED = 'MACHINE_PAUSED'
|
|
30
31
|
|
|
@@ -40,6 +41,8 @@ class MessageType(str, Enum):
|
|
|
40
41
|
|
|
41
42
|
class ImmediateCommand(str, Enum):
|
|
42
43
|
"""Command names for immediate commands."""
|
|
44
|
+
START = 'start'
|
|
45
|
+
COMPLETE = 'complete'
|
|
43
46
|
PAUSE = 'pause'
|
|
44
47
|
RESUME = 'resume'
|
|
45
48
|
CANCEL = 'cancel'
|
|
@@ -54,8 +57,10 @@ class CommandRequest(BaseModel):
|
|
|
54
57
|
"""Command request data for NATS messages."""
|
|
55
58
|
name: str = Field(description="The command name (string) to send to the machine.")
|
|
56
59
|
params: Dict[str, Any] = Field(default_factory=dict, description="The parameters to send to the machine.")
|
|
60
|
+
kwargs: Dict[str, Any] = Field(default_factory=dict, description="Additional keyword arguments (e.g., channels in Biologic).")
|
|
57
61
|
step_number: int = Field(description="Execution step number (integer). Used to track the progress of a command.")
|
|
58
62
|
version: str = Field(default="1.0", description="Command version.")
|
|
63
|
+
machine_id: str = Field(description="Machine ID to send the command to.")
|
|
59
64
|
|
|
60
65
|
|
|
61
66
|
class CommandResponse(BaseModel):
|
|
@@ -64,7 +69,7 @@ class CommandResponse(BaseModel):
|
|
|
64
69
|
completed_at: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
|
|
65
70
|
code: Optional[CommandResponseCode] = Field(default=None, description="Error code")
|
|
66
71
|
message: Optional[str] = Field(default=None, description="Error message (human-readable description)")
|
|
67
|
-
data: Optional[Dict[
|
|
72
|
+
data: Optional[Dict[Any, Any]] = Field(default=None, description="Optional output data from the command handler")
|
|
68
73
|
|
|
69
74
|
class MessageHeader(BaseModel):
|
|
70
75
|
"""Header for NATS messages."""
|
|
@@ -75,6 +80,7 @@ class MessageHeader(BaseModel):
|
|
|
75
80
|
machine_id: str = Field(description="Machine ID")
|
|
76
81
|
run_id: Optional[str] = Field(default=None, description="Unique identifier (uuid) for the run/workflow")
|
|
77
82
|
timestamp: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
|
|
83
|
+
|
|
78
84
|
class NATSMessage(BaseModel):
|
|
79
85
|
"""
|
|
80
86
|
Complete NATS message structure.
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""
Run State Management

Tracks which run is currently active on a machine and validates that incoming
commands belong to that run.

State changes are serialized with an ``asyncio.Lock``. That makes them safe
across coroutines running on a single event loop; asyncio primitives are not
thread-safe, so a RunManager must not be shared between threads.
"""
import asyncio
import logging
from typing import Optional

logger = logging.getLogger(__name__)


class RunManager:
    """
    Manages run state for a machine.

    Holds at most one active run_id at a time and answers whether a given
    run_id is the active one. All mutating operations take an internal
    ``asyncio.Lock`` so concurrent coroutines observe consistent state.
    """

    def __init__(self, machine_id: str):
        """
        Initialize RunManager for a machine.

        Args:
            machine_id: Machine identifier (used in log messages only)
        """
        self.machine_id = machine_id
        # None means "no run is active".
        self._active_run_id: Optional[str] = None
        self._lock = asyncio.Lock()

    async def start_run(self, run_id: str) -> bool:
        """
        Set the active run_id.

        Args:
            run_id: Run ID to set as active

        Returns:
            True if the run was started, False if another run is already active
        """
        async with self._lock:
            if self._active_run_id is not None:
                logger.warning(
                    "Cannot start run %s: run %s is already active on machine %s",
                    run_id, self._active_run_id, self.machine_id
                )
                return False

            self._active_run_id = run_id
            logger.info("Started run %s on machine %s", run_id, self.machine_id)
            return True

    async def complete_run(self, run_id: Optional[str]) -> bool:
        """
        Clear the active run_id if it matches ``run_id``.

        Args:
            run_id: Run ID to complete. May be None when the caller failed
                before it could determine a run_id; that never completes
                anything.

        Returns:
            True if the active run was cleared, False if no run is active or
            run_id doesn't match the active run
        """
        async with self._lock:
            # Check "no active run" explicitly: comparing None != None is
            # False, which would otherwise report a successful completion of
            # a run that never existed.
            if self._active_run_id is None or self._active_run_id != run_id:
                logger.warning(
                    "Cannot complete run %s: active run is %s on machine %s",
                    run_id, self._active_run_id, self.machine_id
                )
                return False

            self._active_run_id = None
            logger.info("Completed run %s on machine %s", run_id, self.machine_id)
            return True

    async def validate_run_id(self, run_id: str) -> bool:
        """
        Check whether run_id matches the active run.

        Args:
            run_id: Run ID to validate (required)

        Returns:
            True if run_id matches the active run, False otherwise
        """
        async with self._lock:
            # If no active run, any run_id is invalid
            if self._active_run_id is None:
                logger.warning(
                    "Run ID validation failed: no active run, got %s on machine %s",
                    run_id, self.machine_id
                )
                return False

            # Run_id must match active run
            if self._active_run_id != run_id:
                logger.warning(
                    "Run ID validation failed: expected %s, got %s on machine %s",
                    self._active_run_id, run_id, self.machine_id
                )
                return False

            return True

    def get_active_run_id(self) -> Optional[str]:
        """
        Get the current active run_id.

        Reads the value without taking the lock, so the result is a snapshot
        that may be stale by the time the caller acts on it.

        Returns:
            Active run_id if one exists, None otherwise
        """
        return self._active_run_id
|
|
112
|
+
|