PyPI - puda-comms - Versions diffs - 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl - Mend

puda-comms 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (10) hide show

puda_comms/__init__.py +5 -1
puda_comms/command_service.py +261 -85
puda_comms/machine_client.py +215 -88
puda_comms/models.py +7 -1
puda_comms/run_manager.py +112 -0
puda_comms/stream_subscriber.py +388 -0
{puda_comms-0.0.4.dist-info → puda_comms-0.0.6.dist-info}/METADATA +12 -13
puda_comms-0.0.6.dist-info/RECORD +10 -0
puda_comms-0.0.4.dist-info/RECORD +0 -8
{puda_comms-0.0.4.dist-info → puda_comms-0.0.6.dist-info}/WHEEL +0 -0

puda_comms/machine_client.py CHANGED Viewed

@@ -10,7 +10,7 @@ import logging
 from typing import Dict, Any, Optional, Callable, Awaitable
 from datetime import datetime, timezone
 import nats
-from puda_comms.models import (
+from .models import (
     CommandResponseStatus,
     CommandResponse,
     CommandResponseCode,
@@ -19,9 +19,10 @@ from puda_comms.models import (
     MessageType,
     ImmediateCommand,
 )
+from .run_manager import RunManager
 from nats.js.client import JetStreamContext
 from nats.js.api import StreamConfig, ConsumerConfig
-from nats.js.errors import NotFoundError
+from nats.js.errors import NotFoundError, Error as NATSError
 from nats.aio.msg import Msg
 logger = logging.getLogger(__name__)
@@ -80,7 +81,9 @@ class MachineClient:
         # Queue control state
         self._pause_lock = asyncio.Lock()
         self._is_paused = False
-        self._cancelled_run_ids = set()
+        # Run state management
+        self.run_manager = RunManager(machine_id=machine_id)
     def _init_subjects(self):
         """Initialize all subject and stream names."""
@@ -423,7 +426,7 @@ class MachineClient:
             logger.error("Error publishing command response: %s", e)
     async def process_queue_cmd(
-        self,
+        self,
         msg: Msg,
         handler: Callable[[NATSMessage], Awaitable[CommandResponse]]
     ) -> None:
@@ -432,32 +435,26 @@ class MachineClient:
         Args:
             msg: NATS message
-            handler: Handler function that processes the message and returns CommandResponse
+            handler: Handler function that processes the message and returns a CommandResponse object
         """
+        # Initialize variables for exception handlers
+        run_id = None
+        step_number = None
+        command = None
         try:
             # Parse message
             message = NATSMessage.model_validate_json(msg.data)
             run_id = message.header.run_id
-            step_number = message.command.step_number
-            command = message.command.name
+            step_number = message.command.step_number if message.command else None
+            command = message.command.name if message.command else None
-            # Check if cancelled
-            if run_id and run_id in self._cancelled_run_ids:
-                logger.info("Skipping cancelled command: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
-                await msg.ack()
-                await self._publish_command_response(
-                    msg=msg,
-                    response=CommandResponse(
-                        status=CommandResponseStatus.ERROR,
-                        code=CommandResponseCode.COMMAND_CANCELLED,
-                        message='Command cancelled'
-                    ),
-                    subject=self.response_queue
-                )
-                # Note: Final state update should be published by the handler with machine-specific data
-                return
+            # For all commands, continue with normal processing:
+            # 1. Check if paused
+            # 2. Validate run_id matches active run
+            # 3. Execute handler
-            # Check if paused (for queue messages)
+            # If machine is paused, publish error response and return
             async with self._pause_lock:
                 if self._is_paused:
                     await self._publish_command_response(
@@ -470,24 +467,57 @@ class MachineClient:
                         subject=self.response_queue
                     )
                     return
-                while self._is_paused:
-                    await msg.in_progress()
-                    await asyncio.sleep(1)
-                    # Re-check cancelled state in case it was cancelled while paused
-                    if run_id and run_id in self._cancelled_run_ids:
-                        logger.info("Command cancelled while paused: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
-                        await msg.ack()
-                        await self._publish_command_response(
-                            msg=msg,
-                            response=CommandResponse(
-                                status=CommandResponseStatus.ERROR,
-                                code=CommandResponseCode.COMMAND_CANCELLED,
-                                message='Command cancelled'
-                            ),
-                            subject=self.response_queue
-                        )
-                        # Note: Final state update should be published by the handler with machine-specific data
-                        return
+            # Wait while paused (release lock during wait so RESUME can acquire it)
+            while True:
+                async with self._pause_lock:
+                    if not self._is_paused:
+                        break
+                # Release lock before sleeping so RESUME can set _is_paused = False
+                await msg.in_progress()
+                await asyncio.sleep(1)
+            # Validate run_id matches active run (run_id is required)
+            if run_id is None:
+                await msg.ack()
+                await self._publish_command_response(
+                    msg=msg,
+                    response=CommandResponse(
+                        status=CommandResponseStatus.ERROR,
+                        code=CommandResponseCode.EXECUTION_ERROR,
+                        message='Command requires run_id'
+                    ),
+                    subject=self.response_queue
+                )
+                return
+            # If active run_id is None, return error response
+            if self.run_manager.get_active_run_id() is None:
+                await msg.ack()
+                await self._publish_command_response(
+                    msg=msg,
+                    response=CommandResponse(
+                        status=CommandResponseStatus.ERROR,
+                        code=CommandResponseCode.RUN_ID_MISMATCH,
+                        message='Send START command to start a run before sending commands'
+                    ),
+                    subject=self.response_queue
+                )
+                return
+            # If run_id does not match active run_id, return error response
+            if not await self.run_manager.validate_run_id(run_id):
+                await msg.ack()
+                await self._publish_command_response(
+                    msg=msg,
+                    response=CommandResponse(
+                        status=CommandResponseStatus.ERROR,
+                        code=CommandResponseCode.RUN_ID_MISMATCH,
+                        message=f'Run ID mismatch: expected active run, got {run_id}'
+                    ),
+                    subject=self.response_queue
+                )
+                return
             # Execute handler with auto-heartbeat (task might take a while for machine to complete)
             # The handler should be defined in the machine-specific edge module.
@@ -497,7 +527,9 @@ class MachineClient:
             # Finalize message state based on response
             if response.status == CommandResponseStatus.SUCCESS:
                 await msg.ack()
-            else:
+            elif response.status == CommandResponseStatus.ERROR:
+                # just complete the run if the command failed
+                await self.run_manager.complete_run(run_id)
                 await msg.term()
             await self._publish_command_response(
@@ -511,6 +543,7 @@ class MachineClient:
             # Handler was cancelled (e.g., via task cancellation)
             logger.info("Handler execution cancelled: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
             await msg.ack()
+            await self.run_manager.complete_run(run_id)
             await self._publish_command_response(
                 msg=msg,
                 response=CommandResponse(
@@ -525,6 +558,7 @@ class MachineClient:
         except json.JSONDecodeError as e:
             logger.error("JSON Decode Error. Terminating message.")
             await msg.term()
+            await self.run_manager.complete_run(run_id)
             await self._publish_command_response(
                 msg=msg,
                 response=CommandResponse(
@@ -539,34 +573,20 @@ class MachineClient:
             # This is a rare case - consider if handler should be called with None payload
         except Exception as e:
-            # Check if cancelled before sending error response
-            if run_id and run_id in self._cancelled_run_ids:
-                logger.info("Command cancelled during execution (exception occurred): run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
-                await msg.ack()
-                await self._publish_command_response(
-                    msg=msg,
-                    response=CommandResponse(
-                        status=CommandResponseStatus.ERROR,
-                        code=CommandResponseCode.COMMAND_CANCELLED,
-                        message='Command cancelled'
-                    ),
-                    subject=self.response_queue
-                )
-                # Note: Final state update should be published by the handler with machine-specific data
-            else:
-                # Terminate all errors to prevent infinite redelivery loops
-                logger.error("Handler failed (terminating message): %s", e)
-                await msg.term()
-                await self._publish_command_response(
-                    msg=msg,
-                    response=CommandResponse(
-                        status=CommandResponseStatus.ERROR,
-                        code=CommandResponseCode.EXECUTION_ERROR,
-                        message=str(e)
-                    ),
-                    subject=self.response_queue
-                )
-                # Note: Final state update should be published by the handler with machine-specific data
+            # Terminate all errors to prevent infinite redelivery loops
+            logger.error("Handler failed (terminating message): %s", e)
+            await msg.term()
+            await self.run_manager.complete_run(run_id)
+            await self._publish_command_response(
+                msg=msg,
+                response=CommandResponse(
+                    status=CommandResponseStatus.ERROR,
+                    code=CommandResponseCode.EXECUTION_ERROR,
+                    message=str(e)
+                ),
+                subject=self.response_queue
+            )
+            # Note: Final state update should be published by the handler with machine-specific data
     async def process_immediate_cmd(self, msg: Msg, handler: Callable[[CommandRequest], Awaitable[CommandResponse]]) -> None:
         """Process immediate commands (pause, cancel, resume, etc.)."""
@@ -581,8 +601,49 @@ class MachineClient:
                 return
             command_name = message.command.name.lower()
+            run_id = message.header.run_id
+            response: CommandResponse
             match command_name:
+                case ImmediateCommand.START:
+                    if run_id:
+                        success = await self.run_manager.start_run(run_id)
+                        if not success:
+                            # Run already active
+                            response = CommandResponse(
+                                status=CommandResponseStatus.ERROR,
+                                code=CommandResponseCode.RUN_ID_MISMATCH,
+                                message=f'cannot start, {self.run_manager.get_active_run_id()} is currently running'
+                            )
+                        else:
+                            await self.publish_state({'state': 'active', 'run_id': run_id})
+                            response = CommandResponse(status=CommandResponseStatus.SUCCESS)
+                    else:
+                        response = CommandResponse(
+                            status=CommandResponseStatus.ERROR,
+                            code=CommandResponseCode.MISSING_RUN_ID,
+                            message='START command requires RUN_ID'
+                        )
+                case ImmediateCommand.COMPLETE:
+                    if not run_id:
+                        response = CommandResponse(
+                            status=CommandResponseStatus.ERROR,
+                            code=CommandResponseCode.MISSING_RUN_ID,
+                            message='COMPLETE command requires RUN_ID'
+                        )
+                    else:
+                        success = await self.run_manager.complete_run(run_id)
+                        if success:
+                            await self.publish_state({'state': 'idle', 'run_id': None})
+                            response = CommandResponse(status=CommandResponseStatus.SUCCESS)
+                        else:
+                            response = CommandResponse(
+                                status=CommandResponseStatus.ERROR,
+                                code=CommandResponseCode.RUN_ID_MISMATCH,
+                                message=f'Run {run_id} not active'
+                            )
                 case ImmediateCommand.PAUSE:
                     async with self._pause_lock:
                         if not self._is_paused:
@@ -590,7 +651,7 @@ class MachineClient:
                             logger.info("Queue paused")
                             await self.publish_state({'state': 'paused', 'run_id': message.header.run_id})
                     # Call handler and use its response
-                    response: CommandResponse = await handler(message)
+                    response = await handler(message)
                 case ImmediateCommand.RESUME:
                     async with self._pause_lock:
@@ -599,19 +660,30 @@ class MachineClient:
                             logger.info("Queue resumed")
                             await self.publish_state({'state': 'idle', 'run_id': None})
                     # Call handler and use its response
-                    response: CommandResponse = await handler(message)
+                    response = await handler(message)
                 case ImmediateCommand.CANCEL:
-                    if message.header.run_id:
-                        self._cancelled_run_ids.add(message.header.run_id)
-                        logger.info("Cancelling all commands with run_id: %s", message.header.run_id)
+                    if not run_id:
+                        response = CommandResponse(
+                            status=CommandResponseStatus.ERROR,
+                            code=CommandResponseCode.MISSING_RUN_ID,
+                            message='CANCEL command requires RUN_ID'
+                        )
+                    else:
+                        logger.info("Cancelling all commands with run_id: %s", run_id)
+                        # Clear the active run_id when cancelling (try to complete, but clear anyway)
+                        await self.run_manager.complete_run(run_id)
                         await self.publish_state({'state': 'idle', 'run_id': None})
-                    # Call handler and use its response
-                    response: CommandResponse = await handler(message)
+                        # Call handler and use its response
+                        response = await handler(message)
                 case _:
-                    # For other immediate commands, call the user-provided handler
-                    response: CommandResponse = await handler(message)
+                    # Unknown immediate command
+                    response = CommandResponse(
+                        status=CommandResponseStatus.ERROR,
+                        code=CommandResponseCode.UNKNOWN_COMMAND,
+                        message=f'Unknown immediate command: {command_name}'
+                    )
             await self._publish_command_response(
                 msg=msg,
@@ -702,6 +774,9 @@ class MachineClient:
         if not self.js:
             logger.error("JetStream not available for queue subscription")
             return
+        # Store handler for reconnection
+        self._queue_handler = handler
         # Ensure stream exists before attempting to subscribe
         await self._ensure_all_streams()
@@ -744,12 +819,11 @@ class MachineClient:
                 try:
                     while True:
                         try:
-                            # Fetch messages (batch of 1, timeout 1 second)
+                            # Fetch one message (timeout 1 second)
                             msgs = await self._cmd_queue_sub.fetch(batch=1, timeout=1.0)
                             if msgs:
-                                logger.debug("Pulled %d message(s) from queue", len(msgs))
-                            for msg in msgs:
-                                await self.process_queue_cmd(msg, handler)
+                                logger.debug("Pulled message from queue")
+                                await self.process_queue_cmd(msgs[0], handler)
                         except asyncio.TimeoutError:
                             # Timeout is expected when no messages are available
                             continue
@@ -780,8 +854,6 @@ class MachineClient:
                 logger.error("  Stream verification failed: %s", stream_check_error)
             raise
-        # Store handler for reconnection
-        self._queue_handler = handler
         logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s, pull consumer)",
                    self.cmd_queue, self.machine_id, self.STREAM_COMMAND_QUEUE)
@@ -810,21 +882,76 @@ class MachineClient:
             retention='workqueue'
         )
+        durable_name = f"cmd_immed_{self.machine_id}"
+        # Try to unsubscribe from existing subscription if it exists
+        if self._cmd_immediate_sub:
+            try:
+                await self._cmd_immediate_sub.unsubscribe()
+                logger.info("Unsubscribed from existing immediate command subscription")
+            except Exception as e:
+                logger.debug("Error unsubscribing from existing subscription: %s", e)
+            self._cmd_immediate_sub = None
+        # Try to delete existing consumer if it's bound (from previous run)
+        try:
+            await self.js.delete_consumer(self.STREAM_COMMAND_IMMEDIATE, durable_name)
+            logger.info("Deleted existing immediate consumer: %s", durable_name)
+        except NotFoundError:
+            # Consumer doesn't exist, which is fine
+            logger.debug("Consumer %s does not exist, will be created", durable_name)
+        except Exception as e:
+            error_msg = str(e).lower()
+            if "bound" in error_msg or "in use" in error_msg:
+                # Consumer is bound but we can't delete it - try to unsubscribe first
+                logger.warning("Consumer %s is bound to a subscription. Attempting to force delete...", durable_name)
+                # Wait a moment for any pending operations to complete
+                await asyncio.sleep(0.5)
+                try:
+                    await self.js.delete_consumer(self.STREAM_COMMAND_IMMEDIATE, durable_name)
+                    logger.info("Successfully deleted bound consumer: %s", durable_name)
+                except Exception as delete_error:
+                    logger.warning("Could not delete bound consumer %s: %s. Will attempt to subscribe anyway.",
+                                 durable_name, delete_error)
+            else:
+                logger.warning("Error checking/deleting consumer %s: %s", durable_name, e)
         try:
             self._cmd_immediate_sub = await self.js.subscribe(
                 subject=self.cmd_immediate,
                 stream=self.STREAM_COMMAND_IMMEDIATE,
-                durable=f"cmd_immed_{self.machine_id}",
+                durable=durable_name,
                 cb=message_handler  # required for push consumer to handle messages
             )
+        except NATSError as e:
+            error_msg = str(e).lower()
+            if "bound" in error_msg or "already bound" in error_msg:
+                # Consumer is still bound - try to delete it and retry
+                logger.warning("Consumer %s is still bound. Attempting to delete and retry...", durable_name)
+                try:
+                    await self.js.delete_consumer(self.STREAM_COMMAND_IMMEDIATE, durable_name)
+                    await asyncio.sleep(0.5)  # Brief wait for cleanup
+                    # Retry subscription
+                    self._cmd_immediate_sub = await self.js.subscribe(
+                        subject=self.cmd_immediate,
+                        stream=self.STREAM_COMMAND_IMMEDIATE,
+                        durable=durable_name,
+                        cb=message_handler
+                    )
+                    logger.info("Successfully subscribed after deleting bound consumer")
+                except Exception as retry_error:
+                    logger.error("Failed to subscribe after deleting bound consumer: %s", retry_error)
+                    raise
+            else:
+                raise
         except NotFoundError:
             # Stream still not found after ensuring it exists - this shouldn't happen
             # but handle it gracefully
-            logger.error("Stream %s not found even after creation attempt. Check NATS server configuration.",
+            logger.error("Stream %s not found even after creation attempt. Check NATS server configuration.",
                        self.STREAM_COMMAND_IMMEDIATE)
             raise
-        logger.info("Subscribed to immediate commands: %s (durable: cmd_immed_%s, stream: %s)",
+        logger.info("Subscribed to immediate commands: %s (durable: cmd_immed_%s, stream: %s)",
                    self.cmd_immediate, self.machine_id, self.STREAM_COMMAND_IMMEDIATE)

puda_comms/models.py CHANGED Viewed

@@ -25,6 +25,7 @@ class CommandResponseCode(str, Enum):
     RESUME_ERROR = 'RESUME_ERROR'
     NO_EXECUTION = 'NO_EXECUTION'
     RUN_ID_MISMATCH = 'RUN_ID_MISMATCH'
+    MISSING_RUN_ID = 'MISSING_RUN_ID'
     CANCEL_ERROR = 'CANCEL_ERROR'
     MACHINE_PAUSED = 'MACHINE_PAUSED'
@@ -40,6 +41,8 @@ class MessageType(str, Enum):
 class ImmediateCommand(str, Enum):
     """Command names for immediate commands."""
+    START = 'start'
+    COMPLETE = 'complete'
     PAUSE = 'pause'
     RESUME = 'resume'
     CANCEL = 'cancel'
@@ -54,8 +57,10 @@ class CommandRequest(BaseModel):
     """Command request data for NATS messages."""
     name: str = Field(description="The command name (string) to send to the machine.")
     params: Dict[str, Any] = Field(default_factory=dict, description="The parameters to send to the machine.")
+    kwargs: Dict[str, Any] = Field(default_factory=dict, description="Additional keyword arguments (e.g., channels in Biologic).")
     step_number: int = Field(description="Execution step number (integer). Used to track the progress of a command.")
     version: str = Field(default="1.0", description="Command version.")
+    machine_id: str = Field(description="Machine ID to send the command to.")
 class CommandResponse(BaseModel):
@@ -64,7 +69,7 @@ class CommandResponse(BaseModel):
     completed_at: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
     code: Optional[CommandResponseCode] = Field(default=None, description="Error code")
     message: Optional[str] = Field(default=None, description="Error message (human-readable description)")
-    data: Optional[Dict[str, Any]] = Field(default=None, description="Optional output data from the command handler")
+    data: Optional[Dict[Any, Any]] = Field(default=None, description="Optional output data from the command handler")
 class MessageHeader(BaseModel):
     """Header for NATS messages."""
@@ -75,6 +80,7 @@ class MessageHeader(BaseModel):
     machine_id: str = Field(description="Machine ID")
     run_id: Optional[str] = Field(default=None, description="Unique identifier (uuid) for the run/workflow")
     timestamp: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
 class NATSMessage(BaseModel):
     """
     Complete NATS message structure.

puda_comms/run_manager.py ADDED Viewed

@@ -0,0 +1,112 @@
+"""
+Run State Management
+Provides thread-safe run state tracking and validation for machine commands.
+"""
+import asyncio
+import logging
+from typing import Optional
+logger = logging.getLogger(__name__)
+class RunManager:
+    """
+    Tracks the single active run on a machine.
+    Holds at most one active run_id and checks that commands belong to it.
+    State changes are serialized with an asyncio.Lock, so the coroutine methods
+    are safe to call concurrently from tasks on one event loop. asyncio locks
+    are not thread-safe, so an instance must not be shared between threads.
+    """
+    def __init__(self, machine_id: str):
+        """
+        Initialize RunManager for a machine.
+        Args:
+            machine_id: Machine identifier (stored and included in log messages)
+        """
+        self.machine_id = machine_id
+        # None means "no run is active".
+        self._active_run_id: Optional[str] = None
+        self._lock = asyncio.Lock()
+    async def start_run(self, run_id: str) -> bool:
+        """
+        Set the active run_id if no run is currently active.
+        Args:
+            run_id: Run ID to set as active
+        Returns:
+            True if the run was started, False if another run is already active
+        """
+        async with self._lock:
+            if self._active_run_id is not None:
+                logger.warning(
+                    "Cannot start run %s: run %s is already active on machine %s",
+                    run_id, self._active_run_id, self.machine_id
+                )
+                return False
+            self._active_run_id = run_id
+            logger.info("Started run %s on machine %s", run_id, self.machine_id)
+            return True
+    async def complete_run(self, run_id: Optional[str]) -> bool:
+        """
+        Clear the active run_id if run_id matches it.
+        Args:
+            run_id: Run ID to complete. None never matches and is rejected.
+        Returns:
+            True if the active run was cleared, False if run_id is None or does
+            not match the active run (including when no run is active)
+        """
+        if run_id is None:
+            # Callers may pass None when no run_id is known. Without this guard
+            # None == None would report a run that never existed as completed.
+            logger.debug(
+                "No run_id given, nothing to complete on machine %s",
+                self.machine_id
+            )
+            return False
+        async with self._lock:
+            if self._active_run_id != run_id:
+                logger.warning(
+                    "Cannot complete run %s: active run is %s on machine %s",
+                    run_id, self._active_run_id, self.machine_id
+                )
+                return False
+            self._active_run_id = None
+            logger.info("Completed run %s on machine %s", run_id, self.machine_id)
+            return True
+    async def validate_run_id(self, run_id: str) -> bool:
+        """
+        Check whether run_id matches the active run.
+        Args:
+            run_id: Run ID to validate (required)
+        Returns:
+            True if run_id matches the active run, False if no run is active or
+            the IDs differ
+        """
+        async with self._lock:
+            # With no active run, every run_id is invalid.
+            if self._active_run_id is None:
+                logger.warning(
+                    "Run ID validation failed: no active run, got %s on machine %s",
+                    run_id, self.machine_id
+                )
+                return False
+            # Otherwise the run_id must equal the active one.
+            if self._active_run_id != run_id:
+                logger.warning(
+                    "Run ID validation failed: expected %s, got %s on machine %s",
+                    self._active_run_id, run_id, self.machine_id
+                )
+                return False
+            return True
+    def get_active_run_id(self) -> Optional[str]:
+        """
+        Get the current active run_id.
+        Reads the value without taking the lock.
+        Returns:
+            Active run_id if one exists, None otherwise
+        """
+        return self._active_run_id

puda-comms 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

puda-comms 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl