PyPI - puda-comms - Versions diffs - 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl - Mend

puda-comms 0.0.3 (py3-none-any.whl) → 0.0.5 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

puda_comms/command_service.py +233 -86
puda_comms/machine_client.py +269 -137
puda_comms/models.py +8 -4
puda_comms/run_manager.py +112 -0
{puda_comms-0.0.3.dist-info → puda_comms-0.0.5.dist-info}/METADATA +14 -2
puda_comms-0.0.5.dist-info/RECORD +9 -0
puda_comms-0.0.3.dist-info/RECORD +0 -8
{puda_comms-0.0.3.dist-info → puda_comms-0.0.5.dist-info}/WHEEL +0 -0

puda_comms/command_service.py CHANGED Viewed

@@ -12,11 +12,17 @@ import json
 import logging
 import signal
 from datetime import datetime, timezone
-from typing import Dict, Any, Optional, Tuple
+from typing import Dict, Any, Optional
 import nats
 from nats.js.client import JetStreamContext
 from nats.aio.msg import Msg
-from puda_comms.models import CommandRequest, CommandResponse, CommandResponseStatus, NATSMessage, MessageHeader, MessageType
+from puda_comms.models import (
+    CommandRequest,
+    CommandResponseStatus,
+    NATSMessage,
+    MessageHeader,
+    MessageType,
+)
 logger = logging.getLogger(__name__)
@@ -37,7 +43,7 @@ class ResponseHandler:
     def __init__(self, js: JetStreamContext, machine_id: str):
         self.js = js
         self.machine_id = machine_id
-        self._pending_responses: Dict[str, Tuple[asyncio.Event, CommandResponse]] = {}
+        self._pending_responses: Dict[str, Dict[str, Any]] = {}  # {'event': asyncio.Event, 'response': Optional[NATSMessage]}
         self._queue_consumer = None
         self._immediate_consumer = None
         self._initialized = False
@@ -102,8 +108,8 @@ class ResponseHandler:
                 # Get the pending response
                 pending = self._pending_responses[key]
-                # Store the full NATSMessage JSON structure
-                pending['response'] = message.model_dump()
+                # Store the NATSMessage directly
+                pending['response'] = message
                 # Signal that response was received
                 # Don't delete here - let get_response() delete it after retrieval
                 pending['event'].set()
@@ -152,7 +158,7 @@ class ResponseHandler:
         }
         return event
-    def get_response(self, run_id: str, step_number: int) -> Optional[Dict[str, Any]]:
+    def get_response(self, run_id: str, step_number: int) -> Optional[NATSMessage]:
         """
         Get the response for a pending command.
@@ -161,7 +167,7 @@ class ResponseHandler:
             step_number: Step number for the command
         Returns:
-            The NATSMessage dict structure if available, None otherwise
+            The NATSMessage if available, None otherwise
         """
         key = f"{run_id}:{str(step_number)}"
         if key in self._pending_responses:
@@ -266,9 +272,9 @@ class CommandService:
         max_attempts = 3
         connect_timeout = 3  # 3 seconds timeout per connection attempt
-        for attempt in range(1, max_attempts + 1):
+        for attempt in range(max_attempts):
             try:
-                logger.info("Connection attempt %d/%d to NATS servers: %s", attempt, max_attempts, self.servers)
+                logger.info("Connection attempt %d/%d to NATS servers: %s", attempt + 1, max_attempts, self.servers)
                 self.nc = await asyncio.wait_for(
                     nats.connect(
                         servers=self.servers,
@@ -285,14 +291,14 @@ class CommandService:
                 return True
             except asyncio.TimeoutError:
-                logger.warning("Connection attempt %d/%d timed out after %d seconds", attempt, max_attempts, connect_timeout)
-                if attempt < max_attempts:
+                logger.warning("Connection attempt %d/%d timed out after %d seconds", attempt + 1, max_attempts, connect_timeout)
+                if attempt < max_attempts - 1:
                     logger.info("Retrying connection...")
                 else:
                     logger.error("Failed to connect after %d attempts. Giving up.", max_attempts)
             except Exception as e:
-                logger.warning("Connection attempt %d/%d failed: %s", attempt, max_attempts, e)
-                if attempt < max_attempts:
+                logger.warning("Connection attempt %d/%d failed: %s", attempt + 1, max_attempts, e)
+                if attempt < max_attempts - 1:
                     logger.info("Retrying connection...")
                 else:
                     logger.error("Failed to connect after %d attempts. Giving up.", max_attempts)
@@ -343,6 +349,8 @@ class CommandService:
         request: CommandRequest,
         machine_id: str,
         run_id: str,
+        user_id: str,
+        username: str,
         timeout: int = 120
     ) -> Optional[NATSMessage]:
         """
@@ -352,6 +360,8 @@ class CommandService:
             request: CommandRequest model containing command details
             machine_id: Machine ID to send the command to
             run_id: Run ID for the command
+            user_id: User ID who initiated the command
+            username: Username who initiated the command
             timeout: Maximum time to wait for response in seconds
         Returns:
@@ -364,8 +374,8 @@ class CommandService:
         subject = f"{NAMESPACE}.{machine_id}.cmd.queue"
         logger.info(
-            "Sending queue command: machine_id=%s, command=%s, run_id=%s, step_number=%s",
-            machine_id, request.name, run_id, request.step_number
+            "Sending queue command: subject=%s, command=%s, run_id=%s, step_number=%s",
+            subject, request.name, run_id, request.step_number
         )
         # Get or create response handler for this machine
@@ -374,7 +384,7 @@ class CommandService:
         response_event = response_handler.register_pending(run_id, request.step_number)
         # Build payload
-        payload = self._build_command_payload(request, machine_id, run_id)
+        payload = self._build_command_payload(request, machine_id, run_id, user_id, username)
         try:
             # Publish to JetStream
@@ -397,36 +407,107 @@ class CommandService:
             await asyncio.sleep(0.1)
             # Get the response
-            response_data = response_handler.get_response(run_id, request.step_number)
-            if response_data is None:
-                return None
-            return NATSMessage.model_validate(response_data)
+            return response_handler.get_response(run_id, request.step_number)
         except Exception as e:
             logger.error("Error sending queue command: %s", e)
             response_handler.remove_pending(run_id, request.step_number)
             return None
+    async def start_run(
+        self,
+        machine_id: str,
+        run_id: str,
+        user_id: str,
+        username: str,
+        timeout: int = 120
+    ) -> Optional[NATSMessage]:
+        """
+        Send START immediate command to begin a run.
+        Args:
+            machine_id: Machine ID to send the command to
+            run_id: Run ID for the command
+            user_id: User ID who initiated the command
+            username: Username who initiated the command
+            timeout: Maximum time to wait for response in seconds
+        Returns:
+            NATSMessage if successful, None if failed or timeout
+        """
+        request = CommandRequest(
+            name="start",
+            params={},
+            step_number=0
+        )
+        return await self.send_immediate_command(
+            request=request,
+            machine_id=machine_id,
+            run_id=run_id,
+            user_id=user_id,
+            username=username,
+            timeout=timeout
+        )
+    async def complete_run(
+        self,
+        machine_id: str,
+        run_id: str,
+        user_id: str,
+        username: str,
+        timeout: int = 120
+    ) -> Optional[NATSMessage]:
+        """
+        Send COMPLETE immediate command to end a run.
+        Args:
+            machine_id: Machine ID to send the command to
+            run_id: Run ID for the command
+            user_id: User ID who initiated the command
+            username: Username who initiated the command
+            timeout: Maximum time to wait for response in seconds
+        Returns:
+            NATSMessage if successful, None if failed or timeout
+        """
+        request = CommandRequest(
+            name="complete",
+            params={},
+            step_number=0
+        )
+        return await self.send_immediate_command(
+            request=request,
+            machine_id=machine_id,
+            run_id=run_id,
+            user_id=user_id,
+            username=username,
+            timeout=timeout
+        )
     async def send_queue_commands(
         self,
         *,
         requests: list[CommandRequest],
         machine_id: str,
         run_id: str,
+        user_id: str,
+        username: str,
         timeout: int = 120
     ) -> Optional[NATSMessage]:
         """
         Send multiple queue commands sequentially and wait for responses.
-        Sends commands one by one, waiting for each response before sending the next.
-        If any command fails or times out, stops immediately and returns the error response.
-        If all commands succeed, returns the last command's response.
+        Automatically sends START command before the sequence and COMPLETE command after
+        successful completion. Sends commands one by one, waiting for each response before
+        sending the next. If any command fails or times out, stops immediately and returns
+        the error response. If all commands succeed, returns the last command's response.
         Args:
             requests: List of CommandRequest models to send sequentially
             machine_id: Machine ID to send the commands to
             run_id: Run ID for all commands
+            user_id: User ID who initiated the commands
+            username: Username who initiated the commands
             timeout: Maximum time to wait for each response in seconds
         Returns:
@@ -447,74 +528,131 @@ class CommandService:
             run_id
         )
+        # Always send START command before sequence
+        logger.info("Sending START command before sequence")
+        start_response = await self.start_run(
+            machine_id=machine_id,
+            run_id=run_id,
+            user_id=user_id,
+            username=username,
+            timeout=timeout
+        )
+        if start_response is None:
+            logger.error("START command timed out")
+            return None
+        if start_response.response and start_response.response.status == CommandResponseStatus.ERROR:
+            logger.error("START command failed: %s", start_response.response.message)
+            return start_response
         last_response: Optional[NATSMessage] = None
-        for idx, request in enumerate(requests, start=1):
-            logger.info(
-                "Sending command %d/%d: %s (step %s)",
-                idx,
-                len(requests),
-                request.name,
-                request.step_number
-            )
-            response = await self.send_queue_command(
-                request=request,
-                machine_id=machine_id,
-                run_id=run_id,
-                timeout=timeout
-            )
-            # Check if command failed (None means timeout or exception)
-            if response is None:
-                logger.error(
-                    "Command %d/%d failed or timed out: %s (step %s)",
+        try:
+            for idx, request in enumerate(requests, start=1):
+                # Validate request - convert dict to CommandRequest if needed
+                if isinstance(request, dict):
+                    request = CommandRequest.model_validate(request)
+                elif not isinstance(request, CommandRequest):
+                    raise ValueError(f"Request {idx} must be a CommandRequest or dict, got {type(request)}")
+                logger.info(
+                    "Sending command %d/%d: %s (step %s)",
                     idx,
                     len(requests),
                     request.name,
                     request.step_number
                 )
-                return None
-            # Check if command returned an error status
-            if response.response is not None:
-                if response.response.status == CommandResponseStatus.ERROR:
+                response = await self.send_queue_command(
+                    request=request,
+                    machine_id=machine_id,
+                    run_id=run_id,
+                    user_id=user_id,
+                    username=username,
+                    timeout=timeout
+                )
+                # Check if command failed (None means timeout or exception)
+                if response is None:
                     logger.error(
-                        "Command %d/%d failed with error: %s (step %s) - code: %s, message: %s",
+                        "Command %d/%d failed or timed out: %s (step %s)",
+                        idx,
+                        len(requests),
+                        request.name,
+                        request.step_number
+                    )
+                    return None
+                # Check if command returned an error status
+                if response.response is not None:
+                    if response.response.status == CommandResponseStatus.ERROR:
+                        logger.error(
+                            "Command %d/%d failed with error: %s (step %s) - code: %s, message: %s",
+                            idx,
+                            len(requests),
+                            request.name,
+                            request.step_number,
+                            response.response.code,
+                            response.response.message
+                        )
+                        return response
+                    # Command succeeded, store as last response
+                    last_response = response
+                    logger.info(
+                        "Command %d/%d succeeded: %s (step %s)",
                         idx,
                         len(requests),
                         request.name,
-                        request.step_number,
-                        response.response.code,
-                        response.response.message
+                        request.step_number
+                    )
+                else:
+                    # Response exists but has no response data (shouldn't happen, but handle it)
+                    logger.warning(
+                        "Command %d/%d returned response with no response data: %s (step %s)",
+                        idx,
+                        len(requests),
+                        request.name,
+                        request.step_number
                     )
                     return response
-                # Command succeeded, store as last response
-                last_response = response
-                logger.info(
-                    "Command %d/%d succeeded: %s (step %s)",
-                    idx,
-                    len(requests),
-                    request.name,
-                    request.step_number
-                )
-            else:
-                # Response exists but has no response data (shouldn't happen, but handle it)
-                logger.warning(
-                    "Command %d/%d returned response with no response data: %s (step %s)",
-                    idx,
-                    len(requests),
-                    request.name,
-                    request.step_number
+            logger.info(
+                "All %d commands completed successfully",
+                len(requests)
+            )
+            # Always send COMPLETE command after successful sequence
+            logger.info("Sending COMPLETE command after successful sequence")
+            complete_response = await self.complete_run(
+                machine_id=machine_id,
+                run_id=run_id,
+                user_id=user_id,
+                username=username,
+                timeout=timeout
+            )
+            if complete_response is None:
+                logger.error("COMPLETE command timed out")
+                return None
+            if complete_response.response and complete_response.response.status == CommandResponseStatus.ERROR:
+                logger.error("COMPLETE command failed: %s", complete_response.response.message)
+                return complete_response
+            # Return the last command response, not the COMPLETE response
+            return last_response
+        except Exception as e:
+            # If any error occurs during command execution, try to complete the run
+            # to clean up state (but don't fail if this also fails)
+            logger.warning("Error during command sequence, attempting to complete run: %s", e)
+            try:
+                await self.complete_run(
+                    machine_id=machine_id,
+                    run_id=run_id,
+                    user_id=user_id,
+                    username=username,
+                    timeout=timeout
                 )
-                return response
-        logger.info(
-            "All %d commands completed successfully",
-            len(requests)
-        )
-        return last_response
+            except Exception as cleanup_error:
+                logger.error("Failed to complete run during error cleanup: %s", cleanup_error)
+            raise
     async def send_immediate_command(
         self,
@@ -522,6 +660,8 @@ class CommandService:
         request: CommandRequest,
         machine_id: str,
         run_id: str,
+        user_id: str,
+        username: str,
         timeout: int = 120
     ) -> Optional[NATSMessage]:
         """
@@ -531,6 +671,8 @@ class CommandService:
             request: CommandRequest model containing command details
             machine_id: Machine ID to send the command to
             run_id: Run ID for the command
+            user_id: User ID who initiated the command
+            username: Username who initiated the command
             timeout: Maximum time to wait for response in seconds
         Returns:
@@ -555,7 +697,7 @@ class CommandService:
         response_received = response_handler.register_pending(run_id, request.step_number)
         # Build payload
-        payload = self._build_command_payload(request, machine_id, run_id)
+        payload = self._build_command_payload(request, machine_id, run_id, user_id, username)
         try:
             # Publish to JetStream
@@ -578,11 +720,7 @@ class CommandService:
             await asyncio.sleep(0.1)
             # Get the response
-            response_data = response_handler.get_response(run_id, request.step_number)
-            if response_data is None:
-                return None
-            return NATSMessage.model_validate(response_data)
+            return response_handler.get_response(run_id, request.step_number)
         except Exception as e:
             logger.error("Error sending immediate command: %s", e)
@@ -635,7 +773,9 @@ class CommandService:
         self,
         command_request: CommandRequest,
         machine_id: str,
-        run_id: str
+        run_id: str,
+        user_id: str,
+        username: str
     ) -> NATSMessage:
         """
         Build a command payload in the expected format.
@@ -643,17 +783,24 @@ class CommandService:
         Args:
             command_request: CommandRequest model containing command details
             machine_id: Machine ID for the command
-            run_id: Run ID for the command
+            run_id: Run ID for the command (empty string will be converted to None)
+            user_id: User ID who initiated the command
+            username: Username who initiated the command
         Returns:
             NATSMessage object ready for NATS transmission
         """
+        # Convert empty string to None for run_id
+        run_id_value = run_id if run_id else None
         header = MessageHeader(
             message_type=MessageType.COMMAND,
             version="1.0",
             timestamp=datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
+            user_id=user_id,
+            username=username,
             machine_id=machine_id,
-            run_id=run_id
+            run_id=run_id_value
         )
         return NATSMessage(

puda_comms/machine_client.py CHANGED Viewed

@@ -19,8 +19,9 @@ from puda_comms.models import (
     MessageType,
     ImmediateCommand,
 )
+from puda_comms.run_manager import RunManager
 from nats.js.client import JetStreamContext
-from nats.js.api import StreamConfig
+from nats.js.api import StreamConfig, ConsumerConfig
 from nats.js.errors import NotFoundError
 from nats.aio.msg import Msg
@@ -69,16 +70,20 @@ class MachineClient:
         # Default subscriptions
         self._cmd_queue_sub = None
+        self._cmd_queue_task = None  # Background task for pull consumer
         self._cmd_immediate_sub = None
         # Connection state
         self._is_connected = False
-        self._reconnect_handlers = []
+        self._queue_handler = None
+        self._immediate_handler = None
         # Queue control state
         self._pause_lock = asyncio.Lock()
         self._is_paused = False
-        self._cancelled_run_ids = set()
+        # Run state management
+        self.run_manager = RunManager(machine_id=machine_id)
     def _init_subjects(self):
         """Initialize all subject and stream names."""
@@ -184,30 +189,22 @@ class MachineClient:
             logger.error("Error ensuring %s stream: %s", stream_name, e, exc_info=True)
             raise
-    async def _ensure_command_queue_stream(self):
-        """Ensure COMMAND_QUEUE stream exists with WorkQueue retention policy."""
+    async def _ensure_all_streams(self):
+        """Ensure all required streams exist with correct retention policies."""
         await self._ensure_stream(
             self.STREAM_COMMAND_QUEUE,
-            f"{self.NAMESPACE}.*.cmd.queue"
+            f"{self.NAMESPACE}.*.cmd.queue",
+            retention='workqueue'
         )
-    async def _ensure_command_immediate_stream(self):
-        """Ensure COMMAND_IMMEDIATE stream exists with WorkQueue retention policy."""
         await self._ensure_stream(
             self.STREAM_COMMAND_IMMEDIATE,
             f"{self.NAMESPACE}.*.cmd.immediate"
         )
-    async def _ensure_response_queue_stream(self):
-        """Ensure RESPONSE_QUEUE stream exists with Interest retention policy."""
         await self._ensure_stream(
             self.STREAM_RESPONSE_QUEUE,
             f"{self.NAMESPACE}.*.cmd.response.queue",
             retention='interest'
         )
-    async def _ensure_response_immediate_stream(self):
-        """Ensure RESPONSE_IMMEDIATE stream exists with Interest retention policy."""
         await self._ensure_stream(
             self.STREAM_RESPONSE_IMMEDIATE,
             f"{self.NAMESPACE}.*.cmd.response.immediate",
@@ -230,7 +227,17 @@ class MachineClient:
     async def _cleanup_subscriptions(self):
         """Unsubscribe from all subscriptions."""
-        # Clean up subscriptions
+        # Clean up queue subscription (pull consumer)
+        if self._cmd_queue_task:
+            try:
+                self._cmd_queue_task.cancel()
+                await self._cmd_queue_task
+            except asyncio.CancelledError:
+                pass
+            except Exception:
+                pass
+            self._cmd_queue_task = None
         if self._cmd_queue_sub:
             try:
                 await self._cmd_queue_sub.unsubscribe()
@@ -252,6 +259,7 @@ class MachineClient:
         self.kv = None
         # Subscriptions will be recreated on reconnection
         self._cmd_queue_sub = None
+        self._cmd_queue_task = None
         self._cmd_immediate_sub = None
     # ==================== CONNECTION MANAGEMENT ====================
@@ -270,10 +278,7 @@ class MachineClient:
                 closed_cb=self._closed_callback
             )
             self.js = self.nc.jetstream()
-            await self._ensure_command_queue_stream()
-            await self._ensure_command_immediate_stream()
-            await self._ensure_response_queue_stream()
-            await self._ensure_response_immediate_stream()
+            await self._ensure_all_streams()
             self.kv = await self._get_or_create_kv_bucket()
             self._is_connected = True
             logger.info("Connected to NATS servers: %s", self.servers)
@@ -299,32 +304,16 @@ class MachineClient:
         if self.nc:
             self.js = self.nc.jetstream()
-            await self._ensure_command_queue_stream()
-            await self._ensure_command_immediate_stream()
-            await self._ensure_response_queue_stream()
-            await self._ensure_response_immediate_stream()
+            await self._ensure_all_streams()
             self.kv = await self._get_or_create_kv_bucket()
             await self._resubscribe_handlers()
     async def _resubscribe_handlers(self):
         """Re-subscribe to all handlers after reconnection."""
-        subscribe_methods = {
-            'queue': self.subscribe_queue,
-            'immediate': self.subscribe_immediate,
-        }
-        for handler_info in self._reconnect_handlers:
-            try:
-                handler_type = handler_info['type']
-                handler = handler_info['handler']
-                subscribe_method = subscribe_methods.get(handler_type)
-                if subscribe_method:
-                    await subscribe_method(handler)
-                else:
-                    logger.warning("Unknown handler type: %s", handler_type)
-            except Exception as e:
-                logger.error("Failed to re-subscribe %s: %s", handler_type, e)
+        if self._queue_handler:
+            await self.subscribe_queue(self._queue_handler)
+        if self._immediate_handler:
+            await self.subscribe_immediate(self._immediate_handler)
     async def _closed_callback(self):
         """Callback when connection is closed."""
@@ -437,41 +426,35 @@ class MachineClient:
             logger.error("Error publishing command response: %s", e)
     async def process_queue_cmd(
-        self,
+        self,
         msg: Msg,
-        handler: Callable[[CommandRequest], Awaitable[CommandResponse]]
+        handler: Callable[[NATSMessage], Awaitable[CommandResponse]]
     ) -> None:
         """
         Handle the lifecycle of a single message: Parse -> Handle -> Ack/Nak/Term.
         Args:
             msg: NATS message
-            handler: Handler function that processes the message and returns CommandResponse
+            handler: Handler function that processes the message and returns a CommandResponse object
         """
+        # Initialize variables for exception handlers
+        run_id = None
+        step_number = None
+        command = None
         try:
             # Parse message
             message = NATSMessage.model_validate_json(msg.data)
             run_id = message.header.run_id
-            step_number = message.command.step_number
-            command = message.command.name
+            step_number = message.command.step_number if message.command else None
+            command = message.command.name if message.command else None
-            # Check if cancelled
-            if run_id and run_id in self._cancelled_run_ids:
-                logger.info("Skipping cancelled command: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
-                await msg.ack()
-                await self._publish_command_response(
-                    msg=msg,
-                    response=CommandResponse(
-                        status=CommandResponseStatus.ERROR,
-                        code=CommandResponseCode.COMMAND_CANCELLED,
-                        message='Command cancelled'
-                    ),
-                    subject=self.response_queue
-                )
-                # Note: Final state update should be published by the handler with machine-specific data
-                return
+            # For all commands, continue with normal processing:
+            # 1. Check if paused
+            # 2. Validate run_id matches active run
+            # 3. Execute handler
-            # Check if paused (for queue messages)
+            # If machine is paused, publish error response and return
             async with self._pause_lock:
                 if self._is_paused:
                     await self._publish_command_response(
@@ -484,24 +467,42 @@ class MachineClient:
                         subject=self.response_queue
                     )
                     return
-                while self._is_paused:
-                    await msg.in_progress()
-                    await asyncio.sleep(1)
-                    # Re-check cancelled state in case it was cancelled while paused
-                    if run_id and run_id in self._cancelled_run_ids:
-                        logger.info("Command cancelled while paused: run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
-                        await msg.ack()
-                        await self._publish_command_response(
-                            msg=msg,
-                            response=CommandResponse(
-                                status=CommandResponseStatus.ERROR,
-                                code=CommandResponseCode.COMMAND_CANCELLED,
-                                message='Command cancelled'
-                            ),
-                            subject=self.response_queue
-                        )
-                        # Note: Final state update should be published by the handler with machine-specific data
-                        return
+            # Wait while paused (release lock during wait so RESUME can acquire it)
+            while True:
+                async with self._pause_lock:
+                    if not self._is_paused:
+                        break
+                # Release lock before sleeping so RESUME can set _is_paused = False
+                await msg.in_progress()
+                await asyncio.sleep(1)
+            # Validate run_id matches active run (run_id is required)
+            if run_id is None:
+                await msg.ack()
+                await self._publish_command_response(
+                    msg=msg,
+                    response=CommandResponse(
+                        status=CommandResponseStatus.ERROR,
+                        code=CommandResponseCode.EXECUTION_ERROR,
+                        message='Command requires run_id'
+                    ),
+                    subject=self.response_queue
+                )
+                return
+            if not await self.run_manager.validate_run_id(run_id):
+                await msg.ack()
+                await self._publish_command_response(
+                    msg=msg,
+                    response=CommandResponse(
+                        status=CommandResponseStatus.ERROR,
+                        code=CommandResponseCode.RUN_ID_MISMATCH,
+                        message=f'Run ID mismatch: expected active run, got {run_id}'
+                    ),
+                    subject=self.response_queue
+                )
+                return
             # Execute handler with auto-heartbeat (task might take a while for machine to complete)
             # The handler should be defined in the machine-specific edge module.
@@ -553,34 +554,19 @@ class MachineClient:
             # This is a rare case - consider if handler should be called with None payload
         except Exception as e:
-            # Check if cancelled before sending error response
-            if run_id and run_id in self._cancelled_run_ids:
-                logger.info("Command cancelled during execution (exception occurred): run_id=%s, step_number=%s, command=%s", run_id, step_number, command)
-                await msg.ack()
-                await self._publish_command_response(
-                    msg=msg,
-                    response=CommandResponse(
-                        status=CommandResponseStatus.ERROR,
-                        code=CommandResponseCode.COMMAND_CANCELLED,
-                        message='Command cancelled'
-                    ),
-                    subject=self.response_queue
-                )
-                # Note: Final state update should be published by the handler with machine-specific data
-            else:
-                # Terminate all errors to prevent infinite redelivery loops
-                logger.error("Handler failed (terminating message): %s", e)
-                await msg.term()
-                await self._publish_command_response(
-                    msg=msg,
-                    response=CommandResponse(
-                        status=CommandResponseStatus.ERROR,
-                        code=CommandResponseCode.EXECUTION_ERROR,
-                        message=str(e)
-                    ),
-                    subject=self.response_queue
-                )
-                # Note: Final state update should be published by the handler with machine-specific data
+            # Terminate all errors to prevent infinite redelivery loops
+            logger.error("Handler failed (terminating message): %s", e)
+            await msg.term()
+            await self._publish_command_response(
+                msg=msg,
+                response=CommandResponse(
+                    status=CommandResponseStatus.ERROR,
+                    code=CommandResponseCode.EXECUTION_ERROR,
+                    message=str(e)
+                ),
+                subject=self.response_queue
+            )
+            # Note: Final state update should be published by the handler with machine-specific data
     async def process_immediate_cmd(self, msg: Msg, handler: Callable[[CommandRequest], Awaitable[CommandResponse]]) -> None:
         """Process immediate commands (pause, cancel, resume, etc.)."""
@@ -595,8 +581,49 @@ class MachineClient:
                 return
             command_name = message.command.name.lower()
+            run_id = message.header.run_id
+            response: CommandResponse
             match command_name:
+                case ImmediateCommand.START:
+                    if run_id:
+                        success = await self.run_manager.start_run(run_id)
+                        if not success:
+                            # Run already active
+                            response = CommandResponse(
+                                status=CommandResponseStatus.ERROR,
+                                code=CommandResponseCode.RUN_ID_MISMATCH,
+                                message='cannot start, another run is currently running'
+                            )
+                        else:
+                            await self.publish_state({'state': 'active', 'run_id': run_id})
+                            response = CommandResponse(status=CommandResponseStatus.SUCCESS)
+                    else:
+                        response = CommandResponse(
+                            status=CommandResponseStatus.ERROR,
+                            code=CommandResponseCode.MISSING_RUN_ID,
+                            message='START command requires RUN_ID'
+                        )
+                case ImmediateCommand.COMPLETE:
+                    if not run_id:
+                        response = CommandResponse(
+                            status=CommandResponseStatus.ERROR,
+                            code=CommandResponseCode.MISSING_RUN_ID,
+                            message='COMPLETE command requires RUN_ID'
+                        )
+                    else:
+                        success = await self.run_manager.complete_run(run_id)
+                        if success:
+                            await self.publish_state({'state': 'idle', 'run_id': None})
+                            response = CommandResponse(status=CommandResponseStatus.SUCCESS)
+                        else:
+                            response = CommandResponse(
+                                status=CommandResponseStatus.ERROR,
+                                code=CommandResponseCode.RUN_ID_MISMATCH,
+                                message=f'Run {run_id} not active'
+                            )
                 case ImmediateCommand.PAUSE:
                     async with self._pause_lock:
                         if not self._is_paused:
@@ -604,7 +631,7 @@ class MachineClient:
                             logger.info("Queue paused")
                             await self.publish_state({'state': 'paused', 'run_id': message.header.run_id})
                     # Call handler and use its response
-                    response: CommandResponse = await handler(message)
+                    response = await handler(message)
                 case ImmediateCommand.RESUME:
                     async with self._pause_lock:
@@ -613,19 +640,30 @@ class MachineClient:
                             logger.info("Queue resumed")
                             await self.publish_state({'state': 'idle', 'run_id': None})
                     # Call handler and use its response
-                    response: CommandResponse = await handler(message)
+                    response = await handler(message)
                 case ImmediateCommand.CANCEL:
-                    if message.header.run_id:
-                        self._cancelled_run_ids.add(message.header.run_id)
-                        logger.info("Cancelling all commands with run_id: %s", message.header.run_id)
+                    if not run_id:
+                        response = CommandResponse(
+                            status=CommandResponseStatus.ERROR,
+                            code=CommandResponseCode.MISSING_RUN_ID,
+                            message='CANCEL command requires RUN_ID'
+                        )
+                    else:
+                        logger.info("Cancelling all commands with run_id: %s", run_id)
+                        # Clear the active run_id when cancelling; complete_run() clears it only if run_id matches the active run, and a False result is ignored here
+                        await self.run_manager.complete_run(run_id)
                         await self.publish_state({'state': 'idle', 'run_id': None})
-                    # Call handler and use its response
-                    response: CommandResponse = await handler(message)
+                        # Call handler and use its response
+                        response = await handler(message)
                 case _:
-                    # For other immediate commands, call the user-provided handler
-                    response: CommandResponse = await handler(message)
+                    # Unknown immediate command
+                    response = CommandResponse(
+                        status=CommandResponseStatus.ERROR,
+                        code=CommandResponseCode.UNKNOWN_COMMAND,
+                        message=f'Unknown immediate command: {command_name}'
+                    )
             await self._publish_command_response(
                 msg=msg,
@@ -661,9 +699,54 @@ class MachineClient:
             )
             await self.publish_state({'state': 'error', 'run_id': None})
+    async def _verify_or_recreate_consumer(self, durable_name: str):
+        """
+        Check whether the durable consumer exists and whether its configuration matches.
+        Deletes a mismatched consumer so the caller's subsequent pull_subscribe recreates it (nothing is recreated here).
+        Args:
+            durable_name: Name of the durable consumer to verify
+        """
+        # Check if consumer exists and verify/update its configuration
+        try:
+            consumer_info = await self.js.consumer_info(self.STREAM_COMMAND_QUEUE, durable_name)
+            logger.debug("Durable consumer %s already exists", durable_name)
+            # Check if consumer config matches what we need
+            config = consumer_info.config
+            needs_recreate = False
+            if getattr(config, 'filter_subject', None) != self.cmd_queue:
+                logger.warning("Consumer filter_subject mismatch: expected %s, got %s",
+                             self.cmd_queue, getattr(config, 'filter_subject', None))
+                needs_recreate = True
+            if getattr(config, 'ack_policy', None) != 'explicit':
+                logger.warning("Consumer ack_policy mismatch: expected explicit, got %s",
+                             getattr(config, 'ack_policy', None))
+                needs_recreate = True
+            if getattr(config, 'deliver_policy', None) != 'all':
+                logger.warning("Consumer deliver_policy mismatch: expected all, got %s",
+                             getattr(config, 'deliver_policy', None))
+                needs_recreate = True
+            if needs_recreate:
+                # Consumer exists but config doesn't match - delete and recreate
+                logger.info("Consumer config mismatch, deleting and recreating: %s", durable_name)
+                try:
+                    await self.js.delete_consumer(self.STREAM_COMMAND_QUEUE, durable_name)
+                except Exception as e:
+                    logger.warning("Error deleting consumer: %s", e)
+            else:
+                # Log consumer state for diagnostics
+                logger.info("Consumer exists with correct config - pending: %d, delivered: %d, ack_pending: %d",
+                           consumer_info.num_pending, consumer_info.delivered.consumer_seq,
+                           consumer_info.num_ack_pending)
+        except NotFoundError:
+            # Consumer doesn't exist, will be created by pull_subscribe
+            logger.debug("Durable consumer %s does not exist, will be created", durable_name)
     async def subscribe_queue(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
         """
-        Subscribe to queue commands with default consumer.
+        Subscribe to queue commands with pull consumer.
         Args:
             handler: Async function that processes command payloads and returns CommandResponse
@@ -671,21 +754,69 @@ class MachineClient:
         if not self.js:
             logger.error("JetStream not available for queue subscription")
             return
+        # Store handler for reconnection
+        self._queue_handler = handler
         # Ensure stream exists before attempting to subscribe
-        await self._ensure_command_queue_stream()
+        await self._ensure_all_streams()
         try:
-            async def message_handler(msg: Msg):
-                """Wrapper to process queue messages."""
-                await self.process_queue_cmd(msg, handler)
-            self._cmd_queue_sub = await self.js.subscribe(
+            durable_name = f"cmd_queue_{self.machine_id}"
+            await self._verify_or_recreate_consumer(durable_name)
+            # Create pull subscription - this will create the consumer if it doesn't exist
+            # Pass config directly to ensure correct consumer configuration
+            consumer_config = ConsumerConfig(
+                durable_name=durable_name,
+                filter_subject=self.cmd_queue,
+                ack_policy="explicit",
+                deliver_policy="all",  # Required for WorkQueue: deliver all messages from the beginning
+            )
+            self._cmd_queue_sub = await self.js.pull_subscribe(
                 subject=self.cmd_queue,
+                durable=durable_name,
                 stream=self.STREAM_COMMAND_QUEUE,
-                durable=f"cmd_queue_{self.machine_id}",
-                cb=message_handler
+                config=consumer_config
             )
+            # Log final consumer info for diagnostics
+            try:
+                consumer_info = await self.js.consumer_info(self.STREAM_COMMAND_QUEUE, durable_name)
+                logger.info("Pull subscription created - subject: %s, durable: %s, stream: %s, pending: %d, ack_pending: %d",
+                           self.cmd_queue, durable_name, self.STREAM_COMMAND_QUEUE,
+                           consumer_info.num_pending, consumer_info.num_ack_pending)
+            except Exception as e:
+                logger.warning("Could not get consumer info after subscription: %s", e)
+                logger.info("Pull subscription created - subject: %s, durable: %s, stream: %s",
+                           self.cmd_queue, durable_name, self.STREAM_COMMAND_QUEUE)
+            # Start background task to pull and process messages
+            async def pull_messages():
+                """Continuously pull messages from the queue."""
+                try:
+                    while True:
+                        try:
+                            # Fetch one message (timeout 1 second)
+                            msgs = await self._cmd_queue_sub.fetch(batch=1, timeout=1.0)
+                            if msgs:
+                                logger.debug("Pulled message from queue")
+                                await self.process_queue_cmd(msgs[0], handler)
+                        except asyncio.TimeoutError:
+                            # Timeout is expected when no messages are available
+                            continue
+                        except Exception as e:
+                            logger.error("Error pulling queue messages: %s", e, exc_info=True)
+                            await asyncio.sleep(1)  # Wait before retrying
+                except asyncio.CancelledError:
+                    logger.debug("Queue pull task cancelled")
+                    raise
+            self._cmd_queue_task = asyncio.create_task(pull_messages())
+            logger.info("Started background task for pulling queue messages")
         except NotFoundError:
             # Stream still not found after ensuring it exists - this shouldn't happen
             # but handle it gracefully with detailed diagnostics
@@ -703,10 +834,7 @@ class MachineClient:
                 logger.error("  Stream verification failed: %s", stream_check_error)
             raise
-        # Register handler for reconnection
-        if not any(h['type'] == 'queue' for h in self._reconnect_handlers):
-            self._reconnect_handlers.append({'type': 'queue', 'handler': handler})
-        logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s)",
+        logger.info("Subscribed to queue commands: %s (durable: cmd_queue_%s, stream: %s, pull consumer)",
                    self.cmd_queue, self.machine_id, self.STREAM_COMMAND_QUEUE)
     async def subscribe_immediate(self, handler: Callable[[NATSMessage], Awaitable[CommandResponse]]):
@@ -720,19 +848,26 @@ class MachineClient:
             logger.error("JetStream not available for immediate subscription")
             return
+        # Store handler for use in callback and reconnection
+        self._immediate_handler = handler
         async def message_handler(msg: Msg):
-            """Wrapper to process immediate messages."""
-            await self.process_immediate_cmd(msg, handler)
+            """Process immediate messages using stored handler."""
+            await self.process_immediate_cmd(msg, self._immediate_handler)
         # Ensure stream exists before attempting to subscribe
-        await self._ensure_command_immediate_stream()
+        await self._ensure_stream(
+            self.STREAM_COMMAND_IMMEDIATE,
+            f"{self.NAMESPACE}.*.cmd.immediate",
+            retention='workqueue'
+        )
         try:
             self._cmd_immediate_sub = await self.js.subscribe(
                 subject=self.cmd_immediate,
                 stream=self.STREAM_COMMAND_IMMEDIATE,
                 durable=f"cmd_immed_{self.machine_id}",
-                cb=message_handler
+                cb=message_handler  # required for push consumer to handle messages
             )
         except NotFoundError:
             # Stream still not found after ensuring it exists - this shouldn't happen
@@ -741,9 +876,6 @@ class MachineClient:
                        self.STREAM_COMMAND_IMMEDIATE)
             raise
-        # Register handler for reconnection
-        if not any(h['type'] == 'immediate' for h in self._reconnect_handlers):
-            self._reconnect_handlers.append({'type': 'immediate', 'handler': handler})
         logger.info("Subscribed to immediate commands: %s (durable: cmd_immed_%s, stream: %s)",
                    self.cmd_immediate, self.machine_id, self.STREAM_COMMAND_IMMEDIATE)

puda_comms/models.py CHANGED Viewed

@@ -25,6 +25,7 @@ class CommandResponseCode(str, Enum):
     RESUME_ERROR = 'RESUME_ERROR'
     NO_EXECUTION = 'NO_EXECUTION'
     RUN_ID_MISMATCH = 'RUN_ID_MISMATCH'
+    MISSING_RUN_ID = 'MISSING_RUN_ID'
     CANCEL_ERROR = 'CANCEL_ERROR'
     MACHINE_PAUSED = 'MACHINE_PAUSED'
@@ -40,6 +41,8 @@ class MessageType(str, Enum):
 class ImmediateCommand(str, Enum):
     """Command names for immediate commands."""
+    START = 'start'
+    COMPLETE = 'complete'
     PAUSE = 'pause'
     RESUME = 'resume'
     CANCEL = 'cancel'
@@ -68,18 +71,19 @@ class CommandResponse(BaseModel):
 class MessageHeader(BaseModel):
     """Header for NATS messages."""
-    message_type: MessageType = Field(description="Type of message")
     version: str = Field(default="1.0", description="Message version")
-    timestamp: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
+    message_type: MessageType = Field(description="Type of message")
+    user_id: str = Field(description="User ID")
+    username: str = Field(description="User name")
     machine_id: str = Field(description="Machine ID")
     run_id: Optional[str] = Field(default=None, description="Unique identifier (uuid) for the run/workflow")
+    timestamp: str = Field(default_factory=_get_current_timestamp, description="ISO format timestamp (auto-set on creation)")
 class NATSMessage(BaseModel):
     """
     Complete NATS message structure.
     Structure:
-    - header: MessageHeader with message_type, version, timestamp, machine_id, run_id
+    - header: MessageHeader with message_type, version, timestamp, user_id, username, machine_id, run_id
     - command: Optional CommandRequest (for command messages)
     - response: Optional CommandResponse data (for response messages)
     """

puda_comms/run_manager.py ADDED Viewed

@@ -0,0 +1,112 @@
+"""
+Run State Management
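+Provides run state tracking and validation for machine commands, serialized across coroutines with an asyncio.Lock (safe for concurrent asyncio tasks on one event loop, not for OS threads).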
+"""
+import asyncio
+import logging
+from typing import Optional
+logger = logging.getLogger(__name__)
+class RunManager:
+    """
+    Manages run state for a machine.
+    Tracks the active run_id and validates that commands match the active run.
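+    start_run, complete_run and validate_run_id are guarded by an asyncio.Lock, so they are safe across concurrent coroutines on one event loop (asyncio.Lock is not thread-safe).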
+    """
+    def __init__(self, machine_id: str):
+        """
+        Initialize RunManager for a machine.
+        Args:
+            machine_id: Machine identifier
+        """
+        self.machine_id = machine_id
+        self._active_run_id: Optional[str] = None
+        self._lock = asyncio.Lock()
+    async def start_run(self, run_id: str) -> bool:
+        """
+        Set active run_id. Returns True if successful, False if run already active.
+        Args:
+            run_id: Run ID to set as active
+        Returns:
+            True if run was started successfully, False if another run is already active
+        """
+        async with self._lock:
+            if self._active_run_id is not None:
+                logger.warning(
+                    "Cannot start run %s: run %s is already active on machine %s",
+                    run_id, self._active_run_id, self.machine_id
+                )
+                return False
+            self._active_run_id = run_id
+            logger.info("Started run %s on machine %s", run_id, self.machine_id)
+            return True
+    async def complete_run(self, run_id: str) -> bool:
+        """
+        Clear run_id if it matches. Returns True if successful.
+        Args:
+            run_id: Run ID to complete
+        Returns:
+            True if run was completed successfully, False if run_id doesn't match active run
+        """
+        async with self._lock:
+            if self._active_run_id != run_id:
+                logger.warning(
+                    "Cannot complete run %s: active run is %s on machine %s",
+                    run_id, self._active_run_id, self.machine_id
+                )
+                return False
+            self._active_run_id = None
+            logger.info("Completed run %s on machine %s", run_id, self.machine_id)
+            return True
+    async def validate_run_id(self, run_id: str) -> bool:
+        """
+        Check if run_id matches active run. Returns True if valid.
+        Args:
+            run_id: Run ID to validate (required)
+        Returns:
+            True if run_id matches active run, False otherwise
+        """
+        async with self._lock:
+            # If no active run, any run_id is invalid
+            if self._active_run_id is None:
+                logger.warning(
+                    "Run ID validation failed: no active run, got %s on machine %s",
+                    run_id, self.machine_id
+                )
+                return False
+            # Run_id must match active run
+            if self._active_run_id != run_id:
+                logger.warning(
+                    "Run ID validation failed: expected %s, got %s on machine %s",
+                    self._active_run_id, run_id, self.machine_id
+                )
+                return False
+            return True
+    def get_active_run_id(self) -> Optional[str]:
+        """
+        Get current active run_id.
+        Returns:
+            Active run_id if one exists, None otherwise
+        """
+        return self._active_run_id

{puda_comms-0.0.3.dist-info → puda_comms-0.0.5.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: puda-comms
-Version: 0.0.3
+Version: 0.0.5
 Summary: Communication library for the PUDA platform.
 Author: zhao
 Author-email: zhao <20024592+agentzhao@users.noreply.github.com>
@@ -121,6 +121,8 @@ Header metadata for NATS messages.
 - `message_type` (MessageType): Type of message (COMMAND, RESPONSE, LOG, etc.)
 - `version` (str): Message version (default: "1.0")
 - `timestamp` (str): ISO 8601 UTC timestamp (auto-generated)
+- `user_id` (str): ID of the user who initiated the command
+- `username` (str): Name of the user who initiated the command
 - `machine_id` (str): Identifier for the target machine
 - `run_id` (Optional[str]): Unique identifier (UUID) for the run/workflow
@@ -130,6 +132,8 @@ header = MessageHeader(
     message_type=MessageType.RESPONSE,
     version="1.0",
     timestamp="2026-01-20T02:00:46Z",
+    user_id="user123",
+    username="John Doe",
     machine_id="first",
     run_id="092073e6-13d0-4756-8d99-eff1612a5a72"
 )
@@ -154,6 +158,8 @@ Complete NATS message structure combining header with optional command or respon
     "message_type": "response",
     "version": "1.0",
     "timestamp": "2026-01-20T02:00:46Z",
+    "user_id": "user123",
+    "username": "John Doe",
     "machine_id": "first",
     "run_id": "092073e6-13d0-4756-8d99-eff1612a5a72"
   },
@@ -229,6 +235,8 @@ reply = await service.send_queue_command(
     request=request,
     machine_id="first",
     run_id=run_id,
+    user_id="user123",
+    username="John Doe",
     timeout=60  # Wait up to 60 seconds
 )
@@ -237,6 +245,8 @@ reply = await service.send_queue_commands(
     requests=commands,
     machine_id="first",
     run_id=run_id,
+    user_id="user123",
+    username="John Doe",
     timeout=60  # Wait up to 60 seconds per command
 )
 ```
@@ -274,7 +284,9 @@ Always check the response status and handle errors appropriately:
 reply: NATSMessage = await service.send_queue_command(
     request=request,
     machine_id="first",
-    run_id=run_id
+    run_id=run_id,
+    user_id="user123",
+    username="John Doe"
 )
 if reply is None:

puda_comms-0.0.5.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+puda_comms/__init__.py,sha256=lntvVFJJez_rv5lZy5mYj4_43B9Y3NRNzxWfBuSAQ1M,194
+puda_comms/command_service.py,sha256=Lxk-CUan_DwftBZlSYO3VnddxaM9fYKxxhWF8VCqABY,30423
+puda_comms/execution_state.py,sha256=aTaejCnJgg1y_FP-ymIC1GQzqC81FIWo0RZ18XzAQnA,2881
+puda_comms/machine_client.py,sha256=OnA8we1c62n1aEFr0NfiapklHWXR-WFzq5FXQrvuUM8,39378
+puda_comms/models.py,sha256=CfXq_Wxqk5OQo5VknXR-BdLIT2SM69s8cGxGYr9T8WI,3701
+puda_comms/run_manager.py,sha256=_s4VYVGwtRMcduz95_DPIObso4uWRS24n5NH7AiGgjI,3591
+puda_comms-0.0.5.dist-info/WHEEL,sha256=ZyFSCYkV2BrxH6-HRVRg3R9Fo7MALzer9KiPYqNxSbo,79
+puda_comms-0.0.5.dist-info/METADATA,sha256=REBvcpJsUCxiFCKihVVReP0lh6IkJcBl4I8XohjhSHE,11512
+puda_comms-0.0.5.dist-info/RECORD,,

puda_comms-0.0.3.dist-info/RECORD DELETED Viewed

@@ -1,8 +0,0 @@
-puda_comms/__init__.py,sha256=lntvVFJJez_rv5lZy5mYj4_43B9Y3NRNzxWfBuSAQ1M,194
-puda_comms/command_service.py,sha256=E5kGzl2hjkSTubxv01nxuo9XMXHY5aTEsn-k3IDJVB8,24727
-puda_comms/execution_state.py,sha256=aTaejCnJgg1y_FP-ymIC1GQzqC81FIWo0RZ18XzAQnA,2881
-puda_comms/machine_client.py,sha256=r8oSnkRoqhKykvyR94kGlA1vRrCKLq-o9uNZQftxqDU,33120
-puda_comms/models.py,sha256=cVH5uKzyLmjzPeBcm3RIJMTkoynmxqe_P26GtZwlIN8,3500
-puda_comms-0.0.3.dist-info/WHEEL,sha256=ZyFSCYkV2BrxH6-HRVRg3R9Fo7MALzer9KiPYqNxSbo,79
-puda_comms-0.0.3.dist-info/METADATA,sha256=Fnf_YWeOZAcefPUTY976BUT95M0w-8bSqAhjVMkmjxA,11158
-puda_comms-0.0.3.dist-info/RECORD,,

{puda_comms-0.0.3.dist-info → puda_comms-0.0.5.dist-info}/WHEEL RENAMED Viewed

File without changes

puda-comms 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

puda-comms 0.0.3py3-none-any.whl → 0.0.5py3-none-any.whl