PyPI - puda-comms - Versions diffs - 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl - Mend

puda-comms 0.0.4-py3-none-any.whl → 0.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

puda_comms/__init__.py +5 -1
puda_comms/command_service.py +261 -85
puda_comms/machine_client.py +215 -88
puda_comms/models.py +7 -1
puda_comms/run_manager.py +112 -0
puda_comms/stream_subscriber.py +388 -0
{puda_comms-0.0.4.dist-info → puda_comms-0.0.6.dist-info}/METADATA +12 -13
puda_comms-0.0.6.dist-info/RECORD +10 -0
puda_comms-0.0.4.dist-info/RECORD +0 -8
{puda_comms-0.0.4.dist-info → puda_comms-0.0.6.dist-info}/WHEEL +0 -0

puda_comms/__init__.py CHANGED Viewed

@@ -1,5 +1,9 @@
+# Import models first to ensure they're initialized before other modules that depend on them
+from . import models
 from .machine_client import MachineClient
 from .execution_state import ExecutionState
 from .command_service import CommandService
+from .stream_subscriber import StreamSubscriber
-__all__ = ["MachineClient", "ExecutionState", "CommandService"]
+__all__ = ["MachineClient", "ExecutionState", "CommandService", "StreamSubscriber", "models"]

puda_comms/command_service.py CHANGED Viewed

@@ -16,7 +16,13 @@ from typing import Dict, Any, Optional
 import nats
 from nats.js.client import JetStreamContext
 from nats.aio.msg import Msg
-from puda_comms.models import CommandRequest, CommandResponseStatus, NATSMessage, MessageHeader, MessageType
+from .models import (
+    CommandRequest,
+    CommandResponseStatus,
+    NATSMessage,
+    MessageHeader,
+    MessageType,
+)
 logger = logging.getLogger(__name__)
@@ -98,7 +104,7 @@ class ResponseHandler:
                     command, step_number, run_id, message.response.status
                 )
                 if message.response.status == CommandResponseStatus.ERROR:
-                    logger.warning("Command failed: %s", message.response.message)
+                    logger.error("Error Code: %s, Message: %s", message.response.code.name, message.response.message)
                 # Get the pending response
                 pending = self._pending_responses[key]
@@ -266,9 +272,9 @@ class CommandService:
         max_attempts = 3
         connect_timeout = 3  # 3 seconds timeout per connection attempt
-        for attempt in range(1, max_attempts + 1):
+        for attempt in range(max_attempts):
             try:
-                logger.info("Connection attempt %d/%d to NATS servers: %s", attempt, max_attempts, self.servers)
+                logger.info("Connection attempt %d/%d to NATS servers: %s", attempt + 1, max_attempts, self.servers)
                 self.nc = await asyncio.wait_for(
                     nats.connect(
                         servers=self.servers,
@@ -285,14 +291,14 @@ class CommandService:
                 return True
             except asyncio.TimeoutError:
-                logger.warning("Connection attempt %d/%d timed out after %d seconds", attempt, max_attempts, connect_timeout)
-                if attempt < max_attempts:
+                logger.warning("Connection attempt %d/%d timed out after %d seconds", attempt + 1, max_attempts, connect_timeout)
+                if attempt < max_attempts - 1:
                     logger.info("Retrying connection...")
                 else:
                     logger.error("Failed to connect after %d attempts. Giving up.", max_attempts)
             except Exception as e:
-                logger.warning("Connection attempt %d/%d failed: %s", attempt, max_attempts, e)
-                if attempt < max_attempts:
+                logger.warning("Connection attempt %d/%d failed: %s", attempt + 1, max_attempts, e)
+                if attempt < max_attempts - 1:
                     logger.info("Retrying connection...")
                 else:
                     logger.error("Failed to connect after %d attempts. Giving up.", max_attempts)
@@ -341,7 +347,6 @@ class CommandService:
         self,
         *,
         request: CommandRequest,
-        machine_id: str,
         run_id: str,
         user_id: str,
         username: str,
@@ -351,8 +356,7 @@ class CommandService:
         Send a queue command to the machine and wait for response.
         Args:
-            request: CommandRequest model containing command details
-            machine_id: Machine ID to send the command to
+            request: CommandRequest model containing command details (must include machine_id)
             run_id: Run ID for the command
             user_id: User ID who initiated the command
             username: Username who initiated the command
@@ -364,8 +368,8 @@ class CommandService:
         if not self._connected or not self.js:
             raise RuntimeError("Not connected to NATS. Call connect() first.")
-        # Determine subject
-        subject = f"{NAMESPACE}.{machine_id}.cmd.queue"
+        # Determine subject using machine_id from request
+        subject = f"{NAMESPACE}.{request.machine_id}.cmd.queue"
         logger.info(
             "Sending queue command: subject=%s, command=%s, run_id=%s, step_number=%s",
@@ -373,12 +377,12 @@ class CommandService:
         )
         # Get or create response handler for this machine
-        response_handler = await self._get_response_handler(machine_id)
+        response_handler = await self._get_response_handler(request.machine_id)
         # Register pending response
         response_event = response_handler.register_pending(run_id, request.step_number)
         # Build payload
-        payload = self._build_command_payload(request, machine_id, run_id, user_id, username)
+        payload = self._build_command_payload(request, request.machine_id, run_id, user_id, username)
         try:
             # Publish to JetStream
@@ -408,11 +412,80 @@ class CommandService:
             response_handler.remove_pending(run_id, request.step_number)
             return None
+    async def start_run(
+        self,
+        machine_id: str,
+        run_id: str,
+        user_id: str,
+        username: str,
+        timeout: int = 120
+    ) -> Optional[NATSMessage]:
+        """
+        Send START immediate command to begin a run.
+        Args:
+            machine_id: Machine ID to send the command to
+            run_id: Run ID for the command
+            user_id: User ID who initiated the command
+            username: Username who initiated the command
+            timeout: Maximum time to wait for response in seconds
+        Returns:
+            NATSMessage if successful, None if failed or timeout
+        """
+        request = CommandRequest(
+            name="start",
+            machine_id=machine_id,
+            params={},
+            step_number=0
+        )
+        return await self.send_immediate_command(
+            request=request,
+            run_id=run_id,
+            user_id=user_id,
+            username=username,
+            timeout=timeout
+        )
+    async def complete_run(
+        self,
+        machine_id: str,
+        run_id: str,
+        user_id: str,
+        username: str,
+        timeout: int = 120
+    ) -> Optional[NATSMessage]:
+        """
+        Send COMPLETE immediate command to end a run.
+        Args:
+            machine_id: Machine ID to send the command to
+            run_id: Run ID for the command
+            user_id: User ID who initiated the command
+            username: Username who initiated the command
+            timeout: Maximum time to wait for response in seconds
+        Returns:
+            NATSMessage if successful, None if failed or timeout
+        """
+        request = CommandRequest(
+            name="complete",
+            machine_id=machine_id,
+            params={},
+            step_number=0
+        )
+        return await self.send_immediate_command(
+            request=request,
+            run_id=run_id,
+            user_id=user_id,
+            username=username,
+            timeout=timeout
+        )
     async def send_queue_commands(
         self,
         *,
         requests: list[CommandRequest],
-        machine_id: str,
         run_id: str,
         user_id: str,
         username: str,
@@ -421,13 +494,18 @@ class CommandService:
         """
         Send multiple queue commands sequentially and wait for responses.
+        Automatically sends START commands to all unique machine_ids before the sequence
+        and COMPLETE commands to all unique machine_ids after successful completion.
         Sends commands one by one, waiting for each response before sending the next.
-        If any command fails or times out, stops immediately and returns the error response.
-        If all commands succeed, returns the last command's response.
+        If any command fails or times out, stops immediately, completes all started runs,
+        and returns the error response. If all commands succeed, returns the last command's response.
+        Each command must specify its own machine_id. Commands with different machine_ids
+        will be sent to their respective machines. All machines involved will receive
+        START commands at the beginning and COMPLETE commands at the end.
         Args:
-            requests: List of CommandRequest models to send sequentially
-            machine_id: Machine ID to send the commands to
+            requests: List of CommandRequest models to send sequentially (each must include machine_id)
             run_id: Run ID for all commands
             user_id: User ID who initiated the commands
             username: Username who initiated the commands
@@ -444,89 +522,186 @@ class CommandService:
             logger.warning("No commands to send")
             return None
+        # Collect all unique machine_ids from requests
+        machine_ids = set()
+        for req in requests:
+            if isinstance(req, dict):
+                req = CommandRequest.model_validate(req)
+            elif not isinstance(req, CommandRequest):
+                raise ValueError(f"Request must be a CommandRequest or dict, got {type(req)}")
+            machine_ids.add(req.machine_id)
+        machine_ids_list = sorted(list(machine_ids))  # Sort for consistent logging
         logger.info(
-            "Sending %d queue commands sequentially: machine_id=%s, run_id=%s",
+            "Sending %d queue commands sequentially to machines: %s, run_id=%s",
             len(requests),
-            machine_id,
+            machine_ids_list,
             run_id
         )
-        last_response: Optional[NATSMessage] = None
-        for idx, request in enumerate(requests, start=1):
-            logger.info(
-                "Sending command %d/%d: %s (step %s)",
-                idx,
-                len(requests),
-                request.name,
-                request.step_number
-            )
-            response = await self.send_queue_command(
-                request=request,
+        # Send START commands to all unique machine_ids before sequence
+        logger.info("Sending START commands to all machines: %s", machine_ids_list)
+        started_machines = set()
+        for machine_id in machine_ids_list:
+            start_response = await self.start_run(
                 machine_id=machine_id,
                 run_id=run_id,
                 user_id=user_id,
                 username=username,
                 timeout=timeout
             )
-            # Check if command failed (None means timeout or exception)
-            if response is None:
-                logger.error(
-                    "Command %d/%d failed or timed out: %s (step %s)",
+            if start_response is None:
+                logger.error("START command timed out for machine: %s, aborting", machine_id)
+                return None
+            if start_response.response and start_response.response.status == CommandResponseStatus.ERROR:
+                logger.error("START command failed for machine %s: %s, aborting", machine_id, start_response.response.message)
+                return start_response
+            started_machines.add(machine_id)
+        last_response: Optional[NATSMessage] = None
+        try:
+            for idx, request in enumerate(requests, start=1):
+                # Validate request - convert dict to CommandRequest if needed
+                if isinstance(request, dict):
+                    request = CommandRequest.model_validate(request)
+                elif not isinstance(request, CommandRequest):
+                    raise ValueError(f"Request {idx} must be a CommandRequest or dict, got {type(request)}")
+                logger.info(
+                    "Sending command %d/%d: %s (step %s) to machine %s",
                     idx,
                     len(requests),
                     request.name,
-                    request.step_number
+                    request.step_number,
+                    request.machine_id
+                )
+                response = await self.send_queue_command(
+                    request=request,
+                    run_id=run_id,
+                    user_id=user_id,
+                    username=username,
+                    timeout=timeout
                 )
-                return None
-            # Check if command returned an error status
-            if response.response is not None:
-                if response.response.status == CommandResponseStatus.ERROR:
+                # Check if command failed (None means timeout or exception)
+                if response is None:
                     logger.error(
-                        "Command %d/%d failed with error: %s (step %s) - code: %s, message: %s",
+                        "Command %d/%d failed or timed out: %s (step %s)",
+                        idx,
+                        len(requests),
+                        request.name,
+                        request.step_number
+                    )
+                    return None
+                # Check if command returned an error status
+                if response.response is not None:
+                    if response.response.status == CommandResponseStatus.ERROR:
+                        logger.error(
+                            "Command %d/%d failed with error: %s (step %s) - code: %s, message: %s",
+                            idx,
+                            len(requests),
+                            request.name,
+                            request.step_number,
+                            response.response.code.name,
+                            response.response.message
+                        )
+                        # Complete the run on all machines that were started
+                        logger.info("Completing runs on all machines due to error")
+                        for machine_id_to_complete in started_machines:
+                            try:
+                                await self.complete_run(
+                                    machine_id=machine_id_to_complete,
+                                    run_id=run_id,
+                                    user_id=user_id,
+                                    username=username,
+                                    timeout=timeout
+                                )
+                            except Exception as e:
+                                logger.error("Failed to complete run for machine %s during error cleanup: %s", machine_id_to_complete, e)
+                        return response
+                    # Command succeeded, store as last response
+                    last_response = response
+                    logger.info(
+                        "Command %d/%d succeeded: %s (step %s)",
+                        idx,
+                        len(requests),
+                        request.name,
+                        request.step_number
+                    )
+                else:
+                    # Response exists but has no response data (shouldn't happen, but handle it)
+                    logger.warning(
+                        "Command %d/%d returned response with no response data: %s (step %s)",
                         idx,
                         len(requests),
                         request.name,
-                        request.step_number,
-                        response.response.code,
-                        response.response.message
+                        request.step_number
                     )
+                    # Complete the run on all machines that were started
+                    logger.info("Completing runs on all machines due to error")
+                    for machine_id_to_complete in started_machines:
+                        try:
+                            await self.complete_run(
+                                machine_id=machine_id_to_complete,
+                                run_id=run_id,
+                                user_id=user_id,
+                                username=username,
+                                timeout=timeout
+                            )
+                        except Exception as e:
+                            logger.error("Failed to complete run for machine %s during error cleanup: %s", machine_id_to_complete, e)
                     return response
-                # Command succeeded, store as last response
-                last_response = response
-                logger.info(
-                    "Command %d/%d succeeded: %s (step %s)",
-                    idx,
-                    len(requests),
-                    request.name,
-                    request.step_number
-                )
-            else:
-                # Response exists but has no response data (shouldn't happen, but handle it)
-                logger.warning(
-                    "Command %d/%d returned response with no response data: %s (step %s)",
-                    idx,
-                    len(requests),
-                    request.name,
-                    request.step_number
+            logger.info(
+                "All %d commands completed successfully",
+                len(requests)
+            )
+            # Always send COMPLETE commands to all machines after successful sequence
+            logger.info("Sending COMPLETE commands to all machines: %s", machine_ids_list)
+            for machine_id_to_complete in machine_ids_list:
+                complete_response = await self.complete_run(
+                    machine_id=machine_id_to_complete,
+                    run_id=run_id,
+                    user_id=user_id,
+                    username=username,
+                    timeout=timeout
                 )
-                return response
-        logger.info(
-            "All %d commands completed successfully",
-            len(requests)
-        )
-        return last_response
+                if complete_response is None:
+                    logger.error("COMPLETE command timed out for machine: %s, aborting", machine_id_to_complete)
+                    return None
+                if complete_response.response and complete_response.response.status == CommandResponseStatus.ERROR:
+                    logger.error("COMPLETE command failed for machine %s: %s, aborting", machine_id_to_complete, complete_response.response.message)
+                    return complete_response
+            # Return the last command response, not the COMPLETE response
+            return last_response
+        except Exception as e:
+            # If any error occurs during command execution, try to complete the run
+            # on all machines that were started to clean up state
+            logger.warning("Error during command sequence, attempting to complete runs on all machines: %s", e)
+            for machine_id_to_complete in started_machines:
+                try:
+                    await self.complete_run(
+                        machine_id=machine_id_to_complete,
+                        run_id=run_id,
+                        user_id=user_id,
+                        username=username,
+                        timeout=timeout
+                    )
+                except Exception as cleanup_error:
+                    logger.error("Failed to complete run for machine %s during error cleanup: %s", machine_id_to_complete, cleanup_error)
+            raise
     async def send_immediate_command(
         self,
         *,
         request: CommandRequest,
-        machine_id: str,
         run_id: str,
         user_id: str,
         username: str,
@@ -536,8 +711,7 @@ class CommandService:
         Send an immediate command (pause, resume, cancel) to the machine.
         Args:
-            request: CommandRequest model containing command details
-            machine_id: Machine ID to send the command to
+            request: CommandRequest model containing command details (must include machine_id)
             run_id: Run ID for the command
             user_id: User ID who initiated the command
             username: Username who initiated the command
@@ -549,23 +723,22 @@ class CommandService:
         if not self._connected or not self.js:
             raise RuntimeError("Not connected to NATS. Call connect() first.")
-        # Determine subject
-        subject = f"{NAMESPACE}.{machine_id}.cmd.immediate"
+        # Determine subject using machine_id from request
+        subject = f"{NAMESPACE}.{request.machine_id}.cmd.immediate"
         logger.info(
             "Sending immediate command: machine_id=%s, command=%s, run_id=%s, step_number=%s",
-            machine_id, request.name, run_id, request.step_number
+            request.machine_id, request.name, run_id, request.step_number
         )
         # Get or create response handler for this machine
-        response_handler = await self._get_response_handler(machine_id)
+        response_handler = await self._get_response_handler(request.machine_id)
         # Register pending response
         response_received = response_handler.register_pending(run_id, request.step_number)
         # Build payload
-        payload = self._build_command_payload(request, machine_id, run_id, user_id, username)
+        payload = self._build_command_payload(request, request.machine_id, run_id, user_id, username)
         try:
             # Publish to JetStream
@@ -651,13 +824,16 @@ class CommandService:
         Args:
             command_request: CommandRequest model containing command details
             machine_id: Machine ID for the command
-            run_id: Run ID for the command
+            run_id: Run ID for the command (empty string will be converted to None)
             user_id: User ID who initiated the command
             username: Username who initiated the command
         Returns:
             NATSMessage object ready for NATS transmission
         """
+        # Convert empty string to None for run_id
+        run_id_value = run_id if run_id else None
         header = MessageHeader(
             message_type=MessageType.COMMAND,
             version="1.0",
@@ -665,7 +841,7 @@ class CommandService:
             user_id=user_id,
             username=username,
             machine_id=machine_id,
-            run_id=run_id
+            run_id=run_id_value
         )
         return NATSMessage(

puda-comms 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

puda-comms 0.0.4-py3-none-any.whl → 0.0.6-py3-none-any.whl