qalita-2.9.1-py3-none-any.whl → qalita-2.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. qalita/_frontend/.next/BUILD_ID +1 -1
  2. qalita/_frontend/.next/build-manifest.json +7 -7
  3. qalita/_frontend/.next/prerender-manifest.json +3 -3
  4. qalita/_frontend/.next/required-server-files.json +196 -40
  5. qalita/_frontend/.next/server/app/_global-error/page/build-manifest.json +5 -5
  6. qalita/_frontend/.next/server/app/_global-error/page_client-reference-manifest.js +1 -1
  7. qalita/_frontend/.next/server/app/_global-error.html +2 -2
  8. qalita/_frontend/.next/server/app/_global-error.rsc +7 -7
  9. qalita/_frontend/.next/server/app/_global-error.segments/__PAGE__.segment.rsc +2 -2
  10. qalita/_frontend/.next/server/app/_global-error.segments/_full.segment.rsc +7 -7
  11. qalita/_frontend/.next/server/app/_global-error.segments/_head.segment.rsc +3 -3
  12. qalita/_frontend/.next/server/app/_global-error.segments/_index.segment.rsc +3 -3
  13. qalita/_frontend/.next/server/app/_global-error.segments/_tree.segment.rsc +1 -1
  14. qalita/_frontend/.next/server/app/_not-found/page/build-manifest.json +5 -5
  15. qalita/_frontend/.next/server/app/_not-found/page_client-reference-manifest.js +1 -1
  16. qalita/_frontend/.next/server/app/_not-found.html +1 -1
  17. qalita/_frontend/.next/server/app/_not-found.rsc +9 -9
  18. qalita/_frontend/.next/server/app/_not-found.segments/_full.segment.rsc +9 -9
  19. qalita/_frontend/.next/server/app/_not-found.segments/_head.segment.rsc +3 -3
  20. qalita/_frontend/.next/server/app/_not-found.segments/_index.segment.rsc +5 -5
  21. qalita/_frontend/.next/server/app/_not-found.segments/_not-found/__PAGE__.segment.rsc +2 -2
  22. qalita/_frontend/.next/server/app/_not-found.segments/_not-found.segment.rsc +3 -3
  23. qalita/_frontend/.next/server/app/_not-found.segments/_tree.segment.rsc +2 -2
  24. qalita/_frontend/.next/server/app/page/build-manifest.json +5 -5
  25. qalita/_frontend/.next/server/app/page_client-reference-manifest.js +1 -1
  26. qalita/_frontend/.next/server/app/sources/add/page/build-manifest.json +5 -5
  27. qalita/_frontend/.next/server/app/sources/add/page_client-reference-manifest.js +1 -1
  28. qalita/_frontend/.next/server/app/sources/add.html +1 -1
  29. qalita/_frontend/.next/server/app/sources/add.rsc +11 -11
  30. qalita/_frontend/.next/server/app/sources/add.segments/_full.segment.rsc +11 -11
  31. qalita/_frontend/.next/server/app/sources/add.segments/_head.segment.rsc +3 -3
  32. qalita/_frontend/.next/server/app/sources/add.segments/_index.segment.rsc +5 -5
  33. qalita/_frontend/.next/server/app/sources/add.segments/_tree.segment.rsc +2 -2
  34. qalita/_frontend/.next/server/app/sources/add.segments/sources/add/__PAGE__.segment.rsc +4 -4
  35. qalita/_frontend/.next/server/app/sources/add.segments/sources/add.segment.rsc +3 -3
  36. qalita/_frontend/.next/server/app/sources/add.segments/sources.segment.rsc +3 -3
  37. qalita/_frontend/.next/server/app/sources/edit/[id]/page/build-manifest.json +5 -5
  38. qalita/_frontend/.next/server/app/sources/edit/[id]/page_client-reference-manifest.js +1 -1
  39. qalita/_frontend/.next/server/app/sources/page/build-manifest.json +5 -5
  40. qalita/_frontend/.next/server/app/sources/page_client-reference-manifest.js +1 -1
  41. qalita/_frontend/.next/server/app/sources.html +1 -1
  42. qalita/_frontend/.next/server/app/sources.rsc +11 -11
  43. qalita/_frontend/.next/server/app/sources.segments/_full.segment.rsc +11 -11
  44. qalita/_frontend/.next/server/app/sources.segments/_head.segment.rsc +3 -3
  45. qalita/_frontend/.next/server/app/sources.segments/_index.segment.rsc +5 -5
  46. qalita/_frontend/.next/server/app/sources.segments/_tree.segment.rsc +2 -2
  47. qalita/_frontend/.next/server/app/sources.segments/sources/__PAGE__.segment.rsc +4 -4
  48. qalita/_frontend/.next/server/app/sources.segments/sources.segment.rsc +3 -3
  49. qalita/_frontend/.next/server/chunks/[root-of-the-server]__bf0c3d33._.js +3 -3
  50. qalita/_frontend/.next/server/chunks/[root-of-the-server]__f408c708._.js +2 -2
  51. qalita/_frontend/.next/server/chunks/ssr/[root-of-the-server]__be91267c._.js +1 -1
  52. qalita/_frontend/.next/server/chunks/ssr/_404f6e81._.js +2 -2
  53. qalita/_frontend/.next/server/chunks/ssr/_6a67f6f0._.js +2 -2
  54. qalita/_frontend/.next/server/chunks/ssr/_cb7b44d6._.js +1 -1
  55. qalita/_frontend/.next/server/chunks/ssr/_d44c43ed._.js +1 -1
  56. qalita/_frontend/.next/server/chunks/ssr/components_DashboardContent_tsx_c3635665._.js +1 -1
  57. qalita/_frontend/.next/server/chunks/ssr/node_modules_next_dist_4b9a0874._.js +1 -1
  58. qalita/_frontend/.next/server/middleware-build-manifest.js +5 -5
  59. qalita/_frontend/.next/server/pages/404.html +1 -1
  60. qalita/_frontend/.next/server/pages/500.html +2 -2
  61. qalita/_frontend/.next/server/server-reference-manifest.js +1 -1
  62. qalita/_frontend/.next/server/server-reference-manifest.json +1 -1
  63. qalita/_frontend/.next/static/chunks/0c7542414b6a6f86.js +2 -0
  64. qalita/_frontend/.next/static/chunks/{89ba62a8ba9b79ce.js → 12daa96885968840.js} +1 -1
  65. qalita/_frontend/.next/static/chunks/1e6a98e93c470083.css +1 -0
  66. qalita/_frontend/.next/static/chunks/499b7099996cc9f9.js +1 -0
  67. qalita/_frontend/.next/static/chunks/694836347d1e5ef3.js +1 -0
  68. qalita/_frontend/.next/static/chunks/7ea91ca84dc4b3a4.js +1 -0
  69. qalita/_frontend/.next/static/chunks/89c689b5748e28ed.js +1 -0
  70. qalita/_frontend/.next/static/chunks/9e71bf77f23416e6.js +1 -0
  71. qalita/_frontend/.next/static/chunks/aa2a44cc19d89bdb.js +1 -0
  72. qalita/_frontend/.next/static/chunks/ba22289f779d638e.js +1 -0
  73. qalita/_frontend/.next/static/chunks/bb05964d928aa166.js +3 -0
  74. qalita/_frontend/.next/static/chunks/dde1c328f398837e.js +1 -0
  75. qalita/_frontend/.next/static/chunks/ecbb64dc112ad516.js +1 -0
  76. qalita/_frontend/.next/static/chunks/facd124df217e016.js +1 -0
  77. qalita/_frontend/.next/static/chunks/turbopack-9fc8bcb3a9806c66.js +4 -0
  78. qalita/_frontend/node_modules/@next/env/package.json +1 -1
  79. qalita/_frontend/node_modules/next/dist/build/index.js +10 -4
  80. qalita/_frontend/node_modules/next/dist/build/swc/index.js +1 -1
  81. qalita/_frontend/node_modules/next/dist/build/webpack-config.js +3 -3
  82. qalita/_frontend/node_modules/next/dist/client/components/segment-cache/lru.js +2 -0
  83. qalita/_frontend/node_modules/next/dist/compiled/next-server/app-page-turbo-experimental.runtime.prod.js +1 -1
  84. qalita/_frontend/node_modules/next/dist/compiled/next-server/app-page-turbo.runtime.prod.js +1 -1
  85. qalita/_frontend/node_modules/next/dist/server/config-shared.js +4 -0
  86. qalita/_frontend/node_modules/next/dist/server/dev/hot-reloader-turbopack.js +1 -1
  87. qalita/_frontend/node_modules/next/dist/server/dev/hot-reloader-webpack.js +1 -1
  88. qalita/_frontend/node_modules/next/dist/server/lib/app-info-log.js +1 -1
  89. qalita/_frontend/node_modules/next/dist/server/lib/start-server.js +1 -1
  90. qalita/_frontend/node_modules/next/dist/server/web/adapter.js +1 -1
  91. qalita/_frontend/node_modules/next/dist/shared/lib/errors/canary-only-config-error.js +1 -1
  92. qalita/_frontend/node_modules/next/dist/telemetry/anonymous-meta.js +1 -1
  93. qalita/_frontend/node_modules/next/dist/telemetry/events/version.js +2 -2
  94. qalita/_frontend/node_modules/next/package.json +15 -15
  95. qalita/_frontend/package.json +4 -4
  96. qalita/_frontend/server.js +1 -1
  97. qalita/commands/source.py +166 -2
  98. qalita/commands/worker.py +3 -3
  99. qalita/commands/worker_grpc.py +113 -3
  100. qalita/grpc/client.py +260 -34
  101. qalita/grpc/protos/qalita.proto +26 -0
  102. qalita/grpc/protos/qalita_pb2.py +80 -76
  103. qalita/grpc/protos/qalita_pb2_grpc.py +1 -1
  104. qalita/internal/action_executor.py +1009 -0
  105. qalita/internal/utils.py +1 -1
  106. {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/METADATA +4 -3
  107. {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/RECORD +113 -111
  108. qalita/_frontend/.next/static/chunks/02a64570f0a14789.js +0 -1
  109. qalita/_frontend/.next/static/chunks/0b082245f106d665.js +0 -1
  110. qalita/_frontend/.next/static/chunks/27b3ba70c7ef50a8.js +0 -1
  111. qalita/_frontend/.next/static/chunks/517e9b74d1a3c0ce.js +0 -1
  112. qalita/_frontend/.next/static/chunks/58689c96b0676c41.js +0 -1
  113. qalita/_frontend/.next/static/chunks/6c99da4248e4fcfc.js +0 -1
  114. qalita/_frontend/.next/static/chunks/acc5da18ff20daa1.js +0 -3
  115. qalita/_frontend/.next/static/chunks/bdc8a8e7721f5675.js +0 -2
  116. qalita/_frontend/.next/static/chunks/e0df86cbf44bbf9f.js +0 -1
  117. qalita/_frontend/.next/static/chunks/e4c3a252774ab7fd.css +0 -1
  118. qalita/_frontend/.next/static/chunks/e6ce59ba40b863f2.js +0 -1
  119. qalita/_frontend/.next/static/chunks/ec4b1f1e3cd3ae43.js +0 -1
  120. qalita/_frontend/.next/static/chunks/turbopack-d21156d03715fafa.js +0 -4
  121. /qalita/_frontend/.next/static/{M1H4Lcjc6A78n9p1qVA6d → NJRrkC0Gn13ofbqb0Lb0C}/_buildManifest.js +0 -0
  122. /qalita/_frontend/.next/static/{M1H4Lcjc6A78n9p1qVA6d → NJRrkC0Gn13ofbqb0Lb0C}/_clientMiddlewareManifest.json +0 -0
  123. /qalita/_frontend/.next/static/{M1H4Lcjc6A78n9p1qVA6d → NJRrkC0Gn13ofbqb0Lb0C}/_ssgManifest.js +0 -0
  124. {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/WHEEL +0 -0
  125. {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/entry_points.txt +0 -0
  126. {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/licenses/LICENSE +0 -0
qalita/grpc/client.py CHANGED
@@ -26,6 +26,7 @@ class GrpcClient:
     - Keep-alive management
     - Bidirectional streaming support
     - Thread-safe connection state
+    - Stability detection before resetting reconnection counter
     """
 
     def __init__(
@@ -36,6 +37,7 @@ class GrpcClient:
         max_reconnect_attempts: int = 10,
         initial_reconnect_delay: float = 1.0,
         max_reconnect_delay: float = 60.0,
+        stability_threshold_seconds: float = 30.0,
     ):
         """
         Initialize the gRPC client.
@@ -47,6 +49,7 @@ class GrpcClient:
             max_reconnect_attempts: Maximum reconnection attempts (0 = unlimited)
             initial_reconnect_delay: Initial delay between reconnection attempts
             max_reconnect_delay: Maximum delay between reconnection attempts
+            stability_threshold_seconds: Time the connection must be stable before resetting attempts counter
         """
         self._url = url
         self._token = token
@@ -54,6 +57,7 @@ class GrpcClient:
         self._max_reconnect_attempts = max_reconnect_attempts
         self._initial_reconnect_delay = initial_reconnect_delay
         self._max_reconnect_delay = max_reconnect_delay
+        self._stability_threshold_seconds = stability_threshold_seconds
 
         # Connection state - set before parsing URL
         self._use_secure_channel = False
@@ -66,17 +70,27 @@ class GrpcClient:
         self._stub: Optional[qalita_pb2_grpc.WorkerServiceStub] = None
         self._connected = False
         self._reconnect_attempts = 0
+        self._current_reconnect_delay = initial_reconnect_delay
+        self._last_successful_stream_start: Optional[datetime] = None
+        self._stream_healthy = False
 
         # Stream state
         self._stream_call = None
         self._outgoing_queue: asyncio.Queue = asyncio.Queue()
         self._stream_active = False
 
+        # Stream health monitoring
+        self._last_message_received: Optional[datetime] = None
+        self._last_message_sent: Optional[datetime] = None
+        self._stream_health_timeout = 45.0  # Consider stream dead if no response in 45s
+        self._force_reconnect = False
+
         # Callbacks
         self._on_job_received: Optional[Callable] = None
         self._on_routine_received: Optional[Callable] = None
         self._on_data_preview_request: Optional[Callable] = None
         self._on_add_source_request: Optional[Callable] = None
+        self._on_agent_action_request: Optional[Callable] = None
         self._on_disconnect: Optional[Callable] = None
 
     def _parse_grpc_target(self, url: str) -> str:
@@ -105,8 +119,8 @@ class GrpcClient:
             self._use_secure_channel = False
             return f"{host}:50051"
 
-        # For production URLs (e.g., https://api.cloud.platform.qalita.io)
-        # Convert to gRPC endpoint (e.g., grpc.cloud.platform.qalita.io:443)
+        # For production URLs (e.g., https://api.app.platform.qalita.io)
+        # Convert to gRPC endpoint (e.g., grpc.app.platform.qalita.io:443)
         self._use_secure_channel = True
 
         # Replace 'api.' prefix with 'grpc.' if present
@@ -128,10 +142,22 @@ class GrpcClient:
         """
         Establish connection to the gRPC server.
 
+        Note: This method does NOT reset _reconnect_attempts. The counter is only
+        reset after the stream has been stable for _stability_threshold_seconds.
+
         Returns:
             True if connection successful, False otherwise
         """
         try:
+            # Close any existing channel first
+            if self._channel:
+                try:
+                    await self._channel.close()
+                except Exception:
+                    pass
+                self._channel = None
+                self._stub = None
+
             # Channel options for long-running streams
             channel_options = [
                 ('grpc.keepalive_time_ms', 30000),
@@ -141,6 +167,10 @@ class GrpcClient:
                 ('grpc.http2.max_pings_without_data', 0),
                 ('grpc.max_receive_message_length', 50 * 1024 * 1024),
                 ('grpc.max_send_message_length', 50 * 1024 * 1024),
+                # Additional options for better connection resilience
+                ('grpc.initial_reconnect_backoff_ms', 1000),
+                ('grpc.max_reconnect_backoff_ms', 60000),
+                ('grpc.enable_retries', 1),
             ]
 
             # Create channel - secure for production, insecure for local dev
@@ -159,7 +189,7 @@ class GrpcClient:
 
             self._stub = qalita_pb2_grpc.WorkerServiceStub(self._channel)
             self._connected = True
-            self._reconnect_attempts = 0
+            # Note: Do NOT reset _reconnect_attempts here - only reset after stable stream
 
             logger.info(f"Connected to gRPC server at {self._grpc_target}")
             return True
@@ -196,31 +226,102 @@ class GrpcClient:
         """
         Attempt to reconnect with exponential backoff.
 
+        The reconnection counter persists across reconnection cycles. It only resets
+        when the connection has been stable (stream healthy for _stability_threshold_seconds).
+
         Returns:
             True if reconnection successful, False if max attempts exceeded
         """
-        delay = self._initial_reconnect_delay
+        self._reconnect_attempts += 1
+        self._stream_healthy = False
+
+        # Check if max attempts exceeded
+        if self._max_reconnect_attempts > 0 and self._reconnect_attempts > self._max_reconnect_attempts:
+            logger.error(
+                f"Max reconnection attempts exceeded ({self._reconnect_attempts}/{self._max_reconnect_attempts}). "
+                f"Will continue trying with max backoff delay."
+            )
+            # Don't return False - keep trying but with max delay
+            # In production, we want the worker to eventually recover
 
-        while (self._max_reconnect_attempts == 0 or
-               self._reconnect_attempts < self._max_reconnect_attempts):
-
-            self._reconnect_attempts += 1
-            logger.warning(
-                f"Reconnection attempt {self._reconnect_attempts}"
-                f"{f'/{self._max_reconnect_attempts}' if self._max_reconnect_attempts > 0 else ''}"
+        logger.warning(
+            f"Reconnection attempt {self._reconnect_attempts}"
+            f"{f'/{self._max_reconnect_attempts}' if self._max_reconnect_attempts > 0 else ''} "
+            f"(delay: {self._current_reconnect_delay:.1f}s)"
+        )
+
+        # Wait before attempting reconnection (exponential backoff)
+        await asyncio.sleep(self._current_reconnect_delay)
+
+        # Attempt to connect
+        if await self.connect():
+            # Increase delay for next attempt (in case this stream also fails quickly)
+            self._current_reconnect_delay = min(
+                self._current_reconnect_delay * 2,
+                self._max_reconnect_delay
             )
-
-            await asyncio.sleep(delay)
-
-            if await self.connect():
-                return True
-
-            # Exponential backoff
-            delay = min(delay * 2, self._max_reconnect_delay)
+            return True
 
-        logger.error("Max reconnection attempts exceeded")
+        # Connection failed, increase delay for next attempt
+        self._current_reconnect_delay = min(
+            self._current_reconnect_delay * 2,
+            self._max_reconnect_delay
+        )
         return False
 
+    def _mark_stream_stable(self) -> None:
+        """
+        Mark the stream as stable and reset reconnection counters.
+
+        Called when the stream has been healthy for _stability_threshold_seconds.
+        """
+        if not self._stream_healthy:
+            logger.info("Stream connection is now stable - resetting reconnection counters")
+            self._stream_healthy = True
+            self._reconnect_attempts = 0
+            self._current_reconnect_delay = self._initial_reconnect_delay
+
+    async def _check_stream_health(self) -> None:
+        """
+        Check if the stream is actually working by comparing sent vs received timestamps.
+
+        If we've been sending messages but haven't received any response (ack or other)
+        for _stream_health_timeout seconds, the stream is probably dead and we should reconnect.
+        """
+        now = datetime.now(timezone.utc)
+
+        # Need both timestamps to make a comparison
+        if not self._last_message_sent:
+            return
+
+        # Calculate time since last message sent and received
+        time_since_sent = (now - self._last_message_sent).total_seconds()
+
+        if self._last_message_received:
+            time_since_received = (now - self._last_message_received).total_seconds()
+        else:
+            # Never received anything - use time since stream started
+            if self._last_successful_stream_start:
+                time_since_received = (now - self._last_successful_stream_start).total_seconds()
+            else:
+                return
+
+        # If we've been sending but not receiving for too long, stream is dead
+        if time_since_received > self._stream_health_timeout:
+            logger.warning(
+                f"Stream appears dead: last sent {time_since_sent:.1f}s ago, "
+                f"last received {time_since_received:.1f}s ago (timeout: {self._stream_health_timeout}s)"
+            )
+            logger.warning("Forcing reconnection due to unresponsive stream...")
+            self._force_reconnect = True
+
+            # Cancel the stream call to force the error path
+            if self._stream_call:
+                try:
+                    self._stream_call.cancel()
+                except Exception as e:
+                    logger.debug(f"Error cancelling stream for forced reconnect: {e}")
+
     # =========================================================================
     # Unary RPCs
     # =========================================================================
@@ -470,6 +571,10 @@ class GrpcClient:
         """Set callback for when an add source request is received via stream."""
         self._on_add_source_request = callback
 
+    def on_agent_action_request(self, callback: Callable[[qalita_pb2.AgentActionRequest], Any]) -> None:
+        """Set callback for when an agent action request is received via stream."""
+        self._on_agent_action_request = callback
+
     def on_disconnect(self, callback: Callable[[], Any]) -> None:
         """Set callback for when connection is lost."""
         self._on_disconnect = callback
@@ -609,15 +714,46 @@ class GrpcClient:
         msg = qalita_pb2.WorkerMessage(add_source_response=response)
         await self._outgoing_queue.put(msg)
 
+    async def send_agent_action_response(
+        self,
+        request_id: str,
+        ok: bool,
+        action_type: str,
+        error: Optional[str] = None,
+        result_json: Optional[str] = None,
+        data: Optional[qalita_pb2.DataPreviewResponse] = None,
+        execution_time_ms: Optional[int] = None,
+    ) -> None:
+        """Send an agent action response through the stream."""
+        response = qalita_pb2.AgentActionResponse(
+            request_id=request_id,
+            ok=ok,
+            action_type=action_type,
+        )
+
+        if error:
+            response.error = error
+        if result_json:
+            response.result_json = result_json
+        if data:
+            response.data.CopyFrom(data)
+        if execution_time_ms is not None:
+            response.execution_time_ms = execution_time_ms
+
+        msg = qalita_pb2.WorkerMessage(agent_action_response=response)
+        await self._outgoing_queue.put(msg)
+
     async def _outgoing_messages(self) -> AsyncIterator[qalita_pb2.WorkerMessage]:
         """Generator for outgoing stream messages."""
         logger.info("Outgoing messages generator started")
-        while self._stream_active:
+        while self._stream_active and not self._force_reconnect:
             try:
                 # Use get_nowait in a loop with sleep to avoid blocking gRPC
                 try:
                     msg = self._outgoing_queue.get_nowait()
-                    logger.debug(f"Yielding message type: {msg.WhichOneof('payload')}")
+                    msg_type = msg.WhichOneof('payload')
+                    logger.debug(f"Yielding message type: {msg_type}")
+                    self._last_message_sent = datetime.now(timezone.utc)
                     yield msg
                 except asyncio.QueueEmpty:
                     # No message available, yield control briefly
638
774
  - Keep-alive signals (sent every 10 seconds)
639
775
  - Incoming job assignments
640
776
  - Incoming routine triggers
641
- - Automatic reconnection on failure
777
+ - Automatic reconnection on failure with exponential backoff
778
+ - Stability detection to reset reconnection counters
779
+ - Dead stream detection (sending but not receiving)
642
780
  """
643
781
  if not self._connected:
644
782
  if not await self.connect():
@@ -647,14 +785,30 @@ class GrpcClient:
         # Recreate queue in async context to ensure proper event loop binding
         self._outgoing_queue = asyncio.Queue()
         self._stream_active = True
+        self._stream_healthy = False
+        self._last_successful_stream_start = None
+        self._last_message_received = None
+        self._last_message_sent = None
+        self._force_reconnect = False
 
         async def keep_alive_loop():
-            """Send keep-alive every 10 seconds."""
+            """Send keep-alive every 10 seconds and monitor stream health."""
            logger.info(f"Keep-alive loop started, worker_id={self._worker_id}")
-            while self._stream_active:
+            while self._stream_active and not self._force_reconnect:
                 try:
                     logger.debug(f"Sending keep-alive for worker {self._worker_id}")
                     await self.send_keep_alive()
+
+                    # Check if stream has been healthy long enough to reset counters
+                    if (self._last_successful_stream_start and
+                            not self._stream_healthy):
+                        elapsed = (datetime.now(timezone.utc) - self._last_successful_stream_start).total_seconds()
+                        if elapsed >= self._stability_threshold_seconds:
+                            self._mark_stream_stable()
+
+                    # Health check: detect dead stream (sending but not receiving)
+                    await self._check_stream_health()
+
                     await asyncio.sleep(10)
                 except asyncio.CancelledError:
                     logger.info("Keep-alive loop cancelled")
@@ -662,15 +816,28 @@ class GrpcClient:
                 except Exception as e:
                     logger.error(f"Keep-alive error: {e}")
 
-        async def process_stream():
-            """Process incoming stream messages."""
+        async def process_single_stream() -> bool:
+            """
+            Process incoming stream messages for one connection attempt.
+
+            Returns:
+                True if stream ended gracefully (should not reconnect)
+                False if stream had an error (should attempt reconnection)
+            """
             try:
                 self._stream_call = self._stub.Connect(
                     self._outgoing_messages(),
                     metadata=self.metadata,
                 )
 
+                # Mark the time when stream successfully started
+                self._last_successful_stream_start = datetime.now(timezone.utc)
+                logger.info("Stream established successfully")
+
                 async for msg in self._stream_call:
+                    # Each message received confirms the stream is working
+                    self._last_message_received = datetime.now(timezone.utc)
+
                     if msg.HasField('job_assignment'):
                         job = msg.job_assignment.job
                         logger.info(f"Received job assignment: {job.id}")
@@ -695,28 +862,87 @@ class GrpcClient:
                         if self._on_add_source_request:
                             await self._on_add_source_request(request)
 
+                    elif msg.HasField('agent_action_request'):
+                        request = msg.agent_action_request
+                        logger.info(f"Received agent action request: {request.request_id} type={request.action_type}")
+                        if self._on_agent_action_request:
+                            await self._on_agent_action_request(request)
+
                     elif msg.HasField('ack'):
                         logger.debug(f"Received ack: {msg.ack.message_type}")
+                        # Ack received means stream is working, check stability
+                        if (self._last_successful_stream_start and
+                                not self._stream_healthy):
+                            elapsed = (datetime.now(timezone.utc) - self._last_successful_stream_start).total_seconds()
+                            if elapsed >= self._stability_threshold_seconds:
+                                self._mark_stream_stable()
 
                     elif msg.HasField('error'):
                         logger.error(f"Server error: {msg.error.code} - {msg.error.message}")
+
+                # Stream ended normally (server closed it gracefully)
+                logger.info("Stream ended normally")
+                return False  # Still try to reconnect for continuous operation
 
             except grpc.aio.AioRpcError as e:
                 if e.code() == grpc.StatusCode.CANCELLED:
-                    logger.info("Stream cancelled")
+                    if self._force_reconnect:
+                        logger.info("Stream cancelled due to forced reconnect (dead stream detection)")
+                        return False  # Reconnect
+                    else:
+                        logger.info("Stream cancelled by client")
+                        return True  # Don't reconnect if we intentionally cancelled it
                 else:
-                    logger.error(f"Stream error: {e.code()} - {e.details()}")
-                    # Attempt reconnection
-                    if self._stream_active and await self._reconnect():
-                        await process_stream()
+                    # Calculate how long the stream was alive
+                    stream_duration = 0
+                    if self._last_successful_stream_start:
+                        stream_duration = (datetime.now(timezone.utc) - self._last_successful_stream_start).total_seconds()
+
+                    logger.error(
+                        f"Stream error after {stream_duration:.1f}s: {e.code()} - {e.details()}"
+                    )
+                    return False  # Should attempt reconnection
+
+            except Exception as e:
+                logger.error(f"Unexpected stream error: {e}")
+                return False  # Should attempt reconnection
 
-        # Run keep-alive and stream processing concurrently
+        # Main stream loop with reconnection handling
         keep_alive_task = asyncio.create_task(keep_alive_loop())
 
         try:
-            await process_stream()
+            while self._stream_active:
+                # Reset state before starting/restarting stream
+                self._force_reconnect = False
+                self._last_message_received = None
+                self._last_message_sent = None
+
+                # Process the stream
+                should_stop = await process_single_stream()
+
+                if should_stop or not self._stream_active:
+                    break
+
+                # Stream failed, attempt reconnection
+                self._last_successful_stream_start = None
+
+                # Recreate the outgoing queue to clear any stale messages
+                self._outgoing_queue = asyncio.Queue()
+
+                # Attempt reconnection (this handles backoff)
+                if not await self._reconnect():
+                    # _reconnect now always returns True after sleeping and connecting
+                    # It only returns False if connect() itself fails
+                    # In that case, keep trying
+                    logger.warning("Reconnection failed, will retry...")
+                    continue
+
+                # Reconnected successfully, loop will start a new stream
+                logger.info("Reconnected, restarting stream...")
+
         finally:
             self._stream_active = False
+            self._force_reconnect = True  # Stop the outgoing generator
             keep_alive_task.cancel()
             try:
                 await keep_alive_task
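
Taken together, the client.py changes above replace the one-shot reconnect loop with a persistent attempt counter, exponential backoff state, dead-stream detection, and a new agent-action round trip. The following is a minimal usage sketch based only on the signatures visible in this diff: the URL, token, handler body, and result payload are placeholders, the import path is assumed from the file layout, and any constructor arguments not shown in this diff are omitted. The long-running stream entry point itself is not reproduced here; it is driven by the worker command (see qalita/commands/worker_grpc.py in the file list above).

import json

from qalita.grpc.client import GrpcClient

client = GrpcClient(
    url="https://api.app.platform.qalita.io",  # rewritten to grpc.app.platform.qalita.io:443
    token="<api-token>",                       # placeholder
    max_reconnect_attempts=10,
    initial_reconnect_delay=1.0,
    max_reconnect_delay=60.0,
    stability_threshold_seconds=30.0,          # new in 2.10.0: how long the stream must stay
                                               # healthy before the attempt counter resets
)

async def handle_agent_action(request):
    # request is a qalita_pb2.AgentActionRequest delivered over the stream
    params = json.loads(request.parameters_json or "{}")
    # ... run the requested action against request.source_id (illustrative) ...
    await client.send_agent_action_response(
        request_id=request.request_id,              # lets the server correlate the reply
        ok=True,
        action_type=request.action_type,
        result_json=json.dumps({"row_count": 0}),   # illustrative payload
        execution_time_ms=12,
    )

client.on_agent_action_request(handle_agent_action)
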
qalita/grpc/protos/qalita.proto CHANGED
@@ -49,6 +49,7 @@ message WorkerMessage {
     JobLogLine log_line = 4;
     DataPreviewResponse data_preview_response = 5;
     AddSourceResponse add_source_response = 6;
+    AgentActionResponse agent_action_response = 7;
   }
 }
 
@@ -69,6 +70,7 @@ message ServerMessage {
     ServerError error = 4;
     DataPreviewRequest data_preview_request = 5;
     AddSourceRequest add_source_request = 6;
+    AgentActionRequest agent_action_request = 7;
   }
 }
 
@@ -389,3 +391,27 @@ message AddSourceResponse {
   optional int32 source_id = 4; // ID assigned by worker in local config
   bool connectivity_verified = 5; // Whether connection to source was verified
 }
+
+// =============================================================================
+// Agent Actions (Studio LLM -> Worker)
+// =============================================================================
+
+// Request from LLM agent to execute an action on a data source
+message AgentActionRequest {
+  string request_id = 1; // Unique ID to correlate request/response
+  string action_type = 2; // query, read_data, filter, aggregate, describe, sample
+  int32 source_id = 3; // Source to operate on
+  string parameters_json = 4; // Action parameters as JSON
+  optional int32 timeout_seconds = 5; // Optional timeout for the action
+}
+
+// Response from worker after executing an agent action
+message AgentActionResponse {
+  string request_id = 1; // Correlates with request
+  bool ok = 2; // Whether operation succeeded
+  string action_type = 3; // Echo back the action type
+  optional string error = 4; // Error message if ok=false
+  optional string result_json = 5; // Structured result as JSON (for metadata, stats)
+  optional DataPreviewResponse data = 6; // Tabular data result if applicable
+  optional int64 execution_time_ms = 7; // How long the action took
+}
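
The two new messages mirror the existing request/response pairs on the stream: the server wraps an AgentActionRequest in a ServerMessage, and the worker answers with an AgentActionResponse carrying the same request_id inside a WorkerMessage. Below is a small sketch with the generated bindings; the field values are illustrative and the import path is assumed from the package layout.

import json

from qalita.grpc.protos import qalita_pb2

# Server -> worker: ask for a sample of source 3 (illustrative values)
request = qalita_pb2.AgentActionRequest(
    request_id="req-123",
    action_type="sample",
    source_id=3,
    parameters_json=json.dumps({"limit": 100}),
    timeout_seconds=30,
)
server_msg = qalita_pb2.ServerMessage(agent_action_request=request)

# Worker -> server: the reply reuses request_id so the caller can correlate it
response = qalita_pb2.AgentActionResponse(
    request_id=request.request_id,
    ok=True,
    action_type=request.action_type,
    result_json=json.dumps({"row_count": 100}),
    execution_time_ms=42,
)
worker_msg = qalita_pb2.WorkerMessage(agent_action_response=response)

assert worker_msg.WhichOneof("payload") == "agent_action_response"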