qalita-2.9.1-py3-none-any.whl → qalita-2.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. qalita/_frontend/.next/BUILD_ID +1 -1
  2. qalita/_frontend/.next/build-manifest.json +7 -7
  3. qalita/_frontend/.next/prerender-manifest.json +3 -3
  4. qalita/_frontend/.next/required-server-files.json +196 -40
  5. qalita/_frontend/.next/server/app/_global-error/page/build-manifest.json +5 -5
  6. qalita/_frontend/.next/server/app/_global-error/page_client-reference-manifest.js +1 -1
  7. qalita/_frontend/.next/server/app/_global-error.html +2 -2
  8. qalita/_frontend/.next/server/app/_global-error.rsc +7 -7
  9. qalita/_frontend/.next/server/app/_global-error.segments/__PAGE__.segment.rsc +2 -2
  10. qalita/_frontend/.next/server/app/_global-error.segments/_full.segment.rsc +7 -7
  11. qalita/_frontend/.next/server/app/_global-error.segments/_head.segment.rsc +3 -3
  12. qalita/_frontend/.next/server/app/_global-error.segments/_index.segment.rsc +3 -3
  13. qalita/_frontend/.next/server/app/_global-error.segments/_tree.segment.rsc +1 -1
  14. qalita/_frontend/.next/server/app/_not-found/page/build-manifest.json +5 -5
  15. qalita/_frontend/.next/server/app/_not-found/page_client-reference-manifest.js +1 -1
  16. qalita/_frontend/.next/server/app/_not-found.html +1 -1
  17. qalita/_frontend/.next/server/app/_not-found.rsc +9 -9
  18. qalita/_frontend/.next/server/app/_not-found.segments/_full.segment.rsc +9 -9
  19. qalita/_frontend/.next/server/app/_not-found.segments/_head.segment.rsc +3 -3
  20. qalita/_frontend/.next/server/app/_not-found.segments/_index.segment.rsc +5 -5
  21. qalita/_frontend/.next/server/app/_not-found.segments/_not-found/__PAGE__.segment.rsc +2 -2
  22. qalita/_frontend/.next/server/app/_not-found.segments/_not-found.segment.rsc +3 -3
  23. qalita/_frontend/.next/server/app/_not-found.segments/_tree.segment.rsc +2 -2
  24. qalita/_frontend/.next/server/app/page/build-manifest.json +5 -5
  25. qalita/_frontend/.next/server/app/page_client-reference-manifest.js +1 -1
  26. qalita/_frontend/.next/server/app/sources/add/page/build-manifest.json +5 -5
  27. qalita/_frontend/.next/server/app/sources/add/page_client-reference-manifest.js +1 -1
  28. qalita/_frontend/.next/server/app/sources/add.html +1 -1
  29. qalita/_frontend/.next/server/app/sources/add.rsc +11 -11
  30. qalita/_frontend/.next/server/app/sources/add.segments/_full.segment.rsc +11 -11
  31. qalita/_frontend/.next/server/app/sources/add.segments/_head.segment.rsc +3 -3
  32. qalita/_frontend/.next/server/app/sources/add.segments/_index.segment.rsc +5 -5
  33. qalita/_frontend/.next/server/app/sources/add.segments/_tree.segment.rsc +2 -2
  34. qalita/_frontend/.next/server/app/sources/add.segments/sources/add/__PAGE__.segment.rsc +4 -4
  35. qalita/_frontend/.next/server/app/sources/add.segments/sources/add.segment.rsc +3 -3
  36. qalita/_frontend/.next/server/app/sources/add.segments/sources.segment.rsc +3 -3
  37. qalita/_frontend/.next/server/app/sources/edit/[id]/page/build-manifest.json +5 -5
  38. qalita/_frontend/.next/server/app/sources/edit/[id]/page_client-reference-manifest.js +1 -1
  39. qalita/_frontend/.next/server/app/sources/page/build-manifest.json +5 -5
  40. qalita/_frontend/.next/server/app/sources/page_client-reference-manifest.js +1 -1
  41. qalita/_frontend/.next/server/app/sources.html +1 -1
  42. qalita/_frontend/.next/server/app/sources.rsc +11 -11
  43. qalita/_frontend/.next/server/app/sources.segments/_full.segment.rsc +11 -11
  44. qalita/_frontend/.next/server/app/sources.segments/_head.segment.rsc +3 -3
  45. qalita/_frontend/.next/server/app/sources.segments/_index.segment.rsc +5 -5
  46. qalita/_frontend/.next/server/app/sources.segments/_tree.segment.rsc +2 -2
  47. qalita/_frontend/.next/server/app/sources.segments/sources/__PAGE__.segment.rsc +4 -4
  48. qalita/_frontend/.next/server/app/sources.segments/sources.segment.rsc +3 -3
  49. qalita/_frontend/.next/server/chunks/[root-of-the-server]__bf0c3d33._.js +3 -3
  50. qalita/_frontend/.next/server/chunks/[root-of-the-server]__f408c708._.js +2 -2
  51. qalita/_frontend/.next/server/chunks/ssr/[root-of-the-server]__be91267c._.js +1 -1
  52. qalita/_frontend/.next/server/chunks/ssr/_404f6e81._.js +2 -2
  53. qalita/_frontend/.next/server/chunks/ssr/_6a67f6f0._.js +2 -2
  54. qalita/_frontend/.next/server/chunks/ssr/_cb7b44d6._.js +1 -1
  55. qalita/_frontend/.next/server/chunks/ssr/_d44c43ed._.js +1 -1
  56. qalita/_frontend/.next/server/chunks/ssr/components_DashboardContent_tsx_c3635665._.js +1 -1
  57. qalita/_frontend/.next/server/chunks/ssr/node_modules_next_dist_4b9a0874._.js +1 -1
  58. qalita/_frontend/.next/server/middleware-build-manifest.js +5 -5
  59. qalita/_frontend/.next/server/pages/404.html +1 -1
  60. qalita/_frontend/.next/server/pages/500.html +2 -2
  61. qalita/_frontend/.next/server/server-reference-manifest.js +1 -1
  62. qalita/_frontend/.next/server/server-reference-manifest.json +1 -1
  63. qalita/_frontend/.next/static/chunks/0c7542414b6a6f86.js +2 -0
  64. qalita/_frontend/.next/static/chunks/{89ba62a8ba9b79ce.js → 12daa96885968840.js} +1 -1
  65. qalita/_frontend/.next/static/chunks/1e6a98e93c470083.css +1 -0
  66. qalita/_frontend/.next/static/chunks/499b7099996cc9f9.js +1 -0
  67. qalita/_frontend/.next/static/chunks/694836347d1e5ef3.js +1 -0
  68. qalita/_frontend/.next/static/chunks/7ea91ca84dc4b3a4.js +1 -0
  69. qalita/_frontend/.next/static/chunks/89c689b5748e28ed.js +1 -0
  70. qalita/_frontend/.next/static/chunks/9e71bf77f23416e6.js +1 -0
  71. qalita/_frontend/.next/static/chunks/aa2a44cc19d89bdb.js +1 -0
  72. qalita/_frontend/.next/static/chunks/ba22289f779d638e.js +1 -0
  73. qalita/_frontend/.next/static/chunks/bb05964d928aa166.js +3 -0
  74. qalita/_frontend/.next/static/chunks/dde1c328f398837e.js +1 -0
  75. qalita/_frontend/.next/static/chunks/ecbb64dc112ad516.js +1 -0
  76. qalita/_frontend/.next/static/chunks/facd124df217e016.js +1 -0
  77. qalita/_frontend/.next/static/chunks/turbopack-9fc8bcb3a9806c66.js +4 -0
  78. qalita/_frontend/node_modules/@next/env/package.json +1 -1
  79. qalita/_frontend/node_modules/next/dist/build/index.js +10 -4
  80. qalita/_frontend/node_modules/next/dist/build/swc/index.js +1 -1
  81. qalita/_frontend/node_modules/next/dist/build/webpack-config.js +3 -3
  82. qalita/_frontend/node_modules/next/dist/client/components/segment-cache/lru.js +2 -0
  83. qalita/_frontend/node_modules/next/dist/compiled/next-server/app-page-turbo-experimental.runtime.prod.js +1 -1
  84. qalita/_frontend/node_modules/next/dist/compiled/next-server/app-page-turbo.runtime.prod.js +1 -1
  85. qalita/_frontend/node_modules/next/dist/server/config-shared.js +4 -0
  86. qalita/_frontend/node_modules/next/dist/server/dev/hot-reloader-turbopack.js +1 -1
  87. qalita/_frontend/node_modules/next/dist/server/dev/hot-reloader-webpack.js +1 -1
  88. qalita/_frontend/node_modules/next/dist/server/lib/app-info-log.js +1 -1
  89. qalita/_frontend/node_modules/next/dist/server/lib/start-server.js +1 -1
  90. qalita/_frontend/node_modules/next/dist/server/web/adapter.js +1 -1
  91. qalita/_frontend/node_modules/next/dist/shared/lib/errors/canary-only-config-error.js +1 -1
  92. qalita/_frontend/node_modules/next/dist/telemetry/anonymous-meta.js +1 -1
  93. qalita/_frontend/node_modules/next/dist/telemetry/events/version.js +2 -2
  94. qalita/_frontend/node_modules/next/package.json +15 -15
  95. qalita/_frontend/package.json +4 -4
  96. qalita/_frontend/server.js +1 -1
  97. qalita/commands/source.py +166 -2
  98. qalita/commands/worker.py +3 -3
  99. qalita/commands/worker_grpc.py +113 -3
  100. qalita/grpc/client.py +260 -34
  101. qalita/grpc/protos/qalita.proto +26 -0
  102. qalita/grpc/protos/qalita_pb2.py +80 -76
  103. qalita/grpc/protos/qalita_pb2_grpc.py +1 -1
  104. qalita/internal/action_executor.py +1009 -0
  105. qalita/internal/utils.py +1 -1
  106. {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/METADATA +4 -3
  107. {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/RECORD +113 -111
  108. qalita/_frontend/.next/static/chunks/02a64570f0a14789.js +0 -1
  109. qalita/_frontend/.next/static/chunks/0b082245f106d665.js +0 -1
  110. qalita/_frontend/.next/static/chunks/27b3ba70c7ef50a8.js +0 -1
  111. qalita/_frontend/.next/static/chunks/517e9b74d1a3c0ce.js +0 -1
  112. qalita/_frontend/.next/static/chunks/58689c96b0676c41.js +0 -1
  113. qalita/_frontend/.next/static/chunks/6c99da4248e4fcfc.js +0 -1
  114. qalita/_frontend/.next/static/chunks/acc5da18ff20daa1.js +0 -3
  115. qalita/_frontend/.next/static/chunks/bdc8a8e7721f5675.js +0 -2
  116. qalita/_frontend/.next/static/chunks/e0df86cbf44bbf9f.js +0 -1
  117. qalita/_frontend/.next/static/chunks/e4c3a252774ab7fd.css +0 -1
  118. qalita/_frontend/.next/static/chunks/e6ce59ba40b863f2.js +0 -1
  119. qalita/_frontend/.next/static/chunks/ec4b1f1e3cd3ae43.js +0 -1
  120. qalita/_frontend/.next/static/chunks/turbopack-d21156d03715fafa.js +0 -4
  121. /qalita/_frontend/.next/static/{M1H4Lcjc6A78n9p1qVA6d → NJRrkC0Gn13ofbqb0Lb0C}/_buildManifest.js +0 -0
  122. /qalita/_frontend/.next/static/{M1H4Lcjc6A78n9p1qVA6d → NJRrkC0Gn13ofbqb0Lb0C}/_clientMiddlewareManifest.json +0 -0
  123. /qalita/_frontend/.next/static/{M1H4Lcjc6A78n9p1qVA6d → NJRrkC0Gn13ofbqb0Lb0C}/_ssgManifest.js +0 -0
  124. {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/WHEEL +0 -0
  125. {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/entry_points.txt +0 -0
  126. {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/licenses/LICENSE +0 -0
qalita/grpc/client.py CHANGED
@@ -26,6 +26,7 @@ class GrpcClient:
     - Keep-alive management
     - Bidirectional streaming support
     - Thread-safe connection state
+    - Stability detection before resetting reconnection counter
     """
 
     def __init__(
@@ -36,6 +37,7 @@ class GrpcClient:
         max_reconnect_attempts: int = 10,
         initial_reconnect_delay: float = 1.0,
         max_reconnect_delay: float = 60.0,
+        stability_threshold_seconds: float = 30.0,
     ):
         """
         Initialize the gRPC client.
@@ -47,6 +49,7 @@ class GrpcClient:
             max_reconnect_attempts: Maximum reconnection attempts (0 = unlimited)
             initial_reconnect_delay: Initial delay between reconnection attempts
             max_reconnect_delay: Maximum delay between reconnection attempts
+            stability_threshold_seconds: Time the connection must be stable before resetting attempts counter
         """
         self._url = url
         self._token = token
@@ -54,6 +57,7 @@ class GrpcClient:
         self._max_reconnect_attempts = max_reconnect_attempts
         self._initial_reconnect_delay = initial_reconnect_delay
         self._max_reconnect_delay = max_reconnect_delay
+        self._stability_threshold_seconds = stability_threshold_seconds
 
         # Connection state - set before parsing URL
         self._use_secure_channel = False
@@ -66,17 +70,27 @@ class GrpcClient:
         self._stub: Optional[qalita_pb2_grpc.WorkerServiceStub] = None
         self._connected = False
         self._reconnect_attempts = 0
+        self._current_reconnect_delay = initial_reconnect_delay
+        self._last_successful_stream_start: Optional[datetime] = None
+        self._stream_healthy = False
 
         # Stream state
         self._stream_call = None
         self._outgoing_queue: asyncio.Queue = asyncio.Queue()
         self._stream_active = False
 
+        # Stream health monitoring
+        self._last_message_received: Optional[datetime] = None
+        self._last_message_sent: Optional[datetime] = None
+        self._stream_health_timeout = 45.0  # Consider stream dead if no response in 45s
+        self._force_reconnect = False
+
         # Callbacks
         self._on_job_received: Optional[Callable] = None
         self._on_routine_received: Optional[Callable] = None
         self._on_data_preview_request: Optional[Callable] = None
         self._on_add_source_request: Optional[Callable] = None
+        self._on_agent_action_request: Optional[Callable] = None
         self._on_disconnect: Optional[Callable] = None
 
     def _parse_grpc_target(self, url: str) -> str:
@@ -105,8 +119,8 @@ class GrpcClient:
             self._use_secure_channel = False
             return f"{host}:50051"
 
-        # For production URLs (e.g., https://api.cloud.platform.qalita.io)
-        # Convert to gRPC endpoint (e.g., grpc.cloud.platform.qalita.io:443)
+        # For production URLs (e.g., https://api.app.platform.qalita.io)
+        # Convert to gRPC endpoint (e.g., grpc.app.platform.qalita.io:443)
         self._use_secure_channel = True
 
         # Replace 'api.' prefix with 'grpc.' if present
@@ -128,10 +142,22 @@ class GrpcClient:
         """
         Establish connection to the gRPC server.
 
+        Note: This method does NOT reset _reconnect_attempts. The counter is only
+        reset after the stream has been stable for _stability_threshold_seconds.
+
         Returns:
             True if connection successful, False otherwise
         """
         try:
+            # Close any existing channel first
+            if self._channel:
+                try:
+                    await self._channel.close()
+                except Exception:
+                    pass
+                self._channel = None
+                self._stub = None
+
             # Channel options for long-running streams
             channel_options = [
                 ('grpc.keepalive_time_ms', 30000),
@@ -141,6 +167,10 @@ class GrpcClient:
                 ('grpc.http2.max_pings_without_data', 0),
                 ('grpc.max_receive_message_length', 50 * 1024 * 1024),
                 ('grpc.max_send_message_length', 50 * 1024 * 1024),
+                # Additional options for better connection resilience
+                ('grpc.initial_reconnect_backoff_ms', 1000),
+                ('grpc.max_reconnect_backoff_ms', 60000),
+                ('grpc.enable_retries', 1),
             ]
 
             # Create channel - secure for production, insecure for local dev
@@ -159,7 +189,7 @@ class GrpcClient:
 
             self._stub = qalita_pb2_grpc.WorkerServiceStub(self._channel)
             self._connected = True
-            self._reconnect_attempts = 0
+            # Note: Do NOT reset _reconnect_attempts here - only reset after stable stream
 
             logger.info(f"Connected to gRPC server at {self._grpc_target}")
             return True
@@ -196,31 +226,102 @@ class GrpcClient:
         """
         Attempt to reconnect with exponential backoff.
 
+        The reconnection counter persists across reconnection cycles. It only resets
+        when the connection has been stable (stream healthy for _stability_threshold_seconds).
+
         Returns:
             True if reconnection successful, False if max attempts exceeded
         """
-        delay = self._initial_reconnect_delay
+        self._reconnect_attempts += 1
+        self._stream_healthy = False
+
+        # Check if max attempts exceeded
+        if self._max_reconnect_attempts > 0 and self._reconnect_attempts > self._max_reconnect_attempts:
+            logger.error(
+                f"Max reconnection attempts exceeded ({self._reconnect_attempts}/{self._max_reconnect_attempts}). "
+                f"Will continue trying with max backoff delay."
+            )
+            # Don't return False - keep trying but with max delay
+            # In production, we want the worker to eventually recover
 
-        while (self._max_reconnect_attempts == 0 or
-               self._reconnect_attempts < self._max_reconnect_attempts):
-
-            self._reconnect_attempts += 1
-            logger.warning(
-                f"Reconnection attempt {self._reconnect_attempts}"
-                f"{f'/{self._max_reconnect_attempts}' if self._max_reconnect_attempts > 0 else ''}"
+        logger.warning(
+            f"Reconnection attempt {self._reconnect_attempts}"
+            f"{f'/{self._max_reconnect_attempts}' if self._max_reconnect_attempts > 0 else ''} "
+            f"(delay: {self._current_reconnect_delay:.1f}s)"
+        )
+
+        # Wait before attempting reconnection (exponential backoff)
+        await asyncio.sleep(self._current_reconnect_delay)
+
+        # Attempt to connect
+        if await self.connect():
+            # Increase delay for next attempt (in case this stream also fails quickly)
+            self._current_reconnect_delay = min(
+                self._current_reconnect_delay * 2,
+                self._max_reconnect_delay
             )
-
-            await asyncio.sleep(delay)
-
-            if await self.connect():
-                return True
-
-            # Exponential backoff
-            delay = min(delay * 2, self._max_reconnect_delay)
+            return True
 
-        logger.error("Max reconnection attempts exceeded")
+        # Connection failed, increase delay for next attempt
+        self._current_reconnect_delay = min(
+            self._current_reconnect_delay * 2,
+            self._max_reconnect_delay
+        )
         return False
 
+    def _mark_stream_stable(self) -> None:
+        """
+        Mark the stream as stable and reset reconnection counters.
+
+        Called when the stream has been healthy for _stability_threshold_seconds.
+        """
+        if not self._stream_healthy:
+            logger.info("Stream connection is now stable - resetting reconnection counters")
+            self._stream_healthy = True
+            self._reconnect_attempts = 0
+            self._current_reconnect_delay = self._initial_reconnect_delay
+
+    async def _check_stream_health(self) -> None:
+        """
+        Check if the stream is actually working by comparing sent vs received timestamps.
+
+        If we've been sending messages but haven't received any response (ack or other)
+        for _stream_health_timeout seconds, the stream is probably dead and we should reconnect.
+        """
+        now = datetime.now(timezone.utc)
+
+        # Need both timestamps to make a comparison
+        if not self._last_message_sent:
+            return
+
+        # Calculate time since last message sent and received
+        time_since_sent = (now - self._last_message_sent).total_seconds()
+
+        if self._last_message_received:
+            time_since_received = (now - self._last_message_received).total_seconds()
+        else:
+            # Never received anything - use time since stream started
+            if self._last_successful_stream_start:
+                time_since_received = (now - self._last_successful_stream_start).total_seconds()
+            else:
+                return
+
+        # If we've been sending but not receiving for too long, stream is dead
+        if time_since_received > self._stream_health_timeout:
+            logger.warning(
+                f"Stream appears dead: last sent {time_since_sent:.1f}s ago, "
+                f"last received {time_since_received:.1f}s ago (timeout: {self._stream_health_timeout}s)"
+            )
+            logger.warning("Forcing reconnection due to unresponsive stream...")
+            self._force_reconnect = True
+
+            # Cancel the stream call to force the error path
+            if self._stream_call:
+                try:
+                    self._stream_call.cancel()
+                except Exception as e:
+                    logger.debug(f"Error cancelling stream for forced reconnect: {e}")
+
     # =========================================================================
     # Unary RPCs
     # =========================================================================
@@ -470,6 +571,10 @@ class GrpcClient:
         """Set callback for when an add source request is received via stream."""
         self._on_add_source_request = callback
 
+    def on_agent_action_request(self, callback: Callable[[qalita_pb2.AgentActionRequest], Any]) -> None:
+        """Set callback for when an agent action request is received via stream."""
+        self._on_agent_action_request = callback
+
     def on_disconnect(self, callback: Callable[[], Any]) -> None:
         """Set callback for when connection is lost."""
         self._on_disconnect = callback
@@ -609,15 +714,46 @@ class GrpcClient:
         msg = qalita_pb2.WorkerMessage(add_source_response=response)
         await self._outgoing_queue.put(msg)
 
+    async def send_agent_action_response(
+        self,
+        request_id: str,
+        ok: bool,
+        action_type: str,
+        error: Optional[str] = None,
+        result_json: Optional[str] = None,
+        data: Optional[qalita_pb2.DataPreviewResponse] = None,
+        execution_time_ms: Optional[int] = None,
+    ) -> None:
+        """Send an agent action response through the stream."""
+        response = qalita_pb2.AgentActionResponse(
+            request_id=request_id,
+            ok=ok,
+            action_type=action_type,
+        )
+
+        if error:
+            response.error = error
+        if result_json:
+            response.result_json = result_json
+        if data:
+            response.data.CopyFrom(data)
+        if execution_time_ms is not None:
+            response.execution_time_ms = execution_time_ms
+
+        msg = qalita_pb2.WorkerMessage(agent_action_response=response)
+        await self._outgoing_queue.put(msg)
+
     async def _outgoing_messages(self) -> AsyncIterator[qalita_pb2.WorkerMessage]:
         """Generator for outgoing stream messages."""
         logger.info("Outgoing messages generator started")
-        while self._stream_active:
+        while self._stream_active and not self._force_reconnect:
             try:
                 # Use get_nowait in a loop with sleep to avoid blocking gRPC
                 try:
                     msg = self._outgoing_queue.get_nowait()
-                    logger.debug(f"Yielding message type: {msg.WhichOneof('payload')}")
+                    msg_type = msg.WhichOneof('payload')
+                    logger.debug(f"Yielding message type: {msg_type}")
+                    self._last_message_sent = datetime.now(timezone.utc)
                     yield msg
                 except asyncio.QueueEmpty:
                     # No message available, yield control briefly
638
774
  - Keep-alive signals (sent every 10 seconds)
639
775
  - Incoming job assignments
640
776
  - Incoming routine triggers
641
- - Automatic reconnection on failure
777
+ - Automatic reconnection on failure with exponential backoff
778
+ - Stability detection to reset reconnection counters
779
+ - Dead stream detection (sending but not receiving)
642
780
  """
643
781
  if not self._connected:
644
782
  if not await self.connect():
@@ -647,14 +785,30 @@ class GrpcClient:
         # Recreate queue in async context to ensure proper event loop binding
         self._outgoing_queue = asyncio.Queue()
         self._stream_active = True
+        self._stream_healthy = False
+        self._last_successful_stream_start = None
+        self._last_message_received = None
+        self._last_message_sent = None
+        self._force_reconnect = False
 
         async def keep_alive_loop():
-            """Send keep-alive every 10 seconds."""
+            """Send keep-alive every 10 seconds and monitor stream health."""
            logger.info(f"Keep-alive loop started, worker_id={self._worker_id}")
-            while self._stream_active:
+            while self._stream_active and not self._force_reconnect:
                 try:
                     logger.debug(f"Sending keep-alive for worker {self._worker_id}")
                     await self.send_keep_alive()
+
+                    # Check if stream has been healthy long enough to reset counters
+                    if (self._last_successful_stream_start and
+                            not self._stream_healthy):
+                        elapsed = (datetime.now(timezone.utc) - self._last_successful_stream_start).total_seconds()
+                        if elapsed >= self._stability_threshold_seconds:
+                            self._mark_stream_stable()
+
+                    # Health check: detect dead stream (sending but not receiving)
+                    await self._check_stream_health()
+
                     await asyncio.sleep(10)
                 except asyncio.CancelledError:
                     logger.info("Keep-alive loop cancelled")
@@ -662,15 +816,28 @@ class GrpcClient:
                 except Exception as e:
                     logger.error(f"Keep-alive error: {e}")
 
-        async def process_stream():
-            """Process incoming stream messages."""
+        async def process_single_stream() -> bool:
+            """
+            Process incoming stream messages for one connection attempt.
+
+            Returns:
+                True if stream ended gracefully (should not reconnect)
+                False if stream had an error (should attempt reconnection)
+            """
             try:
                 self._stream_call = self._stub.Connect(
                     self._outgoing_messages(),
                     metadata=self.metadata,
                 )
 
+                # Mark the time when stream successfully started
+                self._last_successful_stream_start = datetime.now(timezone.utc)
+                logger.info("Stream established successfully")
+
                 async for msg in self._stream_call:
+                    # Each message received confirms the stream is working
+                    self._last_message_received = datetime.now(timezone.utc)
+
                     if msg.HasField('job_assignment'):
                         job = msg.job_assignment.job
                         logger.info(f"Received job assignment: {job.id}")
@@ -695,28 +862,87 @@ class GrpcClient:
                         if self._on_add_source_request:
                             await self._on_add_source_request(request)
 
+                    elif msg.HasField('agent_action_request'):
+                        request = msg.agent_action_request
+                        logger.info(f"Received agent action request: {request.request_id} type={request.action_type}")
+                        if self._on_agent_action_request:
+                            await self._on_agent_action_request(request)
+
                     elif msg.HasField('ack'):
                         logger.debug(f"Received ack: {msg.ack.message_type}")
+                        # Ack received means stream is working, check stability
+                        if (self._last_successful_stream_start and
+                                not self._stream_healthy):
+                            elapsed = (datetime.now(timezone.utc) - self._last_successful_stream_start).total_seconds()
+                            if elapsed >= self._stability_threshold_seconds:
+                                self._mark_stream_stable()
 
                     elif msg.HasField('error'):
                         logger.error(f"Server error: {msg.error.code} - {msg.error.message}")
+
+                # Stream ended normally (server closed it gracefully)
+                logger.info("Stream ended normally")
+                return False  # Still try to reconnect for continuous operation
 
             except grpc.aio.AioRpcError as e:
                 if e.code() == grpc.StatusCode.CANCELLED:
-                    logger.info("Stream cancelled")
+                    if self._force_reconnect:
+                        logger.info("Stream cancelled due to forced reconnect (dead stream detection)")
+                        return False  # Reconnect
+                    else:
+                        logger.info("Stream cancelled by client")
+                        return True  # Don't reconnect if we intentionally cancelled it
                 else:
-                    logger.error(f"Stream error: {e.code()} - {e.details()}")
-                    # Attempt reconnection
-                    if self._stream_active and await self._reconnect():
-                        await process_stream()
+                    # Calculate how long the stream was alive
+                    stream_duration = 0
+                    if self._last_successful_stream_start:
+                        stream_duration = (datetime.now(timezone.utc) - self._last_successful_stream_start).total_seconds()
+
+                    logger.error(
+                        f"Stream error after {stream_duration:.1f}s: {e.code()} - {e.details()}"
+                    )
+                    return False  # Should attempt reconnection
+
+            except Exception as e:
+                logger.error(f"Unexpected stream error: {e}")
+                return False  # Should attempt reconnection
 
-        # Run keep-alive and stream processing concurrently
+        # Main stream loop with reconnection handling
         keep_alive_task = asyncio.create_task(keep_alive_loop())
 
         try:
-            await process_stream()
+            while self._stream_active:
+                # Reset state before starting/restarting stream
+                self._force_reconnect = False
+                self._last_message_received = None
+                self._last_message_sent = None
+
+                # Process the stream
+                should_stop = await process_single_stream()
+
+                if should_stop or not self._stream_active:
+                    break
+
+                # Stream failed, attempt reconnection
+                self._last_successful_stream_start = None
+
+                # Recreate the outgoing queue to clear any stale messages
+                self._outgoing_queue = asyncio.Queue()
+
+                # Attempt reconnection (this handles backoff)
+                if not await self._reconnect():
+                    # _reconnect now always returns True after sleeping and connecting
+                    # It only returns False if connect() itself fails
+                    # In that case, keep trying
+                    logger.warning("Reconnection failed, will retry...")
+                    continue
+
+                # Reconnected successfully, loop will start a new stream
+                logger.info("Reconnected, restarting stream...")
+
         finally:
             self._stream_active = False
+            self._force_reconnect = True  # Stop the outgoing generator
             keep_alive_task.cancel()
             try:
                 await keep_alive_task
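
Taken together, the client.py changes above replace the one-shot reconnect loop with a persistent attempt counter, exponential backoff state, dead-stream detection, and a new agent-action round trip. The following is a minimal usage sketch based only on the signatures visible in this diff: the URL, token, handler body, and result payload are placeholders, the import path is assumed from the file layout, and any constructor arguments not shown in this diff are omitted. The long-running stream entry point itself is not reproduced here; it is driven by the worker command (see qalita/commands/worker_grpc.py in the file list above).

import json

from qalita.grpc.client import GrpcClient

client = GrpcClient(
    url="https://api.app.platform.qalita.io",  # rewritten to grpc.app.platform.qalita.io:443
    token="<api-token>",                       # placeholder
    max_reconnect_attempts=10,
    initial_reconnect_delay=1.0,
    max_reconnect_delay=60.0,
    stability_threshold_seconds=30.0,          # new in 2.10.0: how long the stream must stay
                                               # healthy before the attempt counter resets
)

async def handle_agent_action(request):
    # request is a qalita_pb2.AgentActionRequest delivered over the stream
    params = json.loads(request.parameters_json or "{}")
    # ... run the requested action against request.source_id (illustrative) ...
    await client.send_agent_action_response(
        request_id=request.request_id,              # lets the server correlate the reply
        ok=True,
        action_type=request.action_type,
        result_json=json.dumps({"row_count": 0}),   # illustrative payload
        execution_time_ms=12,
    )

client.on_agent_action_request(handle_agent_action)
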
qalita/grpc/protos/qalita.proto CHANGED
@@ -49,6 +49,7 @@ message WorkerMessage {
     JobLogLine log_line = 4;
     DataPreviewResponse data_preview_response = 5;
     AddSourceResponse add_source_response = 6;
+    AgentActionResponse agent_action_response = 7;
   }
 }
 
@@ -69,6 +70,7 @@ message ServerMessage {
     ServerError error = 4;
     DataPreviewRequest data_preview_request = 5;
     AddSourceRequest add_source_request = 6;
+    AgentActionRequest agent_action_request = 7;
   }
 }
 
@@ -389,3 +391,27 @@ message AddSourceResponse {
   optional int32 source_id = 4; // ID assigned by worker in local config
   bool connectivity_verified = 5; // Whether connection to source was verified
 }
+
+// =============================================================================
+// Agent Actions (Studio LLM -> Worker)
+// =============================================================================
+
+// Request from LLM agent to execute an action on a data source
+message AgentActionRequest {
+  string request_id = 1; // Unique ID to correlate request/response
+  string action_type = 2; // query, read_data, filter, aggregate, describe, sample
+  int32 source_id = 3; // Source to operate on
+  string parameters_json = 4; // Action parameters as JSON
+  optional int32 timeout_seconds = 5; // Optional timeout for the action
+}
+
+// Response from worker after executing an agent action
+message AgentActionResponse {
+  string request_id = 1; // Correlates with request
+  bool ok = 2; // Whether operation succeeded
+  string action_type = 3; // Echo back the action type
+  optional string error = 4; // Error message if ok=false
+  optional string result_json = 5; // Structured result as JSON (for metadata, stats)
+  optional DataPreviewResponse data = 6; // Tabular data result if applicable
+  optional int64 execution_time_ms = 7; // How long the action took
+}
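
The two new messages mirror the existing request/response pairs on the stream: the server wraps an AgentActionRequest in a ServerMessage, and the worker answers with an AgentActionResponse carrying the same request_id inside a WorkerMessage. Below is a small sketch with the generated bindings; the field values are illustrative and the import path is assumed from the package layout.

import json

from qalita.grpc.protos import qalita_pb2

# Server -> worker: ask for a sample of source 3 (illustrative values)
request = qalita_pb2.AgentActionRequest(
    request_id="req-123",
    action_type="sample",
    source_id=3,
    parameters_json=json.dumps({"limit": 100}),
    timeout_seconds=30,
)
server_msg = qalita_pb2.ServerMessage(agent_action_request=request)

# Worker -> server: the reply reuses request_id so the caller can correlate it
response = qalita_pb2.AgentActionResponse(
    request_id=request.request_id,
    ok=True,
    action_type=request.action_type,
    result_json=json.dumps({"row_count": 100}),
    execution_time_ms=42,
)
worker_msg = qalita_pb2.WorkerMessage(agent_action_response=response)

assert worker_msg.WhichOneof("payload") == "agent_action_response"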