qalita 2.9.1-py3-none-any.whl → 2.10.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- qalita/_frontend/.next/BUILD_ID +1 -1
- qalita/_frontend/.next/build-manifest.json +7 -7
- qalita/_frontend/.next/prerender-manifest.json +3 -3
- qalita/_frontend/.next/required-server-files.json +196 -40
- qalita/_frontend/.next/server/app/_global-error/page/build-manifest.json +5 -5
- qalita/_frontend/.next/server/app/_global-error/page_client-reference-manifest.js +1 -1
- qalita/_frontend/.next/server/app/_global-error.html +2 -2
- qalita/_frontend/.next/server/app/_global-error.rsc +7 -7
- qalita/_frontend/.next/server/app/_global-error.segments/__PAGE__.segment.rsc +2 -2
- qalita/_frontend/.next/server/app/_global-error.segments/_full.segment.rsc +7 -7
- qalita/_frontend/.next/server/app/_global-error.segments/_head.segment.rsc +3 -3
- qalita/_frontend/.next/server/app/_global-error.segments/_index.segment.rsc +3 -3
- qalita/_frontend/.next/server/app/_global-error.segments/_tree.segment.rsc +1 -1
- qalita/_frontend/.next/server/app/_not-found/page/build-manifest.json +5 -5
- qalita/_frontend/.next/server/app/_not-found/page_client-reference-manifest.js +1 -1
- qalita/_frontend/.next/server/app/_not-found.html +1 -1
- qalita/_frontend/.next/server/app/_not-found.rsc +9 -9
- qalita/_frontend/.next/server/app/_not-found.segments/_full.segment.rsc +9 -9
- qalita/_frontend/.next/server/app/_not-found.segments/_head.segment.rsc +3 -3
- qalita/_frontend/.next/server/app/_not-found.segments/_index.segment.rsc +5 -5
- qalita/_frontend/.next/server/app/_not-found.segments/_not-found/__PAGE__.segment.rsc +2 -2
- qalita/_frontend/.next/server/app/_not-found.segments/_not-found.segment.rsc +3 -3
- qalita/_frontend/.next/server/app/_not-found.segments/_tree.segment.rsc +2 -2
- qalita/_frontend/.next/server/app/page/build-manifest.json +5 -5
- qalita/_frontend/.next/server/app/page_client-reference-manifest.js +1 -1
- qalita/_frontend/.next/server/app/sources/add/page/build-manifest.json +5 -5
- qalita/_frontend/.next/server/app/sources/add/page_client-reference-manifest.js +1 -1
- qalita/_frontend/.next/server/app/sources/add.html +1 -1
- qalita/_frontend/.next/server/app/sources/add.rsc +11 -11
- qalita/_frontend/.next/server/app/sources/add.segments/_full.segment.rsc +11 -11
- qalita/_frontend/.next/server/app/sources/add.segments/_head.segment.rsc +3 -3
- qalita/_frontend/.next/server/app/sources/add.segments/_index.segment.rsc +5 -5
- qalita/_frontend/.next/server/app/sources/add.segments/_tree.segment.rsc +2 -2
- qalita/_frontend/.next/server/app/sources/add.segments/sources/add/__PAGE__.segment.rsc +4 -4
- qalita/_frontend/.next/server/app/sources/add.segments/sources/add.segment.rsc +3 -3
- qalita/_frontend/.next/server/app/sources/add.segments/sources.segment.rsc +3 -3
- qalita/_frontend/.next/server/app/sources/edit/[id]/page/build-manifest.json +5 -5
- qalita/_frontend/.next/server/app/sources/edit/[id]/page_client-reference-manifest.js +1 -1
- qalita/_frontend/.next/server/app/sources/page/build-manifest.json +5 -5
- qalita/_frontend/.next/server/app/sources/page_client-reference-manifest.js +1 -1
- qalita/_frontend/.next/server/app/sources.html +1 -1
- qalita/_frontend/.next/server/app/sources.rsc +11 -11
- qalita/_frontend/.next/server/app/sources.segments/_full.segment.rsc +11 -11
- qalita/_frontend/.next/server/app/sources.segments/_head.segment.rsc +3 -3
- qalita/_frontend/.next/server/app/sources.segments/_index.segment.rsc +5 -5
- qalita/_frontend/.next/server/app/sources.segments/_tree.segment.rsc +2 -2
- qalita/_frontend/.next/server/app/sources.segments/sources/__PAGE__.segment.rsc +4 -4
- qalita/_frontend/.next/server/app/sources.segments/sources.segment.rsc +3 -3
- qalita/_frontend/.next/server/chunks/[root-of-the-server]__bf0c3d33._.js +3 -3
- qalita/_frontend/.next/server/chunks/[root-of-the-server]__f408c708._.js +2 -2
- qalita/_frontend/.next/server/chunks/ssr/[root-of-the-server]__be91267c._.js +1 -1
- qalita/_frontend/.next/server/chunks/ssr/_404f6e81._.js +2 -2
- qalita/_frontend/.next/server/chunks/ssr/_6a67f6f0._.js +2 -2
- qalita/_frontend/.next/server/chunks/ssr/_cb7b44d6._.js +1 -1
- qalita/_frontend/.next/server/chunks/ssr/_d44c43ed._.js +1 -1
- qalita/_frontend/.next/server/chunks/ssr/components_DashboardContent_tsx_c3635665._.js +1 -1
- qalita/_frontend/.next/server/chunks/ssr/node_modules_next_dist_4b9a0874._.js +1 -1
- qalita/_frontend/.next/server/middleware-build-manifest.js +5 -5
- qalita/_frontend/.next/server/pages/404.html +1 -1
- qalita/_frontend/.next/server/pages/500.html +2 -2
- qalita/_frontend/.next/server/server-reference-manifest.js +1 -1
- qalita/_frontend/.next/server/server-reference-manifest.json +1 -1
- qalita/_frontend/.next/static/chunks/0c7542414b6a6f86.js +2 -0
- qalita/_frontend/.next/static/chunks/{89ba62a8ba9b79ce.js → 12daa96885968840.js} +1 -1
- qalita/_frontend/.next/static/chunks/1e6a98e93c470083.css +1 -0
- qalita/_frontend/.next/static/chunks/499b7099996cc9f9.js +1 -0
- qalita/_frontend/.next/static/chunks/694836347d1e5ef3.js +1 -0
- qalita/_frontend/.next/static/chunks/7ea91ca84dc4b3a4.js +1 -0
- qalita/_frontend/.next/static/chunks/89c689b5748e28ed.js +1 -0
- qalita/_frontend/.next/static/chunks/9e71bf77f23416e6.js +1 -0
- qalita/_frontend/.next/static/chunks/aa2a44cc19d89bdb.js +1 -0
- qalita/_frontend/.next/static/chunks/ba22289f779d638e.js +1 -0
- qalita/_frontend/.next/static/chunks/bb05964d928aa166.js +3 -0
- qalita/_frontend/.next/static/chunks/dde1c328f398837e.js +1 -0
- qalita/_frontend/.next/static/chunks/ecbb64dc112ad516.js +1 -0
- qalita/_frontend/.next/static/chunks/facd124df217e016.js +1 -0
- qalita/_frontend/.next/static/chunks/turbopack-9fc8bcb3a9806c66.js +4 -0
- qalita/_frontend/node_modules/@next/env/package.json +1 -1
- qalita/_frontend/node_modules/next/dist/build/index.js +10 -4
- qalita/_frontend/node_modules/next/dist/build/swc/index.js +1 -1
- qalita/_frontend/node_modules/next/dist/build/webpack-config.js +3 -3
- qalita/_frontend/node_modules/next/dist/client/components/segment-cache/lru.js +2 -0
- qalita/_frontend/node_modules/next/dist/compiled/next-server/app-page-turbo-experimental.runtime.prod.js +1 -1
- qalita/_frontend/node_modules/next/dist/compiled/next-server/app-page-turbo.runtime.prod.js +1 -1
- qalita/_frontend/node_modules/next/dist/server/config-shared.js +4 -0
- qalita/_frontend/node_modules/next/dist/server/dev/hot-reloader-turbopack.js +1 -1
- qalita/_frontend/node_modules/next/dist/server/dev/hot-reloader-webpack.js +1 -1
- qalita/_frontend/node_modules/next/dist/server/lib/app-info-log.js +1 -1
- qalita/_frontend/node_modules/next/dist/server/lib/start-server.js +1 -1
- qalita/_frontend/node_modules/next/dist/server/web/adapter.js +1 -1
- qalita/_frontend/node_modules/next/dist/shared/lib/errors/canary-only-config-error.js +1 -1
- qalita/_frontend/node_modules/next/dist/telemetry/anonymous-meta.js +1 -1
- qalita/_frontend/node_modules/next/dist/telemetry/events/version.js +2 -2
- qalita/_frontend/node_modules/next/package.json +15 -15
- qalita/_frontend/package.json +4 -4
- qalita/_frontend/server.js +1 -1
- qalita/commands/source.py +166 -2
- qalita/commands/worker.py +3 -3
- qalita/commands/worker_grpc.py +113 -3
- qalita/grpc/client.py +260 -34
- qalita/grpc/protos/qalita.proto +26 -0
- qalita/grpc/protos/qalita_pb2.py +80 -76
- qalita/grpc/protos/qalita_pb2_grpc.py +1 -1
- qalita/internal/action_executor.py +1009 -0
- qalita/internal/utils.py +1 -1
- {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/METADATA +4 -3
- {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/RECORD +113 -111
- qalita/_frontend/.next/static/chunks/02a64570f0a14789.js +0 -1
- qalita/_frontend/.next/static/chunks/0b082245f106d665.js +0 -1
- qalita/_frontend/.next/static/chunks/27b3ba70c7ef50a8.js +0 -1
- qalita/_frontend/.next/static/chunks/517e9b74d1a3c0ce.js +0 -1
- qalita/_frontend/.next/static/chunks/58689c96b0676c41.js +0 -1
- qalita/_frontend/.next/static/chunks/6c99da4248e4fcfc.js +0 -1
- qalita/_frontend/.next/static/chunks/acc5da18ff20daa1.js +0 -3
- qalita/_frontend/.next/static/chunks/bdc8a8e7721f5675.js +0 -2
- qalita/_frontend/.next/static/chunks/e0df86cbf44bbf9f.js +0 -1
- qalita/_frontend/.next/static/chunks/e4c3a252774ab7fd.css +0 -1
- qalita/_frontend/.next/static/chunks/e6ce59ba40b863f2.js +0 -1
- qalita/_frontend/.next/static/chunks/ec4b1f1e3cd3ae43.js +0 -1
- qalita/_frontend/.next/static/chunks/turbopack-d21156d03715fafa.js +0 -4
- /qalita/_frontend/.next/static/{M1H4Lcjc6A78n9p1qVA6d → NJRrkC0Gn13ofbqb0Lb0C}/_buildManifest.js +0 -0
- /qalita/_frontend/.next/static/{M1H4Lcjc6A78n9p1qVA6d → NJRrkC0Gn13ofbqb0Lb0C}/_clientMiddlewareManifest.json +0 -0
- /qalita/_frontend/.next/static/{M1H4Lcjc6A78n9p1qVA6d → NJRrkC0Gn13ofbqb0Lb0C}/_ssgManifest.js +0 -0
- {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/WHEEL +0 -0
- {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/entry_points.txt +0 -0
- {qalita-2.9.1.dist-info → qalita-2.10.0.dist-info}/licenses/LICENSE +0 -0
qalita/grpc/client.py
CHANGED

```diff
@@ -26,6 +26,7 @@ class GrpcClient:
     - Keep-alive management
     - Bidirectional streaming support
     - Thread-safe connection state
+    - Stability detection before resetting reconnection counter
     """
 
     def __init__(
@@ -36,6 +37,7 @@ class GrpcClient:
         max_reconnect_attempts: int = 10,
         initial_reconnect_delay: float = 1.0,
         max_reconnect_delay: float = 60.0,
+        stability_threshold_seconds: float = 30.0,
     ):
         """
         Initialize the gRPC client.
@@ -47,6 +49,7 @@ class GrpcClient:
             max_reconnect_attempts: Maximum reconnection attempts (0 = unlimited)
             initial_reconnect_delay: Initial delay between reconnection attempts
             max_reconnect_delay: Maximum delay between reconnection attempts
+            stability_threshold_seconds: Time the connection must be stable before resetting attempts counter
         """
         self._url = url
         self._token = token
@@ -54,6 +57,7 @@ class GrpcClient:
         self._max_reconnect_attempts = max_reconnect_attempts
         self._initial_reconnect_delay = initial_reconnect_delay
         self._max_reconnect_delay = max_reconnect_delay
+        self._stability_threshold_seconds = stability_threshold_seconds
 
         # Connection state - set before parsing URL
         self._use_secure_channel = False
@@ -66,17 +70,27 @@ class GrpcClient:
         self._stub: Optional[qalita_pb2_grpc.WorkerServiceStub] = None
         self._connected = False
         self._reconnect_attempts = 0
+        self._current_reconnect_delay = initial_reconnect_delay
+        self._last_successful_stream_start: Optional[datetime] = None
+        self._stream_healthy = False
 
         # Stream state
         self._stream_call = None
         self._outgoing_queue: asyncio.Queue = asyncio.Queue()
         self._stream_active = False
 
+        # Stream health monitoring
+        self._last_message_received: Optional[datetime] = None
+        self._last_message_sent: Optional[datetime] = None
+        self._stream_health_timeout = 45.0  # Consider stream dead if no response in 45s
+        self._force_reconnect = False
+
         # Callbacks
         self._on_job_received: Optional[Callable] = None
         self._on_routine_received: Optional[Callable] = None
         self._on_data_preview_request: Optional[Callable] = None
         self._on_add_source_request: Optional[Callable] = None
+        self._on_agent_action_request: Optional[Callable] = None
         self._on_disconnect: Optional[Callable] = None
 
     def _parse_grpc_target(self, url: str) -> str:
@@ -105,8 +119,8 @@ class GrpcClient:
             self._use_secure_channel = False
             return f"{host}:50051"
 
-        # For production URLs (e.g., https://api.
-        # Convert to gRPC endpoint (e.g., grpc.
+        # For production URLs (e.g., https://api.app.platform.qalita.io)
+        # Convert to gRPC endpoint (e.g., grpc.app.platform.qalita.io:443)
         self._use_secure_channel = True
 
         # Replace 'api.' prefix with 'grpc.' if present
@@ -128,10 +142,22 @@ class GrpcClient:
         """
         Establish connection to the gRPC server.
 
+        Note: This method does NOT reset _reconnect_attempts. The counter is only
+        reset after the stream has been stable for _stability_threshold_seconds.
+
         Returns:
             True if connection successful, False otherwise
         """
         try:
+            # Close any existing channel first
+            if self._channel:
+                try:
+                    await self._channel.close()
+                except Exception:
+                    pass
+            self._channel = None
+            self._stub = None
+
             # Channel options for long-running streams
             channel_options = [
                 ('grpc.keepalive_time_ms', 30000),
@@ -141,6 +167,10 @@ class GrpcClient:
                 ('grpc.http2.max_pings_without_data', 0),
                 ('grpc.max_receive_message_length', 50 * 1024 * 1024),
                 ('grpc.max_send_message_length', 50 * 1024 * 1024),
+                # Additional options for better connection resilience
+                ('grpc.initial_reconnect_backoff_ms', 1000),
+                ('grpc.max_reconnect_backoff_ms', 60000),
+                ('grpc.enable_retries', 1),
             ]
 
             # Create channel - secure for production, insecure for local dev
@@ -159,7 +189,7 @@ class GrpcClient:
 
             self._stub = qalita_pb2_grpc.WorkerServiceStub(self._channel)
             self._connected = True
-
+            # Note: Do NOT reset _reconnect_attempts here - only reset after stable stream
 
             logger.info(f"Connected to gRPC server at {self._grpc_target}")
             return True
@@ -196,31 +226,102 @@ class GrpcClient:
         """
         Attempt to reconnect with exponential backoff.
 
+        The reconnection counter persists across reconnection cycles. It only resets
+        when the connection has been stable (stream healthy for _stability_threshold_seconds).
+
         Returns:
             True if reconnection successful, False if max attempts exceeded
         """
-
+        self._reconnect_attempts += 1
+        self._stream_healthy = False
+
+        # Check if max attempts exceeded
+        if self._max_reconnect_attempts > 0 and self._reconnect_attempts > self._max_reconnect_attempts:
+            logger.error(
+                f"Max reconnection attempts exceeded ({self._reconnect_attempts}/{self._max_reconnect_attempts}). "
+                f"Will continue trying with max backoff delay."
+            )
+            # Don't return False - keep trying but with max delay
+            # In production, we want the worker to eventually recover
 
-
-
-
-        self.
-
-
-
+        logger.warning(
+            f"Reconnection attempt {self._reconnect_attempts}"
+            f"{f'/{self._max_reconnect_attempts}' if self._max_reconnect_attempts > 0 else ''} "
+            f"(delay: {self._current_reconnect_delay:.1f}s)"
+        )
+
+        # Wait before attempting reconnection (exponential backoff)
+        await asyncio.sleep(self._current_reconnect_delay)
+
+        # Attempt to connect
+        if await self.connect():
+            # Increase delay for next attempt (in case this stream also fails quickly)
+            self._current_reconnect_delay = min(
+                self._current_reconnect_delay * 2,
+                self._max_reconnect_delay
             )
-
-        await asyncio.sleep(delay)
-
-        if await self.connect():
-            return True
-
-        # Exponential backoff
-        delay = min(delay * 2, self._max_reconnect_delay)
+            return True
 
-
+        # Connection failed, increase delay for next attempt
+        self._current_reconnect_delay = min(
+            self._current_reconnect_delay * 2,
+            self._max_reconnect_delay
+        )
         return False
 
+    def _mark_stream_stable(self) -> None:
+        """
+        Mark the stream as stable and reset reconnection counters.
+
+        Called when the stream has been healthy for _stability_threshold_seconds.
+        """
+        if not self._stream_healthy:
+            logger.info("Stream connection is now stable - resetting reconnection counters")
+            self._stream_healthy = True
+            self._reconnect_attempts = 0
+            self._current_reconnect_delay = self._initial_reconnect_delay
+
+    async def _check_stream_health(self) -> None:
+        """
+        Check if the stream is actually working by comparing sent vs received timestamps.
+
+        If we've been sending messages but haven't received any response (ack or other)
+        for _stream_health_timeout seconds, the stream is probably dead and we should reconnect.
+        """
+        now = datetime.now(timezone.utc)
+
+        # Need both timestamps to make a comparison
+        if not self._last_message_sent:
+            return
+
+        # Calculate time since last message sent and received
+        time_since_sent = (now - self._last_message_sent).total_seconds()
+
+        if self._last_message_received:
+            time_since_received = (now - self._last_message_received).total_seconds()
+        else:
+            # Never received anything - use time since stream started
+            if self._last_successful_stream_start:
+                time_since_received = (now - self._last_successful_stream_start).total_seconds()
+            else:
+                return
+
+        # If we've been sending but not receiving for too long, stream is dead
+        if time_since_received > self._stream_health_timeout:
+            logger.warning(
+                f"Stream appears dead: last sent {time_since_sent:.1f}s ago, "
+                f"last received {time_since_received:.1f}s ago (timeout: {self._stream_health_timeout}s)"
+            )
+            logger.warning("Forcing reconnection due to unresponsive stream...")
+            self._force_reconnect = True
+
+            # Cancel the stream call to force the error path
+            if self._stream_call:
+                try:
+                    self._stream_call.cancel()
+                except Exception as e:
+                    logger.debug(f"Error cancelling stream for forced reconnect: {e}")
+
     # =========================================================================
     # Unary RPCs
     # =========================================================================
@@ -470,6 +571,10 @@ class GrpcClient:
         """Set callback for when an add source request is received via stream."""
         self._on_add_source_request = callback
 
+    def on_agent_action_request(self, callback: Callable[[qalita_pb2.AgentActionRequest], Any]) -> None:
+        """Set callback for when an agent action request is received via stream."""
+        self._on_agent_action_request = callback
+
     def on_disconnect(self, callback: Callable[[], Any]) -> None:
         """Set callback for when connection is lost."""
         self._on_disconnect = callback
@@ -609,15 +714,46 @@ class GrpcClient:
         msg = qalita_pb2.WorkerMessage(add_source_response=response)
         await self._outgoing_queue.put(msg)
 
+    async def send_agent_action_response(
+        self,
+        request_id: str,
+        ok: bool,
+        action_type: str,
+        error: Optional[str] = None,
+        result_json: Optional[str] = None,
+        data: Optional[qalita_pb2.DataPreviewResponse] = None,
+        execution_time_ms: Optional[int] = None,
+    ) -> None:
+        """Send an agent action response through the stream."""
+        response = qalita_pb2.AgentActionResponse(
+            request_id=request_id,
+            ok=ok,
+            action_type=action_type,
+        )
+
+        if error:
+            response.error = error
+        if result_json:
+            response.result_json = result_json
+        if data:
+            response.data.CopyFrom(data)
+        if execution_time_ms is not None:
+            response.execution_time_ms = execution_time_ms
+
+        msg = qalita_pb2.WorkerMessage(agent_action_response=response)
+        await self._outgoing_queue.put(msg)
+
     async def _outgoing_messages(self) -> AsyncIterator[qalita_pb2.WorkerMessage]:
         """Generator for outgoing stream messages."""
         logger.info("Outgoing messages generator started")
-        while self._stream_active:
+        while self._stream_active and not self._force_reconnect:
            try:
                 # Use get_nowait in a loop with sleep to avoid blocking gRPC
                 try:
                     msg = self._outgoing_queue.get_nowait()
-
+                    msg_type = msg.WhichOneof('payload')
+                    logger.debug(f"Yielding message type: {msg_type}")
+                    self._last_message_sent = datetime.now(timezone.utc)
                     yield msg
                 except asyncio.QueueEmpty:
                     # No message available, yield control briefly
@@ -638,7 +774,9 @@ class GrpcClient:
         - Keep-alive signals (sent every 10 seconds)
         - Incoming job assignments
         - Incoming routine triggers
-        - Automatic reconnection on failure
+        - Automatic reconnection on failure with exponential backoff
+        - Stability detection to reset reconnection counters
+        - Dead stream detection (sending but not receiving)
         """
         if not self._connected:
             if not await self.connect():
@@ -647,14 +785,30 @@ class GrpcClient:
         # Recreate queue in async context to ensure proper event loop binding
         self._outgoing_queue = asyncio.Queue()
         self._stream_active = True
+        self._stream_healthy = False
+        self._last_successful_stream_start = None
+        self._last_message_received = None
+        self._last_message_sent = None
+        self._force_reconnect = False
 
         async def keep_alive_loop():
-            """Send keep-alive every 10 seconds."""
+            """Send keep-alive every 10 seconds and monitor stream health."""
            logger.info(f"Keep-alive loop started, worker_id={self._worker_id}")
-            while self._stream_active:
+            while self._stream_active and not self._force_reconnect:
                 try:
                     logger.debug(f"Sending keep-alive for worker {self._worker_id}")
                     await self.send_keep_alive()
+
+                    # Check if stream has been healthy long enough to reset counters
+                    if (self._last_successful_stream_start and
+                            not self._stream_healthy):
+                        elapsed = (datetime.now(timezone.utc) - self._last_successful_stream_start).total_seconds()
+                        if elapsed >= self._stability_threshold_seconds:
+                            self._mark_stream_stable()
+
+                    # Health check: detect dead stream (sending but not receiving)
+                    await self._check_stream_health()
+
                     await asyncio.sleep(10)
                 except asyncio.CancelledError:
                     logger.info("Keep-alive loop cancelled")
@@ -662,15 +816,28 @@ class GrpcClient:
                 except Exception as e:
                     logger.error(f"Keep-alive error: {e}")
 
-        async def
-            """
+        async def process_single_stream() -> bool:
+            """
+            Process incoming stream messages for one connection attempt.
+
+            Returns:
+                True if stream ended gracefully (should not reconnect)
+                False if stream had an error (should attempt reconnection)
+            """
             try:
                 self._stream_call = self._stub.Connect(
                     self._outgoing_messages(),
                     metadata=self.metadata,
                 )
 
+                # Mark the time when stream successfully started
+                self._last_successful_stream_start = datetime.now(timezone.utc)
+                logger.info("Stream established successfully")
+
                 async for msg in self._stream_call:
+                    # Each message received confirms the stream is working
+                    self._last_message_received = datetime.now(timezone.utc)
+
                     if msg.HasField('job_assignment'):
                         job = msg.job_assignment.job
                         logger.info(f"Received job assignment: {job.id}")
@@ -695,28 +862,87 @@ class GrpcClient:
                         if self._on_add_source_request:
                             await self._on_add_source_request(request)
 
+                    elif msg.HasField('agent_action_request'):
+                        request = msg.agent_action_request
+                        logger.info(f"Received agent action request: {request.request_id} type={request.action_type}")
+                        if self._on_agent_action_request:
+                            await self._on_agent_action_request(request)
+
                     elif msg.HasField('ack'):
                         logger.debug(f"Received ack: {msg.ack.message_type}")
+                        # Ack received means stream is working, check stability
+                        if (self._last_successful_stream_start and
+                                not self._stream_healthy):
+                            elapsed = (datetime.now(timezone.utc) - self._last_successful_stream_start).total_seconds()
+                            if elapsed >= self._stability_threshold_seconds:
+                                self._mark_stream_stable()
 
                     elif msg.HasField('error'):
                         logger.error(f"Server error: {msg.error.code} - {msg.error.message}")
+
+                # Stream ended normally (server closed it gracefully)
+                logger.info("Stream ended normally")
+                return False  # Still try to reconnect for continuous operation
 
             except grpc.aio.AioRpcError as e:
                 if e.code() == grpc.StatusCode.CANCELLED:
-
+                    if self._force_reconnect:
+                        logger.info("Stream cancelled due to forced reconnect (dead stream detection)")
+                        return False  # Reconnect
+                    else:
+                        logger.info("Stream cancelled by client")
+                        return True  # Don't reconnect if we intentionally cancelled it
                 else:
-
-
-                    if self.
-
+                    # Calculate how long the stream was alive
+                    stream_duration = 0
+                    if self._last_successful_stream_start:
+                        stream_duration = (datetime.now(timezone.utc) - self._last_successful_stream_start).total_seconds()
+
+                    logger.error(
+                        f"Stream error after {stream_duration:.1f}s: {e.code()} - {e.details()}"
+                    )
+                    return False  # Should attempt reconnection
+
+            except Exception as e:
+                logger.error(f"Unexpected stream error: {e}")
+                return False  # Should attempt reconnection
 
-        #
+        # Main stream loop with reconnection handling
         keep_alive_task = asyncio.create_task(keep_alive_loop())
 
         try:
-
+            while self._stream_active:
+                # Reset state before starting/restarting stream
+                self._force_reconnect = False
+                self._last_message_received = None
+                self._last_message_sent = None
+
+                # Process the stream
+                should_stop = await process_single_stream()
+
+                if should_stop or not self._stream_active:
+                    break
+
+                # Stream failed, attempt reconnection
+                self._last_successful_stream_start = None
+
+                # Recreate the outgoing queue to clear any stale messages
+                self._outgoing_queue = asyncio.Queue()
+
+                # Attempt reconnection (this handles backoff)
+                if not await self._reconnect():
+                    # _reconnect now always returns True after sleeping and connecting
+                    # It only returns False if connect() itself fails
+                    # In that case, keep trying
+                    logger.warning("Reconnection failed, will retry...")
+                    continue
+
+                # Reconnected successfully, loop will start a new stream
+                logger.info("Reconnected, restarting stream...")
+
         finally:
             self._stream_active = False
+            self._force_reconnect = True  # Stop the outgoing generator
             keep_alive_task.cancel()
             try:
                 await keep_alive_task
```
qalita/grpc/protos/qalita.proto
CHANGED

```diff
@@ -49,6 +49,7 @@ message WorkerMessage {
     JobLogLine log_line = 4;
     DataPreviewResponse data_preview_response = 5;
     AddSourceResponse add_source_response = 6;
+    AgentActionResponse agent_action_response = 7;
   }
 }
 
@@ -69,6 +70,7 @@ message ServerMessage {
     ServerError error = 4;
     DataPreviewRequest data_preview_request = 5;
     AddSourceRequest add_source_request = 6;
+    AgentActionRequest agent_action_request = 7;
   }
 }
 
@@ -389,3 +391,27 @@ message AddSourceResponse {
   optional int32 source_id = 4;      // ID assigned by worker in local config
   bool connectivity_verified = 5;    // Whether connection to source was verified
 }
+
+// =============================================================================
+// Agent Actions (Studio LLM -> Worker)
+// =============================================================================
+
+// Request from LLM agent to execute an action on a data source
+message AgentActionRequest {
+  string request_id = 1;              // Unique ID to correlate request/response
+  string action_type = 2;             // query, read_data, filter, aggregate, describe, sample
+  int32 source_id = 3;                // Source to operate on
+  string parameters_json = 4;         // Action parameters as JSON
+  optional int32 timeout_seconds = 5; // Optional timeout for the action
+}
+
+// Response from worker after executing an agent action
+message AgentActionResponse {
+  string request_id = 1;                 // Correlates with request
+  bool ok = 2;                           // Whether operation succeeded
+  string action_type = 3;                // Echo back the action type
+  optional string error = 4;             // Error message if ok=false
+  optional string result_json = 5;       // Structured result as JSON (for metadata, stats)
+  optional DataPreviewResponse data = 6; // Tabular data result if applicable
+  optional int64 execution_time_ms = 7;  // How long the action took
+}
```