great-expectations-cloud 20250811.1.dev0__py3-none-any.whl → 20260113.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of great-expectations-cloud might be problematic.
- great_expectations_cloud/agent/actions/agent_action.py +3 -3
- great_expectations_cloud/agent/actions/draft_datasource_config_action.py +2 -2
- great_expectations_cloud/agent/actions/generate_data_quality_check_expectations_action.py +47 -24
- great_expectations_cloud/agent/actions/list_asset_names.py +4 -5
- great_expectations_cloud/agent/actions/run_checkpoint.py +64 -3
- great_expectations_cloud/agent/actions/run_metric_list_action.py +3 -3
- great_expectations_cloud/agent/actions/run_scheduled_checkpoint.py +28 -5
- great_expectations_cloud/agent/actions/run_window_checkpoint.py +2 -4
- great_expectations_cloud/agent/actions/utils.py +13 -4
- great_expectations_cloud/agent/agent.py +259 -36
- great_expectations_cloud/agent/event_handler.py +8 -7
- great_expectations_cloud/agent/message_service/asyncio_rabbit_mq_client.py +33 -8
- great_expectations_cloud/agent/message_service/subscriber.py +4 -0
- great_expectations_cloud/agent/models.py +13 -0
- {great_expectations_cloud-20250811.1.dev0.dist-info → great_expectations_cloud-20260113.0.dev1.dist-info}/METADATA +7 -5
- {great_expectations_cloud-20250811.1.dev0.dist-info → great_expectations_cloud-20260113.0.dev1.dist-info}/RECORD +19 -19
- {great_expectations_cloud-20250811.1.dev0.dist-info → great_expectations_cloud-20260113.0.dev1.dist-info}/WHEEL +1 -1
- {great_expectations_cloud-20250811.1.dev0.dist-info → great_expectations_cloud-20260113.0.dev1.dist-info}/entry_points.txt +0 -0
- {great_expectations_cloud-20250811.1.dev0.dist-info → great_expectations_cloud-20260113.0.dev1.dist-info/licenses}/LICENSE +0 -0
@@ -2,13 +2,20 @@ from __future__ import annotations
 
 import asyncio
 import logging
+import os
+import resource
+import signal
+import socket
 import sys
+import threading
+import time
 import traceback
 import warnings
 from collections import defaultdict
 from concurrent.futures import Future
 from concurrent.futures.thread import ThreadPoolExecutor
 from functools import partial
+from http import HTTPStatus
 from importlib.metadata import version as metadata_version
 from typing import TYPE_CHECKING, Any, Callable, Final, Literal
 from urllib.parse import urljoin, urlparse
@@ -16,13 +23,12 @@ from uuid import UUID
 
 import orjson
 import requests
+from great_expectations import __version__, get_context
+from great_expectations.core import http
 from great_expectations.core.http import create_session
 from great_expectations.data_context.cloud_constants import CLOUD_DEFAULT_BASE_URL
-from great_expectations.data_context.data_context.context_factory import get_context
 from great_expectations.data_context.types.base import ProgressBarsConfig
-from pika.adapters.utils.connection_workflow import (
-    AMQPConnectorException,
-)
+from pika.adapters.utils.connection_workflow import AMQPConnectorException
 from pika.exceptions import (
     AMQPConnectionError,
     AMQPError,
@@ -45,9 +51,7 @@ from great_expectations_cloud.agent.config import (
     generate_config_validation_error_text,
 )
 from great_expectations_cloud.agent.constants import USER_AGENT_HEADER, HeaderName
-from great_expectations_cloud.agent.event_handler import (
-    EventHandler,
-)
+from great_expectations_cloud.agent.event_handler import EventHandler
 from great_expectations_cloud.agent.exceptions import (
     GXAgentConfigError,
     GXAgentError,
@@ -67,6 +71,7 @@ from great_expectations_cloud.agent.models import (
     AgentBaseExtraForbid,
     CreateScheduledJobAndSetJobStarted,
     CreateScheduledJobAndSetJobStartedRequest,
+    DomainContext,
     JobCompleted,
     JobStarted,
     JobStatus,
@@ -138,6 +143,9 @@ class GXAgent:
     _PYPI_GX_AGENT_PACKAGE_NAME = "great_expectations_cloud"
     _PYPI_GREAT_EXPECTATIONS_PACKAGE_NAME = "great_expectations"
 
+    # Heartbeat interval in seconds (log progress every 60 seconds during job processing)
+    _HEARTBEAT_INTERVAL_SECONDS = 60
+
     def __init__(self: Self):
         self._config = self._create_config()
 
@@ -150,19 +158,6 @@ class GXAgent:
                 "great_expectations_version": great_expectations_version,
             },
         )
-        LOGGER.debug("Loading a DataContext - this might take a moment.")
-
-        with warnings.catch_warnings():
-            # suppress warnings about GX version
-            warnings.filterwarnings("ignore", message="You are using great_expectations version")
-            self._context: CloudDataContext = get_context(
-                cloud_mode=True,
-                user_agent_str=self.user_agent_str,
-            )
-        self._configure_progress_bars(data_context=self._context)
-        LOGGER.debug("DataContext is ready.")
-
-        self._set_http_session_headers(data_context=self._context)
 
         # Create a thread pool with a single worker, so we can run long-lived
         # GX processes and maintain our connection to the broker. Note that
@@ -174,6 +169,15 @@ class GXAgent:
         self._correlation_ids: defaultdict[str, int] = defaultdict(lambda: 0)
         self._listen_tries = 0
 
+        # Heartbeat tracking
+        self._heartbeat_stop_event: threading.Event | None = None
+        self._heartbeat_thread: threading.Thread | None = None
+        self._current_job_correlation_id: str | None = None
+        self._current_job_start_time: float | None = None
+
+        # Install signal handlers for graceful shutdown logging
+        self._install_signal_handlers()
+
     def run(self) -> None:
         """Open a connection to GX Cloud."""
 
@@ -218,7 +222,7 @@ class GXAgent:
             LOGGER.exception("The connection to GX Cloud has encountered an error.")
         except GXAgentUnrecoverableConnectionError:
             LOGGER.exception("The connection to GX Cloud has encountered an unrecoverable error.")
-
+            os.kill(os.getpid(), signal.SIGTERM)
         except (
             AuthenticationError,
             ProbableAuthenticationError,
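
A note on the os.kill(os.getpid(), signal.SIGTERM) added above: the SIGTERM is picked up by the handlers that _install_signal_handlers registers in __init__ (next hunk), which log shutdown context and then fall back to whatever handler was installed before. A minimal standalone sketch of that chaining pattern, standard library only, POSIX assumed; the function name below is illustrative and not part of the package:

import os
import signal
import types


def install_logging_sigterm_handler() -> None:
    # Remember whatever handler was registered before us so we can delegate to it.
    previous = signal.getsignal(signal.SIGTERM)

    def handler(signum: int, frame: types.FrameType | None) -> None:
        print(f"received signal {signum}, shutting down")
        if callable(previous):
            previous(signum, frame)  # chain to the prior custom handler
        elif previous == signal.SIG_DFL:
            raise SystemExit(128 + signum)  # emulate the default termination exit code

    signal.signal(signal.SIGTERM, handler)


if __name__ == "__main__":
    install_logging_sigterm_handler()
    os.kill(os.getpid(), signal.SIGTERM)  # delivers SIGTERM to this process, as the agent does
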
@@ -233,6 +237,98 @@ class GXAgent:
         if subscriber is not None:
             subscriber.close()
 
+    def _install_signal_handlers(self) -> None:
+        """Install signal handlers to log when the process receives shutdown signals."""
+        original_sigterm = signal.getsignal(signal.SIGTERM)
+        original_sigint = signal.getsignal(signal.SIGINT)
+
+        def sigterm_handler(signum: int, frame: Any) -> None:
+            self._log_signal_received("SIGTERM", signum)
+            if callable(original_sigterm):
+                original_sigterm(signum, frame)
+            elif original_sigterm == signal.SIG_DFL:
+                raise SystemExit(128 + signum)
+
+        def sigint_handler(signum: int, frame: Any) -> None:
+            self._log_signal_received("SIGINT", signum)
+            if callable(original_sigint):
+                original_sigint(signum, frame)
+            elif original_sigint == signal.SIG_DFL:
+                raise KeyboardInterrupt
+
+        signal.signal(signal.SIGTERM, sigterm_handler)
+        signal.signal(signal.SIGINT, sigint_handler)
+
+    def _log_signal_received(self, signal_name: str, signum: int) -> None:
+        """Log when a shutdown signal is received, including current job info."""
+        memory_mb = self._get_memory_usage_mb()
+        LOGGER.warning(
+            f"Received {signal_name} signal - shutting down",
+            extra={
+                "signal": signal_name,
+                "signal_number": signum,
+                "hostname": socket.gethostname(),
+                "current_job_correlation_id": self._current_job_correlation_id,
+                "job_elapsed_seconds": (
+                    time.time() - self._current_job_start_time
+                    if self._current_job_start_time
+                    else None
+                ),
+                "memory_usage_mb": memory_mb,
+                "has_active_task": self._current_task is not None and not self._current_task.done(),
+            },
+        )
+
+    def _get_memory_usage_mb(self) -> float:
+        """Get current memory usage in MB using resource module."""
+        # ru_maxrss is in KB on Linux, bytes on macOS
+        usage = resource.getrusage(resource.RUSAGE_SELF)
+        # On macOS, ru_maxrss is in bytes; on Linux, it's in KB
+        if sys.platform == "darwin":
+            return usage.ru_maxrss / (1024 * 1024)
+        return usage.ru_maxrss / 1024
+
+    def _start_heartbeat(self, correlation_id: str, org_id: UUID, workspace_id: UUID) -> None:
+        """Start a background thread that logs periodic heartbeats during job processing."""
+        self._current_job_correlation_id = correlation_id
+        self._current_job_start_time = time.time()
+        self._heartbeat_stop_event = threading.Event()
+
+        def heartbeat_loop() -> None:
+            stop_event = self._heartbeat_stop_event
+            if stop_event is None:
+                return
+            while not stop_event.wait(timeout=self._HEARTBEAT_INTERVAL_SECONDS):
+                if stop_event.is_set():
+                    break
+                elapsed = time.time() - (self._current_job_start_time or time.time())
+                memory_mb = self._get_memory_usage_mb()
+                LOGGER.info(
+                    "Job heartbeat - still processing",
+                    extra={
+                        "correlation_id": correlation_id,
+                        "organization_id": str(org_id),
+                        "workspace_id": str(workspace_id),
+                        "hostname": socket.gethostname(),
+                        "elapsed_seconds": round(elapsed, 1),
+                        "memory_usage_mb": round(memory_mb, 1),
+                    },
+                )
+
+        self._heartbeat_thread = threading.Thread(target=heartbeat_loop, daemon=True)
+        self._heartbeat_thread.start()
+
+    def _stop_heartbeat(self) -> None:
+        """Stop the heartbeat thread."""
+        if self._heartbeat_stop_event:
+            self._heartbeat_stop_event.set()
+        if self._heartbeat_thread and self._heartbeat_thread.is_alive():
+            self._heartbeat_thread.join(timeout=2)
+        self._heartbeat_thread = None
+        self._heartbeat_stop_event = None
+        self._current_job_correlation_id = None
+        self._current_job_start_time = None
+
     @classmethod
     def get_current_gx_agent_version(cls) -> str:
         version: str = metadata_version(cls._PYPI_GX_AGENT_PACKAGE_NAME)
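
The _start_heartbeat/_stop_heartbeat pair above hinges on threading.Event.wait(timeout=...) acting as an interruptible sleep: it returns False each time the interval elapses (emit another heartbeat) and True as soon as the event is set (stop immediately). A minimal standalone sketch of the same pattern; the Heartbeat class is illustrative, not part of the package:

import threading
import time


class Heartbeat:
    def __init__(self, interval_seconds: float = 60.0) -> None:
        self._interval = interval_seconds
        self._stop = threading.Event()
        self._thread: threading.Thread | None = None
        self._start_time: float | None = None

    def start(self) -> None:
        self._start_time = time.time()
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()

    def _loop(self) -> None:
        # wait() returns False on timeout (emit a beat) and True once stop() sets the event.
        while not self._stop.wait(timeout=self._interval):
            elapsed = time.time() - (self._start_time or time.time())
            print(f"heartbeat: still working after {elapsed:.1f}s")

    def stop(self) -> None:
        self._stop.set()
        if self._thread is not None:
            self._thread.join(timeout=2)


hb = Heartbeat(interval_seconds=1.0)
hb.start()
time.sleep(3.5)  # the loop prints roughly three heartbeats in the meantime
hb.stop()
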
@@ -252,8 +348,26 @@ class GXAgent:
         Args:
             event_context: An Event with related properties and actions.
         """
+        # Track how many times this correlation_id has been seen BY THIS POD (for local diagnostics)
+        # Note: event_context.redelivered is set by RabbitMQ and indicates cross-pod redelivery
+        local_delivery_count = self._correlation_ids.get(event_context.correlation_id, 0)
+
         if self._reject_correlation_id(event_context.correlation_id) is True:
-            # this event has been redelivered too many times - remove it from circulation
+            # this event has been redelivered too many times to THIS pod - remove it from circulation
+            LOGGER.error(
+                "Message redelivered too many times to this pod, removing from queue",
+                extra={
+                    "event_type": event_context.event.type,
+                    "correlation_id": event_context.correlation_id,
+                    "organization_id": self.get_organization_id(event_context),
+                    "workspace_id": str(self.get_workspace_id(event_context)),
+                    "schedule_id": event_context.event.schedule_id
+                    if isinstance(event_context.event, ScheduledEventBase)
+                    else None,
+                    "local_delivery_count": local_delivery_count,
+                    "redelivered": event_context.redelivered,
+                },
+            )
             event_context.processed_with_failures()
             return
         elif self._can_accept_new_task() is not True:
@@ -263,9 +377,11 @@ class GXAgent:
                     "event_type": event_context.event.type,
                     "correlation_id": event_context.correlation_id,
                     "organization_id": self.get_organization_id(event_context),
+                    "workspace_id": str(self.get_workspace_id(event_context)),
                     "schedule_id": event_context.event.schedule_id
                     if isinstance(event_context.event, ScheduledEventBase)
                     else None,
+                    "redelivered": event_context.redelivered,
                 },
             )
             # request that this message is redelivered later
@@ -274,6 +390,21 @@ class GXAgent:
             self._redeliver_msg_task = loop.create_task(event_context.redeliver_message())
             return
 
+        if event_context.redelivered:
+            LOGGER.warning(
+                "Accepting redelivered message - another consumer failed to acknowledge",
+                extra={
+                    "event_type": event_context.event.type,
+                    "correlation_id": event_context.correlation_id,
+                    "organization_id": self.get_organization_id(event_context),
+                    "workspace_id": str(self.get_workspace_id(event_context)),
+                    "schedule_id": event_context.event.schedule_id
+                    if isinstance(event_context.event, ScheduledEventBase)
+                    else None,
+                    "redelivered": event_context.redelivered,
+                },
+            )
+
         self._current_task = self._executor.submit(
             self._handle_event,
             event_context=event_context,
@@ -287,8 +418,23 @@ class GXAgent:
         self._current_task.add_done_callback(on_exit_callback)
 
     def get_data_context(self, event_context: EventContext) -> CloudDataContext:
-        """
-
+        """Create a new CloudDataContext for each job using the event's workspace_id."""
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="You are using great_expectations version")
+            workspace_id = self.get_workspace_id(event_context)
+
+            LOGGER.debug("Loading a DataContext - this might take a moment.")
+
+            context: CloudDataContext = get_context(
+                cloud_mode=True,
+                user_agent_str=self.user_agent_str,
+                cloud_workspace_id=str(workspace_id),
+            )
+            self._configure_progress_bars(data_context=context)
+
+            LOGGER.debug("DataContext is ready.")
+
+            return context
 
     def get_organization_id(self, event_context: EventContext) -> UUID:
         """Helper method to get the organization ID. Overridden in GX-Runner."""
@@ -298,6 +444,13 @@ class GXAgent:
         """Helper method to get the auth key. Overridden in GX-Runner."""
         return self._get_config().gx_cloud_access_token
 
+    def get_workspace_id(self, event_context: EventContext) -> UUID:
+        """Helper method to get the workspace ID from the event."""
+        workspace_id: UUID | None = getattr(event_context.event, "workspace_id", None)
+        if workspace_id is None:
+            raise GXAgentError()
+        return workspace_id
+
     def _set_sentry_tags(self, even_context: EventContext) -> None:
         """Used by GX-Runner to set tags for Sentry logging. No-op in the Agent."""
         pass
@@ -320,27 +473,38 @@ class GXAgent:
         )
 
         org_id = self.get_organization_id(event_context)
+        workspace_id = self.get_workspace_id(event_context)
         base_url = self._get_config().gx_cloud_base_url
         auth_key = self.get_auth_key()
 
         if isinstance(event_context.event, ScheduledEventBase):
-            self._create_scheduled_job_and_set_started(event_context, org_id)
+            self._create_scheduled_job_and_set_started(event_context, org_id, workspace_id)
         else:
             self._update_status(
-                correlation_id=event_context.correlation_id, status=JobStarted(), org_id=org_id
+                correlation_id=event_context.correlation_id,
+                status=JobStarted(),
+                org_id=org_id,
+                workspace_id=workspace_id,
             )
+        memory_mb = self._get_memory_usage_mb()
         LOGGER.info(
             "Starting job",
             extra={
                 "event_type": event_context.event.type,
                 "correlation_id": event_context.correlation_id,
                 "organization_id": str(org_id),
+                "workspace_id": str(workspace_id),
                 "schedule_id": event_context.event.schedule_id
                 if isinstance(event_context.event, ScheduledEventBase)
                 else None,
+                "hostname": socket.gethostname(),
+                "redelivered": event_context.redelivered,
+                "memory_usage_mb": round(memory_mb, 1),
             },
         )
 
+        self._start_heartbeat(event_context.correlation_id, org_id, workspace_id)
+
         self._set_sentry_tags(event_context)
 
         handler = EventHandler(context=data_context)
@@ -350,7 +514,7 @@ class GXAgent:
             id=event_context.correlation_id,
             base_url=base_url,
             auth_key=auth_key,
-            organization_id=org_id,
+            domain_context=DomainContext(organization_id=org_id, workspace_id=workspace_id),
         )
         return result
 
@@ -365,10 +529,26 @@ class GXAgent:
         """
         # warning: this method will not be executed in the main thread
 
+        self._stop_heartbeat()
+
         org_id = self.get_organization_id(event_context)
+        workspace_id = self.get_workspace_id(event_context)
+
+        memory_mb = self._get_memory_usage_mb()
+        LOGGER.debug(
+            "Job thread exiting",
+            extra={
+                "correlation_id": event_context.correlation_id,
+                "hostname": socket.gethostname(),
+                "has_exception": future.exception() is not None,
+                "cancelled": future.cancelled(),
+                "memory_usage_mb": round(memory_mb, 1),
+            },
+        )
 
         # get results or errors from the thread
         error = future.exception()
+
         if error is None:
             result: ActionResult = future.result()
 
@@ -385,6 +565,7 @@ class GXAgent:
                     "event_type": event_context.event.type,
                     "id": event_context.correlation_id,
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
                     "schedule_id": event_context.event.schedule_id
                     if isinstance(event_context.event, ScheduledEventBase)
                     else None,
@@ -405,26 +586,35 @@ class GXAgent:
                         result.job_duration.total_seconds() if result.job_duration else None
                     ),
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
                     "schedule_id": event_context.event.schedule_id
                     if isinstance(event_context.event, ScheduledEventBase)
                     else None,
+                    "hostname": socket.gethostname(),
                 },
             )
         else:
             status = build_failed_job_completed_status(error)
             LOGGER.info(traceback.format_exc())
-            LOGGER.
+            LOGGER.warning(
                 "Job completed with error",
                 extra={
                     "event_type": event_context.event.type,
                     "correlation_id": event_context.correlation_id,
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
+                    "hostname": socket.gethostname(),
+                    "error_type": type(error).__name__,
+                    "error_message": str(error)[:500],  # Truncate to avoid huge logs
                 },
             )
 
         try:
             self._update_status(
-                correlation_id=event_context.correlation_id, status=status, org_id=org_id
+                correlation_id=event_context.correlation_id,
+                status=status,
+                org_id=org_id,
+                workspace_id=workspace_id,
             )
         except Exception:
             LOGGER.exception(
@@ -433,6 +623,7 @@ class GXAgent:
                     "correlation_id": event_context.correlation_id,
                     "status": str(status),
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
                 },
             )
             # We do not want to cause an infinite loop of errors
@@ -552,7 +743,9 @@ class GXAgent:
             )
         )
 
-    def _update_status(self, correlation_id: str, status: JobStatus, org_id: UUID) -> None:
+    def _update_status(
+        self, correlation_id: str, status: JobStatus, org_id: UUID, workspace_id: UUID
+    ) -> None:
         """Update GX Cloud on the status of a job.
 
         Args:
@@ -565,11 +758,12 @@ class GXAgent:
                 "correlation_id": correlation_id,
                 "status": str(status),
                 "organization_id": str(org_id),
+                "workspace_id": str(workspace_id),
             },
         )
         agent_sessions_url = urljoin(
             self._get_config().gx_cloud_base_url,
-            f"/api/v1/organizations/{org_id}/agent-jobs/{correlation_id}",
+            f"/api/v1/organizations/{org_id}/workspaces/{workspace_id}/agent-jobs/{correlation_id}",
         )
         with create_session(access_token=self.get_auth_key()) as session:
             data = UpdateJobStatusRequest(data=status).json()
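
The job-status endpoint is now scoped to a workspace. Because the path passed to urljoin starts with "/", it replaces the base URL's path rather than appending to it. A small sketch of the resulting URL shape; the base URL below is a placeholder, only the path format comes from the diff:

from urllib.parse import urljoin
from uuid import uuid4

base_url = "https://gx-cloud.example.test"  # placeholder base URL
org_id, workspace_id, correlation_id = uuid4(), uuid4(), uuid4()

url = urljoin(
    base_url,
    f"/api/v1/organizations/{org_id}/workspaces/{workspace_id}/agent-jobs/{correlation_id}",
)
print(url)  # https://gx-cloud.example.test/api/v1/organizations/<org>/workspaces/<ws>/agent-jobs/<id>
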
@@ -580,6 +774,7 @@ class GXAgent:
                     "correlation_id": correlation_id,
                     "status": str(status),
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
                 },
             )
             GXAgent._log_http_error(
@@ -587,7 +782,7 @@ class GXAgent:
             )
 
     def _create_scheduled_job_and_set_started(
-        self, event_context: EventContext, org_id: UUID
+        self, event_context: EventContext, org_id: UUID, workspace_id: UUID
     ) -> None:
         """Create a job in GX Cloud for scheduled events.
 
@@ -609,13 +804,14 @@ class GXAgent:
                 "correlation_id": str(event_context.correlation_id),
                 "event_type": str(event_context.event.type),
                 "organization_id": str(org_id),
+                "workspace_id": str(workspace_id),
                 "schedule_id": str(event_context.event.schedule_id),
             },
         )
 
         agent_sessions_url = urljoin(
             self._get_config().gx_cloud_base_url,
-            f"/api/v1/organizations/{org_id}/agent-jobs",
+            f"/api/v1/organizations/{org_id}/workspaces/{workspace_id}/agent-jobs",
         )
         data = CreateScheduledJobAndSetJobStarted(
             type="run_scheduled_checkpoint.received",
@@ -629,6 +825,31 @@ class GXAgent:
         with create_session(access_token=self.get_auth_key()) as session:
             payload = CreateScheduledJobAndSetJobStartedRequest(data=data).json()
             response = session.post(agent_sessions_url, data=payload)
+
+            if response.status_code == HTTPStatus.BAD_REQUEST:
+                try:
+                    response_body = response.json()
+                except Exception:
+                    response_body = response.text
+                LOGGER.warning(
+                    "Job already exists - this message was likely redelivered by RabbitMQ "
+                    "after another runner already claimed it. Continuing to process anyway "
+                    "as a safety measure in case the original runner failed.",
+                    extra={
+                        "correlation_id": str(event_context.correlation_id),
+                        "event_type": str(event_context.event.type),
+                        "organization_id": str(org_id),
+                        "schedule_id": str(event_context.event.schedule_id),
+                        "workspace_id": str(workspace_id),
+                        "response_status": response.status_code,
+                        "response_body": response_body,
+                    },
+                )
+                # Note: We intentionally continue processing instead of NACKing.
+                # This ensures job completion even if the first runner fails.
+                # TODO: Once we add inProgress timeout in Mercury, we can
+                # safely NACK here to prevent duplicate processing.
+
         LOGGER.info(
             "Created scheduled job and set started",
             extra={
@@ -636,6 +857,8 @@ class GXAgent:
                 "event_type": str(event_context.event.type),
                 "organization_id": str(org_id),
                 "schedule_id": str(event_context.event.schedule_id),
+                "workspace_id": str(workspace_id),
+                "response_status": response.status_code,
             },
         )
         GXAgent._log_http_error(
@@ -658,7 +881,9 @@ class GXAgent:
         """
         Sets headers on all stores in the data context.
         """
-        from great_expectations.data_context.store.gx_cloud_store_backend import
+        from great_expectations.data_context.store.gx_cloud_store_backend import (  # noqa: PLC0415
+            GXCloudStoreBackend,
+        )
 
         # OSS doesn't use the same session for all requests, so we need to set the header for each store
         stores = list(data_context.stores.values())
@@ -686,8 +911,6 @@ class GXAgent:
         Note: the Agent-Job-Id header value will be set for all GX Cloud request until this method is
         called again.
         """
-        from great_expectations import __version__  # noqa: PLC0415
-        from great_expectations.core import http  # noqa: PLC0415
 
         header_name = self.get_header_name()
         user_agent_header_value = self.user_agent_str
@@ -15,6 +15,7 @@ from pydantic import v1 as pydantic_v1
 from great_expectations_cloud.agent.actions.unknown import UnknownEventAction
 from great_expectations_cloud.agent.exceptions import GXAgentError
 from great_expectations_cloud.agent.models import (
+    DomainContext,
     Event,
     EventType,
     UnknownEvent,
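
The DomainContext imported here is defined in models.py (+13 lines, not shown in this excerpt). Judging only from the call DomainContext(organization_id=org_id, workspace_id=workspace_id) in agent.py above, a plausible minimal shape would be the following; this is a hypothetical reconstruction, not the actual definition:

from uuid import UUID, uuid4

from pydantic import v1 as pydantic_v1


class DomainContext(pydantic_v1.BaseModel):  # hypothetical sketch of the model's shape
    organization_id: UUID
    workspace_id: UUID


ctx = DomainContext(organization_id=uuid4(), workspace_id=uuid4())
print(ctx.organization_id, ctx.workspace_id)
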
@@ -67,11 +68,11 @@ class EventHandler:
         self._context = context
 
     def get_event_action(
-        self, event: Event, base_url: str, auth_key: str,
+        self, event: Event, base_url: str, auth_key: str, domain_context: DomainContext
     ) -> AgentAction[Any]:
         """Get the action that should be run for the given event."""
 
-        if not self._check_event_organization_id(event, organization_id):
+        if not self._check_event_organization_id(event, domain_context.organization_id):
             # Making message more generic
             raise GXAgentError("Unable to process job. Invalid input.")  # noqa: TRY003
 
@@ -84,17 +85,17 @@ class EventHandler:
         return action_class(
             context=self._context,
             base_url=base_url,
-
+            domain_context=domain_context,
             auth_key=auth_key,
         )
 
-    def handle_event(
-        self, event: Event, id: str, base_url: str, auth_key: str,
+    def handle_event(
+        self, event: Event, id: str, base_url: str, auth_key: str, domain_context: DomainContext
     ) -> ActionResult:
-        start_time = datetime.now(tz=timezone.utc)
         """Transform an Event into an ActionResult."""
+        start_time = datetime.now(tz=timezone.utc)
         action = self.get_event_action(
-            event=event, base_url=base_url, auth_key=auth_key,
+            event=event, base_url=base_url, auth_key=auth_key, domain_context=domain_context
         )
         LOGGER.info(f"Handling event: {event.type} -> {action.__class__.__name__}")
         action_result = action.run(event=event, id=id)
@@ -27,6 +27,7 @@ class OnMessagePayload:
     correlation_id: str
     delivery_tag: int
     body: bytes
+    redelivered: bool = False  # Set by RabbitMQ when message is redelivered
 
 
 class OnMessageFn(Protocol):
@@ -174,8 +175,12 @@ class AsyncRabbitMQClient:
         # param on_message is provided by the caller as an argument to AsyncRabbitMQClient.run
         correlation_id = header_frame.correlation_id
         delivery_tag = method_frame.delivery_tag
+        redelivered = method_frame.redelivered  # RabbitMQ sets this flag on redelivery
         payload = OnMessagePayload(
-            correlation_id=correlation_id, delivery_tag=delivery_tag, body=body
+            correlation_id=correlation_id,
+            delivery_tag=delivery_tag,
+            body=body,
+            redelivered=redelivered,
         )
         return on_message(payload)
 
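
The redelivered flag forwarded in OnMessagePayload comes straight from pika's Basic.Deliver method frame; RabbitMQ sets it when it re-queues a message that an earlier consumer received but never acknowledged. A minimal sketch of reading the same flag in a plain blocking-connection consumer; the queue name and connection parameters are illustrative:

import pika
from pika.spec import Basic, BasicProperties


def on_message(channel, method: Basic.Deliver, properties: BasicProperties, body: bytes) -> None:
    # method.redelivered is True when the broker is handing this message out a second time.
    if method.redelivered:
        print(f"redelivered message, correlation_id={properties.correlation_id!r}")
    channel.basic_ack(delivery_tag=method.delivery_tag)


connection = pika.BlockingConnection(pika.ConnectionParameters("localhost"))
channel = connection.channel()
channel.basic_consume(queue="gx-agent-queue", on_message_callback=on_message)
channel.start_consuming()
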
@@ -190,10 +195,13 @@ class AsyncRabbitMQClient:
     def _on_consumer_canceled(self, method_frame: Basic.Cancel) -> None:
         """Callback invoked when the broker cancels the client's connection."""
         if self._channel is not None:
-            LOGGER.
-                "Consumer was cancelled remotely
+            LOGGER.warning(
+                "Consumer was cancelled remotely by RabbitMQ - this may indicate DAT timeout",
                 extra={
-                    "
+                    "consumer_tag": method_frame.consumer_tag
+                    if hasattr(method_frame, "consumer_tag")
+                    else None,
+                    "was_consuming": self.was_consuming,
                 },
             )
             self._channel.close()
@@ -232,11 +240,28 @@ class AsyncRabbitMQClient:
         self._reconnect()
         self._log_pika_exception("Connection open failed", reason)
 
-    def _on_connection_closed(
-        self, connection: AsyncioConnection, _unused_reason: pika.Exception
-    ) -> None:
+    def _on_connection_closed(self, connection: AsyncioConnection, reason: pika.Exception) -> None:
         """Callback invoked after the broker closes the connection"""
-
+        if isinstance(reason, (ConnectionClosed, ChannelClosed)):
+            LOGGER.warning(
+                "Connection to RabbitMQ has been closed",
+                extra={
+                    "reply_code": reason.reply_code,
+                    "reply_text": reason.reply_text,
+                    "was_consuming": self.was_consuming,
+                    "is_closing": self._closing,
+                },
+            )
+        else:
+            LOGGER.warning(
+                "Connection to RabbitMQ has been closed",
+                extra={
+                    "reason": str(reason),
+                    "reason_type": type(reason).__name__,
+                    "was_consuming": self.was_consuming,
+                    "is_closing": self._closing,
+                },
+            )
         self._channel = None
         self._is_unrecoverable = True
         if self._closing: