great-expectations-cloud 20250902.0.dev1__py3-none-any.whl → 20260120.0.dev0__py3-none-any.whl
This diff shows the changes between two package versions as they were published to a public registry, and is provided for informational purposes only.
Potentially problematic release: this version of great-expectations-cloud might be problematic.
- great_expectations_cloud/agent/actions/agent_action.py +3 -3
- great_expectations_cloud/agent/actions/draft_datasource_config_action.py +2 -2
- great_expectations_cloud/agent/actions/generate_data_quality_check_expectations_action.py +22 -14
- great_expectations_cloud/agent/actions/list_asset_names.py +4 -5
- great_expectations_cloud/agent/actions/run_checkpoint.py +64 -3
- great_expectations_cloud/agent/actions/run_metric_list_action.py +3 -3
- great_expectations_cloud/agent/actions/run_scheduled_checkpoint.py +28 -5
- great_expectations_cloud/agent/actions/run_window_checkpoint.py +2 -4
- great_expectations_cloud/agent/actions/utils.py +13 -4
- great_expectations_cloud/agent/agent.py +280 -43
- great_expectations_cloud/agent/event_handler.py +8 -7
- great_expectations_cloud/agent/message_service/asyncio_rabbit_mq_client.py +36 -8
- great_expectations_cloud/agent/message_service/subscriber.py +4 -0
- great_expectations_cloud/agent/models.py +23 -2
- {great_expectations_cloud-20250902.0.dev1.dist-info → great_expectations_cloud-20260120.0.dev0.dist-info}/METADATA +5 -5
- {great_expectations_cloud-20250902.0.dev1.dist-info → great_expectations_cloud-20260120.0.dev0.dist-info}/RECORD +19 -19
- {great_expectations_cloud-20250902.0.dev1.dist-info → great_expectations_cloud-20260120.0.dev0.dist-info}/WHEEL +1 -1
- {great_expectations_cloud-20250902.0.dev1.dist-info → great_expectations_cloud-20260120.0.dev0.dist-info}/entry_points.txt +0 -0
- {great_expectations_cloud-20250902.0.dev1.dist-info → great_expectations_cloud-20260120.0.dev0.dist-info/licenses}/LICENSE +0 -0
great_expectations_cloud/agent/agent.py

@@ -2,13 +2,20 @@ from __future__ import annotations

 import asyncio
 import logging
+import os
+import resource
+import signal
+import socket
 import sys
+import threading
+import time
 import traceback
 import warnings
 from collections import defaultdict
 from concurrent.futures import Future
 from concurrent.futures.thread import ThreadPoolExecutor
 from functools import partial
+from http import HTTPStatus
 from importlib.metadata import version as metadata_version
 from typing import TYPE_CHECKING, Any, Callable, Final, Literal
 from urllib.parse import urljoin, urlparse
@@ -16,13 +23,12 @@ from uuid import UUID

 import orjson
 import requests
+from great_expectations import __version__, get_context
+from great_expectations.core import http
 from great_expectations.core.http import create_session
 from great_expectations.data_context.cloud_constants import CLOUD_DEFAULT_BASE_URL
-from great_expectations.data_context.data_context.context_factory import get_context
 from great_expectations.data_context.types.base import ProgressBarsConfig
-from pika.adapters.utils.connection_workflow import (
-    AMQPConnectorException,
-)
+from pika.adapters.utils.connection_workflow import AMQPConnectorException
 from pika.exceptions import (
     AMQPConnectionError,
     AMQPError,
@@ -45,9 +51,7 @@ from great_expectations_cloud.agent.config import (
     generate_config_validation_error_text,
 )
 from great_expectations_cloud.agent.constants import USER_AGENT_HEADER, HeaderName
-from great_expectations_cloud.agent.event_handler import (
-    EventHandler,
-)
+from great_expectations_cloud.agent.event_handler import EventHandler
 from great_expectations_cloud.agent.exceptions import (
     GXAgentConfigError,
     GXAgentError,
@@ -67,6 +71,7 @@ from great_expectations_cloud.agent.models import (
     AgentBaseExtraForbid,
     CreateScheduledJobAndSetJobStarted,
     CreateScheduledJobAndSetJobStartedRequest,
+    DomainContext,
     JobCompleted,
     JobStarted,
     JobStatus,
@@ -138,6 +143,9 @@ class GXAgent:
     _PYPI_GX_AGENT_PACKAGE_NAME = "great_expectations_cloud"
     _PYPI_GREAT_EXPECTATIONS_PACKAGE_NAME = "great_expectations"

+    # Heartbeat interval in seconds (log progress every 60 seconds during job processing)
+    _HEARTBEAT_INTERVAL_SECONDS = 60
+
     def __init__(self: Self):
         self._config = self._create_config()

@@ -150,19 +158,6 @@ class GXAgent:
                 "great_expectations_version": great_expectations_version,
             },
         )
-        LOGGER.debug("Loading a DataContext - this might take a moment.")
-
-        with warnings.catch_warnings():
-            # suppress warnings about GX version
-            warnings.filterwarnings("ignore", message="You are using great_expectations version")
-            self._context: CloudDataContext = get_context(
-                cloud_mode=True,
-                user_agent_str=self.user_agent_str,
-            )
-        self._configure_progress_bars(data_context=self._context)
-        LOGGER.debug("DataContext is ready.")
-
-        self._set_http_session_headers(data_context=self._context)

         # Create a thread pool with a single worker, so we can run long-lived
         # GX processes and maintain our connection to the broker. Note that
@@ -174,6 +169,15 @@ class GXAgent:
         self._correlation_ids: defaultdict[str, int] = defaultdict(lambda: 0)
         self._listen_tries = 0

+        # Heartbeat tracking
+        self._heartbeat_stop_event: threading.Event | None = None
+        self._heartbeat_thread: threading.Thread | None = None
+        self._current_job_correlation_id: str | None = None
+        self._current_job_start_time: float | None = None
+
+        # Install signal handlers for graceful shutdown logging
+        self._install_signal_handlers()
+
     def run(self) -> None:
         """Open a connection to GX Cloud."""

@@ -218,7 +222,7 @@ class GXAgent:
             LOGGER.exception("The connection to GX Cloud has encountered an error.")
         except GXAgentUnrecoverableConnectionError:
             LOGGER.exception("The connection to GX Cloud has encountered an unrecoverable error.")
-
+            os.kill(os.getpid(), signal.SIGTERM)
         except (
             AuthenticationError,
             ProbableAuthenticationError,
@@ -233,6 +237,98 @@ class GXAgent:
         if subscriber is not None:
             subscriber.close()

+    def _install_signal_handlers(self) -> None:
+        """Install signal handlers to log when the process receives shutdown signals."""
+        original_sigterm = signal.getsignal(signal.SIGTERM)
+        original_sigint = signal.getsignal(signal.SIGINT)
+
+        def sigterm_handler(signum: int, frame: Any) -> None:
+            self._log_signal_received("SIGTERM", signum)
+            if callable(original_sigterm):
+                original_sigterm(signum, frame)
+            elif original_sigterm == signal.SIG_DFL:
+                raise SystemExit(128 + signum)
+
+        def sigint_handler(signum: int, frame: Any) -> None:
+            self._log_signal_received("SIGINT", signum)
+            if callable(original_sigint):
+                original_sigint(signum, frame)
+            elif original_sigint == signal.SIG_DFL:
+                raise KeyboardInterrupt
+
+        signal.signal(signal.SIGTERM, sigterm_handler)
+        signal.signal(signal.SIGINT, sigint_handler)
+
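The handlers above chain to whatever handler was previously installed instead of clobbering it, and emulate the conventional death-by-signal exit code when the prior disposition was SIG_DFL. A minimal standalone sketch of that chaining pattern, standard library only (names here are illustrative):

    import signal
    import sys

    previous = signal.getsignal(signal.SIGTERM)  # callable, SIG_DFL, or SIG_IGN

    def logging_handler(signum, frame):
        print(f"received signal {signum}", file=sys.stderr)  # stand-in for structured logging
        if callable(previous):
            previous(signum, frame)  # delegate to the handler installed before us
        elif previous == signal.SIG_DFL:
            raise SystemExit(128 + signum)  # conventional exit code for a fatal signal

    signal.signal(signal.SIGTERM, logging_handler)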
+    def _log_signal_received(self, signal_name: str, signum: int) -> None:
+        """Log when a shutdown signal is received, including current job info."""
+        memory_mb = self._get_memory_usage_mb()
+        LOGGER.warning(
+            f"Received {signal_name} signal - shutting down",
+            extra={
+                "signal": signal_name,
+                "signal_number": signum,
+                "hostname": socket.gethostname(),
+                "current_job_correlation_id": self._current_job_correlation_id,
+                "job_elapsed_seconds": (
+                    time.time() - self._current_job_start_time
+                    if self._current_job_start_time
+                    else None
+                ),
+                "memory_usage_mb": memory_mb,
+                "has_active_task": self._current_task is not None and not self._current_task.done(),
+            },
+        )
+
+    def _get_memory_usage_mb(self) -> float:
+        """Get current memory usage in MB using resource module."""
+        # ru_maxrss is in KB on Linux, bytes on macOS
+        usage = resource.getrusage(resource.RUSAGE_SELF)
+        # On macOS, ru_maxrss is in bytes; on Linux, it's in KB
+        if sys.platform == "darwin":
+            return usage.ru_maxrss / (1024 * 1024)
+        return usage.ru_maxrss / 1024
+
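One caveat worth knowing when reading these memory figures: ru_maxrss is the peak resident set size of the process, not its current usage, so the logged value can only grow over the life of the process. The unit also differs by platform, which is what the branch above handles. A quick standalone check:

    import resource
    import sys

    # ru_maxrss is kilobytes on Linux, bytes on macOS
    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    divisor = 1024 * 1024 if sys.platform == "darwin" else 1024
    print(f"peak RSS: {peak / divisor:.1f} MB")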
+    def _start_heartbeat(self, correlation_id: str, org_id: UUID, workspace_id: UUID) -> None:
+        """Start a background thread that logs periodic heartbeats during job processing."""
+        self._current_job_correlation_id = correlation_id
+        self._current_job_start_time = time.time()
+        self._heartbeat_stop_event = threading.Event()
+
+        def heartbeat_loop() -> None:
+            stop_event = self._heartbeat_stop_event
+            if stop_event is None:
+                return
+            while not stop_event.wait(timeout=self._HEARTBEAT_INTERVAL_SECONDS):
+                if stop_event.is_set():
+                    break
+                elapsed = time.time() - (self._current_job_start_time or time.time())
+                memory_mb = self._get_memory_usage_mb()
+                LOGGER.debug(
+                    "job.heartbeat",
+                    extra={
+                        "correlation_id": correlation_id,
+                        "organization_id": str(org_id),
+                        "workspace_id": str(workspace_id),
+                        "hostname": socket.gethostname(),
+                        "elapsed_seconds": round(elapsed, 1),
+                        "memory_usage_mb": round(memory_mb, 1),
+                    },
+                )
+
+        self._heartbeat_thread = threading.Thread(target=heartbeat_loop, daemon=True)
+        self._heartbeat_thread.start()
+
+    def _stop_heartbeat(self) -> None:
+        """Stop the heartbeat thread."""
+        if self._heartbeat_stop_event:
+            self._heartbeat_stop_event.set()
+        if self._heartbeat_thread and self._heartbeat_thread.is_alive():
+            self._heartbeat_thread.join(timeout=2)
+        self._heartbeat_thread = None
+        self._heartbeat_stop_event = None
+        self._current_job_correlation_id = None
+        self._current_job_start_time = None
+
     @classmethod
     def get_current_gx_agent_version(cls) -> str:
         version: str = metadata_version(cls._PYPI_GX_AGENT_PACKAGE_NAME)
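The heartbeat relies on threading.Event.wait(timeout=...) doubling as an interruptible sleep: it returns False when the interval elapses (keep beating) and True as soon as the flag is set (stop at once, without waiting out the interval). A minimal sketch of the same stoppable-loop pattern:

    import threading
    import time

    def start_heartbeat(interval: float) -> threading.Event:
        stop = threading.Event()

        def loop() -> None:
            started = time.time()
            while not stop.wait(timeout=interval):  # False means the timeout elapsed
                print(f"heartbeat at {time.time() - started:.1f}s")

        threading.Thread(target=loop, daemon=True).start()
        return stop

    stop = start_heartbeat(1.0)
    time.sleep(3.5)
    stop.set()  # wakes the waiting thread immediately and ends the loop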
@@ -252,8 +348,26 @@ class GXAgent:
         Args:
             event_context: An Event with related properties and actions.
         """
+        # Track how many times this correlation_id has been seen BY THIS POD (for local diagnostics)
+        # Note: event_context.redelivered is set by RabbitMQ and indicates cross-pod redelivery
+        local_delivery_count = self._correlation_ids.get(event_context.correlation_id, 0)
+
         if self._reject_correlation_id(event_context.correlation_id) is True:
-            # this event has been redelivered too many times - remove it from circulation
+            # this event has been redelivered too many times to THIS pod - remove it from circulation
+            LOGGER.error(
+                "Message redelivered too many times to this pod, removing from queue",
+                extra={
+                    "event_type": event_context.event.type,
+                    "correlation_id": event_context.correlation_id,
+                    "organization_id": self.get_organization_id(event_context),
+                    "workspace_id": str(self.get_workspace_id(event_context)),
+                    "schedule_id": event_context.event.schedule_id
+                    if isinstance(event_context.event, ScheduledEventBase)
+                    else None,
+                    "local_delivery_count": local_delivery_count,
+                    "redelivered": event_context.redelivered,
+                },
+            )
             event_context.processed_with_failures()
             return
         elif self._can_accept_new_task() is not True:
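The pod-local counter comes from the defaultdict initialized in __init__; the threshold itself lives in _reject_correlation_id, which is outside this diff. A sketch of the counting idea, with a hypothetical threshold:

    from collections import defaultdict

    MAX_LOCAL_DELIVERIES = 3  # hypothetical; the real limit is not shown in this diff
    deliveries: defaultdict[str, int] = defaultdict(int)

    def reject(correlation_id: str) -> bool:
        deliveries[correlation_id] += 1
        return deliveries[correlation_id] > MAX_LOCAL_DELIVERIES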
@@ -263,9 +377,11 @@
                     "event_type": event_context.event.type,
                     "correlation_id": event_context.correlation_id,
                     "organization_id": self.get_organization_id(event_context),
+                    "workspace_id": str(self.get_workspace_id(event_context)),
                     "schedule_id": event_context.event.schedule_id
                     if isinstance(event_context.event, ScheduledEventBase)
                     else None,
+                    "redelivered": event_context.redelivered,
                 },
             )
             # request that this message is redelivered later
@@ -274,6 +390,20 @@
             self._redeliver_msg_task = loop.create_task(event_context.redeliver_message())
             return

+        if event_context.redelivered:
+            LOGGER.warning(
+                "rabbitmq.message.redelivered",
+                extra={
+                    "event_type": event_context.event.type,
+                    "correlation_id": event_context.correlation_id,
+                    "organization_id": self.get_organization_id(event_context),
+                    "workspace_id": str(self.get_workspace_id(event_context)),
+                    "schedule_id": event_context.event.schedule_id
+                    if isinstance(event_context.event, ScheduledEventBase)
+                    else None,
+                },
+            )
+
         self._current_task = self._executor.submit(
             self._handle_event,
             event_context=event_context,
@@ -287,8 +417,23 @@
         self._current_task.add_done_callback(on_exit_callback)

     def get_data_context(self, event_context: EventContext) -> CloudDataContext:
-        """
-
+        """Create a new CloudDataContext for each job using the event's workspace_id."""
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="You are using great_expectations version")
+            workspace_id = self.get_workspace_id(event_context)
+
+            LOGGER.debug("Loading a DataContext - this might take a moment.")
+
+            context: CloudDataContext = get_context(
+                cloud_mode=True,
+                user_agent_str=self.user_agent_str,
+                cloud_workspace_id=str(workspace_id),
+            )
+            self._configure_progress_bars(data_context=context)
+
+            LOGGER.debug("DataContext is ready.")
+
+            return context

     def get_organization_id(self, event_context: EventContext) -> UUID:
         """Helper method to get the organization ID. Overridden in GX-Runner."""
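warnings.catch_warnings() scopes the version-mismatch filter to the context creation only; once the block exits, the previous warning filters are restored. A standalone illustration of that scoping:

    import warnings

    def noisy() -> None:
        warnings.warn("You are using great_expectations version X with Y", stacklevel=2)

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="You are using great_expectations version")
        noisy()  # suppressed: the message matches the filter prefix
    noisy()      # warns again: the filter was restored on exit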
@@ -298,6 +443,13 @@
         """Helper method to get the auth key. Overridden in GX-Runner."""
         return self._get_config().gx_cloud_access_token

+    def get_workspace_id(self, event_context: EventContext) -> UUID:
+        """Helper method to get the workspace ID from the event."""
+        workspace_id: UUID | None = getattr(event_context.event, "workspace_id", None)
+        if workspace_id is None:
+            raise GXAgentError()
+        return workspace_id
+
     def _set_sentry_tags(self, even_context: EventContext) -> None:
         """Used by GX-Runner to set tags for Sentry logging. No-op in the Agent."""
         pass
@@ -320,27 +472,38 @@
         )

         org_id = self.get_organization_id(event_context)
+        workspace_id = self.get_workspace_id(event_context)
         base_url = self._get_config().gx_cloud_base_url
         auth_key = self.get_auth_key()

         if isinstance(event_context.event, ScheduledEventBase):
-            self._create_scheduled_job_and_set_started(event_context, org_id)
+            self._create_scheduled_job_and_set_started(event_context, org_id, workspace_id)
         else:
             self._update_status(
-                correlation_id=event_context.correlation_id,
+                correlation_id=event_context.correlation_id,
+                status=JobStarted(),
+                org_id=org_id,
+                workspace_id=workspace_id,
             )
+        memory_mb = self._get_memory_usage_mb()
         LOGGER.info(
-            "
+            "job.started",
             extra={
                 "event_type": event_context.event.type,
                 "correlation_id": event_context.correlation_id,
                 "organization_id": str(org_id),
+                "workspace_id": str(workspace_id),
                 "schedule_id": event_context.event.schedule_id
                 if isinstance(event_context.event, ScheduledEventBase)
                 else None,
+                "hostname": socket.gethostname(),
+                "redelivered": event_context.redelivered,
+                "memory_usage_mb": round(memory_mb, 1),
             },
         )

+        self._start_heartbeat(event_context.correlation_id, org_id, workspace_id)
+
         self._set_sentry_tags(event_context)

         handler = EventHandler(context=data_context)
@@ -350,7 +513,7 @@
             id=event_context.correlation_id,
             base_url=base_url,
             auth_key=auth_key,
-            organization_id=org_id,
+            domain_context=DomainContext(organization_id=org_id, workspace_id=workspace_id),
         )
         return result

@@ -365,10 +528,30 @@
         """
         # warning: this method will not be executed in the main thread

+        # Calculate job duration before stopping heartbeat (which clears start time)
+        job_elapsed_time = (
+            time.time() - self._current_job_start_time if self._current_job_start_time else None
+        )
+        self._stop_heartbeat()
+
         org_id = self.get_organization_id(event_context)
+        workspace_id = self.get_workspace_id(event_context)
+
+        memory_mb = self._get_memory_usage_mb()
+        LOGGER.debug(
+            "job.thread_exiting",
+            extra={
+                "correlation_id": event_context.correlation_id,
+                "hostname": socket.gethostname(),
+                "has_exception": future.exception() is not None,
+                "cancelled": future.cancelled(),
+                "memory_usage_mb": round(memory_mb, 1),
+            },
+        )

         # get results or errors from the thread
         error = future.exception()
+
         if error is None:
             result: ActionResult = future.result()

@@ -379,15 +562,21 @@
                     error_stack_trace="The version of the GX Agent you are using does not support this functionality. Please upgrade to the most recent image tagged with `stable`.",
                     processed_by=self._get_processed_by(),
                 )
-                LOGGER.
-                    "
+                LOGGER.warning(
+                    "job.completed",
                     extra={
                         "event_type": event_context.event.type,
-                        "
+                        "correlation_id": event_context.correlation_id,
+                        "job_duration": job_elapsed_time,
+                        "success": False,
                         "organization_id": str(org_id),
+                        "workspace_id": str(workspace_id),
                         "schedule_id": event_context.event.schedule_id
                         if isinstance(event_context.event, ScheduledEventBase)
                         else None,
+                        "hostname": socket.gethostname(),
+                        "error_type": "UnknownEvent",
+                        "error_message": "Agent does not support this event type. Upgrade required.",
                     },
                 )
             else:
@@ -397,34 +586,49 @@
                     processed_by=self._get_processed_by(),
                 )
                 LOGGER.info(
-                    "
+                    "job.completed",
                     extra={
                         "event_type": event_context.event.type,
                         "correlation_id": event_context.correlation_id,
                         "job_duration": (
                             result.job_duration.total_seconds() if result.job_duration else None
                         ),
+                        "success": True,
                         "organization_id": str(org_id),
+                        "workspace_id": str(workspace_id),
                         "schedule_id": event_context.event.schedule_id
                         if isinstance(event_context.event, ScheduledEventBase)
                         else None,
+                        "hostname": socket.gethostname(),
                     },
                 )
         else:
-            status = build_failed_job_completed_status(error)
+            status = build_failed_job_completed_status(error, processed_by=self._get_processed_by())
             LOGGER.info(traceback.format_exc())
-            LOGGER.
-                "
+            LOGGER.warning(
+                "job.completed",
                 extra={
                     "event_type": event_context.event.type,
                     "correlation_id": event_context.correlation_id,
+                    "job_duration": job_elapsed_time,
+                    "success": False,
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
+                    "schedule_id": event_context.event.schedule_id
+                    if isinstance(event_context.event, ScheduledEventBase)
+                    else None,
+                    "hostname": socket.gethostname(),
+                    "error_type": type(error).__name__,
+                    "error_message": str(error)[:500],  # Truncate to avoid huge logs
                 },
             )

         try:
             self._update_status(
-                correlation_id=event_context.correlation_id,
+                correlation_id=event_context.correlation_id,
+                status=status,
+                org_id=org_id,
+                workspace_id=workspace_id,
             )
         except Exception:
             LOGGER.exception(
@@ -433,6 +637,7 @@
                     "correlation_id": event_context.correlation_id,
                     "status": str(status),
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
                 },
             )
             # We do not want to cause an infinite loop of errors
@@ -552,7 +757,9 @@
             )
         )

-    def _update_status(
+    def _update_status(
+        self, correlation_id: str, status: JobStatus, org_id: UUID, workspace_id: UUID
+    ) -> None:
         """Update GX Cloud on the status of a job.

         Args:
@@ -565,11 +772,12 @@
                 "correlation_id": correlation_id,
                 "status": str(status),
                 "organization_id": str(org_id),
+                "workspace_id": str(workspace_id),
             },
         )
         agent_sessions_url = urljoin(
             self._get_config().gx_cloud_base_url,
-            f"/api/v1/organizations/{org_id}/agent-jobs/{correlation_id}",
+            f"/api/v1/organizations/{org_id}/workspaces/{workspace_id}/agent-jobs/{correlation_id}",
         )
         with create_session(access_token=self.get_auth_key()) as session:
             data = UpdateJobStatusRequest(data=status).json()
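The new URL template threads workspace_id into the job-status endpoint. Note the urljoin behavior these calls rely on: because the second argument starts with "/", it replaces any path on the configured base URL rather than appending to it. A quick illustration with a made-up base URL:

    from urllib.parse import urljoin

    base = "https://cloud.example.test/proxy"  # illustrative base URL
    path = "/api/v1/organizations/ORG/workspaces/WS/agent-jobs/CORR"
    print(urljoin(base, path))
    # https://cloud.example.test/api/v1/organizations/ORG/workspaces/WS/agent-jobs/CORR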
@@ -580,6 +788,7 @@
                     "correlation_id": correlation_id,
                     "status": str(status),
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
                 },
             )
         GXAgent._log_http_error(
@@ -587,7 +796,7 @@
         )

     def _create_scheduled_job_and_set_started(
-        self, event_context: EventContext, org_id: UUID
+        self, event_context: EventContext, org_id: UUID, workspace_id: UUID
     ) -> None:
         """Create a job in GX Cloud for scheduled events.

@@ -609,13 +818,14 @@
                 "correlation_id": str(event_context.correlation_id),
                 "event_type": str(event_context.event.type),
                 "organization_id": str(org_id),
+                "workspace_id": str(workspace_id),
                 "schedule_id": str(event_context.event.schedule_id),
             },
         )

         agent_sessions_url = urljoin(
             self._get_config().gx_cloud_base_url,
-            f"/api/v1/organizations/{org_id}/agent-jobs",
+            f"/api/v1/organizations/{org_id}/workspaces/{workspace_id}/agent-jobs",
         )
         data = CreateScheduledJobAndSetJobStarted(
             type="run_scheduled_checkpoint.received",
@@ -629,6 +839,31 @@
         with create_session(access_token=self.get_auth_key()) as session:
             payload = CreateScheduledJobAndSetJobStartedRequest(data=data).json()
             response = session.post(agent_sessions_url, data=payload)
+
+            if response.status_code == HTTPStatus.BAD_REQUEST:
+                try:
+                    response_body = response.json()
+                except Exception:
+                    response_body = response.text
+                LOGGER.warning(
+                    "Job already exists - this message was likely redelivered by RabbitMQ "
+                    "after another runner already claimed it. Continuing to process anyway "
+                    "as a safety measure in case the original runner failed.",
+                    extra={
+                        "correlation_id": str(event_context.correlation_id),
+                        "event_type": str(event_context.event.type),
+                        "organization_id": str(org_id),
+                        "schedule_id": str(event_context.event.schedule_id),
+                        "workspace_id": str(workspace_id),
+                        "response_status": response.status_code,
+                        "response_body": response_body,
+                    },
+                )
+                # Note: We intentionally continue processing instead of NACKing.
+                # This ensures job completion even if the first runner fails.
+                # TODO: Once we add inProgress timeout in Mercury, we can
+                # safely NACK here to prevent duplicate processing.
+
         LOGGER.info(
             "Created scheduled job and set started",
             extra={
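Treating HTTP 400 here as "job already claimed" makes re-processing idempotent from the agent's point of view. HTTPStatus members are IntEnums, so the comparison against the plain integer status_code that requests exposes works directly:

    from http import HTTPStatus

    assert HTTPStatus.BAD_REQUEST == 400  # IntEnum compares equal to plain ints
    print(HTTPStatus.BAD_REQUEST.phrase)  # "Bad Request"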
@@ -636,6 +871,8 @@
                 "event_type": str(event_context.event.type),
                 "organization_id": str(org_id),
                 "schedule_id": str(event_context.event.schedule_id),
+                "workspace_id": str(workspace_id),
+                "response_status": response.status_code,
             },
         )
         GXAgent._log_http_error(
@@ -658,7 +895,9 @@
         """
         Sets headers on all stores in the data context.
         """
-        from great_expectations.data_context.store.gx_cloud_store_backend import
+        from great_expectations.data_context.store.gx_cloud_store_backend import (  # noqa: PLC0415
+            GXCloudStoreBackend,
+        )

         # OSS doesn't use the same session for all requests, so we need to set the header for each store
         stores = list(data_context.stores.values())
@@ -686,8 +925,6 @@
         Note: the Agent-Job-Id header value will be set for all GX Cloud request until this method is
         called again.
         """
-        from great_expectations import __version__  # noqa: PLC0415
-        from great_expectations.core import http  # noqa: PLC0415

         header_name = self.get_header_name()
         user_agent_header_value = self.user_agent_str
great_expectations_cloud/agent/event_handler.py

@@ -15,6 +15,7 @@ from pydantic import v1 as pydantic_v1
 from great_expectations_cloud.agent.actions.unknown import UnknownEventAction
 from great_expectations_cloud.agent.exceptions import GXAgentError
 from great_expectations_cloud.agent.models import (
+    DomainContext,
     Event,
     EventType,
     UnknownEvent,
@@ -67,11 +68,11 @@ class EventHandler:
         self._context = context

     def get_event_action(
-        self, event: Event, base_url: str, auth_key: str,
+        self, event: Event, base_url: str, auth_key: str, domain_context: DomainContext
     ) -> AgentAction[Any]:
         """Get the action that should be run for the given event."""

-        if not self._check_event_organization_id(event, organization_id):
+        if not self._check_event_organization_id(event, domain_context.organization_id):
             # Making message more generic
             raise GXAgentError("Unable to process job. Invalid input.")  # noqa: TRY003

@@ -84,17 +85,17 @@
         return action_class(
             context=self._context,
             base_url=base_url,
-
+            domain_context=domain_context,
             auth_key=auth_key,
         )

-    def handle_event(
-        self, event: Event, id: str, base_url: str, auth_key: str,
+    def handle_event(
+        self, event: Event, id: str, base_url: str, auth_key: str, domain_context: DomainContext
     ) -> ActionResult:
-        start_time = datetime.now(tz=timezone.utc)
         """Transform an Event into an ActionResult."""
+        start_time = datetime.now(tz=timezone.utc)
         action = self.get_event_action(
-            event=event, base_url=base_url, auth_key=auth_key,
+            event=event, base_url=base_url, auth_key=auth_key, domain_context=domain_context
         )
         LOGGER.info(f"Handling event: {event.type} -> {action.__class__.__name__}")
         action_result = action.run(event=event, id=id)