great-expectations-cloud 20250811.1.dev0__py3-none-any.whl → 20260113.0.dev1__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Note: this release of great-expectations-cloud has been flagged as potentially problematic.
Files changed (19)
  1. great_expectations_cloud/agent/actions/agent_action.py +3 -3
  2. great_expectations_cloud/agent/actions/draft_datasource_config_action.py +2 -2
  3. great_expectations_cloud/agent/actions/generate_data_quality_check_expectations_action.py +47 -24
  4. great_expectations_cloud/agent/actions/list_asset_names.py +4 -5
  5. great_expectations_cloud/agent/actions/run_checkpoint.py +64 -3
  6. great_expectations_cloud/agent/actions/run_metric_list_action.py +3 -3
  7. great_expectations_cloud/agent/actions/run_scheduled_checkpoint.py +28 -5
  8. great_expectations_cloud/agent/actions/run_window_checkpoint.py +2 -4
  9. great_expectations_cloud/agent/actions/utils.py +13 -4
  10. great_expectations_cloud/agent/agent.py +259 -36
  11. great_expectations_cloud/agent/event_handler.py +8 -7
  12. great_expectations_cloud/agent/message_service/asyncio_rabbit_mq_client.py +33 -8
  13. great_expectations_cloud/agent/message_service/subscriber.py +4 -0
  14. great_expectations_cloud/agent/models.py +13 -0
  15. {great_expectations_cloud-20250811.1.dev0.dist-info → great_expectations_cloud-20260113.0.dev1.dist-info}/METADATA +7 -5
  16. {great_expectations_cloud-20250811.1.dev0.dist-info → great_expectations_cloud-20260113.0.dev1.dist-info}/RECORD +19 -19
  17. {great_expectations_cloud-20250811.1.dev0.dist-info → great_expectations_cloud-20260113.0.dev1.dist-info}/WHEEL +1 -1
  18. {great_expectations_cloud-20250811.1.dev0.dist-info → great_expectations_cloud-20260113.0.dev1.dist-info}/entry_points.txt +0 -0
  19. {great_expectations_cloud-20250811.1.dev0.dist-info → great_expectations_cloud-20260113.0.dev1.dist-info/licenses}/LICENSE +0 -0
great_expectations_cloud/agent/agent.py

@@ -2,13 +2,20 @@ from __future__ import annotations
 
 import asyncio
 import logging
+import os
+import resource
+import signal
+import socket
 import sys
+import threading
+import time
 import traceback
 import warnings
 from collections import defaultdict
 from concurrent.futures import Future
 from concurrent.futures.thread import ThreadPoolExecutor
 from functools import partial
+from http import HTTPStatus
 from importlib.metadata import version as metadata_version
 from typing import TYPE_CHECKING, Any, Callable, Final, Literal
 from urllib.parse import urljoin, urlparse
@@ -16,13 +23,12 @@ from uuid import UUID
 
 import orjson
 import requests
+from great_expectations import __version__, get_context
+from great_expectations.core import http
 from great_expectations.core.http import create_session
 from great_expectations.data_context.cloud_constants import CLOUD_DEFAULT_BASE_URL
-from great_expectations.data_context.data_context.context_factory import get_context
 from great_expectations.data_context.types.base import ProgressBarsConfig
-from pika.adapters.utils.connection_workflow import (
-    AMQPConnectorException,
-)
+from pika.adapters.utils.connection_workflow import AMQPConnectorException
 from pika.exceptions import (
     AMQPConnectionError,
     AMQPError,
@@ -45,9 +51,7 @@ from great_expectations_cloud.agent.config import (
     generate_config_validation_error_text,
 )
 from great_expectations_cloud.agent.constants import USER_AGENT_HEADER, HeaderName
-from great_expectations_cloud.agent.event_handler import (
-    EventHandler,
-)
+from great_expectations_cloud.agent.event_handler import EventHandler
 from great_expectations_cloud.agent.exceptions import (
     GXAgentConfigError,
     GXAgentError,
@@ -67,6 +71,7 @@ from great_expectations_cloud.agent.models import (
     AgentBaseExtraForbid,
     CreateScheduledJobAndSetJobStarted,
     CreateScheduledJobAndSetJobStartedRequest,
+    DomainContext,
     JobCompleted,
     JobStarted,
     JobStatus,
@@ -138,6 +143,9 @@ class GXAgent:
     _PYPI_GX_AGENT_PACKAGE_NAME = "great_expectations_cloud"
     _PYPI_GREAT_EXPECTATIONS_PACKAGE_NAME = "great_expectations"
 
+    # Heartbeat interval in seconds (log progress every 60 seconds during job processing)
+    _HEARTBEAT_INTERVAL_SECONDS = 60
+
     def __init__(self: Self):
         self._config = self._create_config()
 
@@ -150,19 +158,6 @@ class GXAgent:
                 "great_expectations_version": great_expectations_version,
             },
         )
-        LOGGER.debug("Loading a DataContext - this might take a moment.")
-
-        with warnings.catch_warnings():
-            # suppress warnings about GX version
-            warnings.filterwarnings("ignore", message="You are using great_expectations version")
-            self._context: CloudDataContext = get_context(
-                cloud_mode=True,
-                user_agent_str=self.user_agent_str,
-            )
-            self._configure_progress_bars(data_context=self._context)
-            LOGGER.debug("DataContext is ready.")
-
-        self._set_http_session_headers(data_context=self._context)
 
         # Create a thread pool with a single worker, so we can run long-lived
         # GX processes and maintain our connection to the broker. Note that
@@ -174,6 +169,15 @@ class GXAgent:
         self._correlation_ids: defaultdict[str, int] = defaultdict(lambda: 0)
         self._listen_tries = 0
 
+        # Heartbeat tracking
+        self._heartbeat_stop_event: threading.Event | None = None
+        self._heartbeat_thread: threading.Thread | None = None
+        self._current_job_correlation_id: str | None = None
+        self._current_job_start_time: float | None = None
+
+        # Install signal handlers for graceful shutdown logging
+        self._install_signal_handlers()
+
     def run(self) -> None:
         """Open a connection to GX Cloud."""
 
@@ -218,7 +222,7 @@
             LOGGER.exception("The connection to GX Cloud has encountered an error.")
         except GXAgentUnrecoverableConnectionError:
             LOGGER.exception("The connection to GX Cloud has encountered an unrecoverable error.")
-            sys.exit(1)
+            os.kill(os.getpid(), signal.SIGTERM)
         except (
            AuthenticationError,
            ProbableAuthenticationError,
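
Note on the sys.exit(1) to os.kill change above: SystemExit raised via sys.exit only unwinds the thread that raised it, and it bypasses the SIGTERM handler that this release installs in __init__. Signalling the whole process instead routes the unrecoverable-connection shutdown through that handler, so the same structured shutdown log is emitted regardless of where the failure occurs. A minimal standalone sketch of the pattern (the names here are illustrative, not from the package):

    import os
    import signal
    import sys

    def handler(signum: int, frame) -> None:
        # Emit shutdown diagnostics, then mimic the conventional signal exit code.
        print(f"received signal {signum}, shutting down", file=sys.stderr)
        raise SystemExit(128 + signum)

    signal.signal(signal.SIGTERM, handler)

    # sys.exit(1) called from a worker thread would only end that thread;
    # signalling our own process runs the handler above in the main thread.
    os.kill(os.getpid(), signal.SIGTERM)
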
@@ -233,6 +237,98 @@
             if subscriber is not None:
                 subscriber.close()
 
+    def _install_signal_handlers(self) -> None:
+        """Install signal handlers to log when the process receives shutdown signals."""
+        original_sigterm = signal.getsignal(signal.SIGTERM)
+        original_sigint = signal.getsignal(signal.SIGINT)
+
+        def sigterm_handler(signum: int, frame: Any) -> None:
+            self._log_signal_received("SIGTERM", signum)
+            if callable(original_sigterm):
+                original_sigterm(signum, frame)
+            elif original_sigterm == signal.SIG_DFL:
+                raise SystemExit(128 + signum)
+
+        def sigint_handler(signum: int, frame: Any) -> None:
+            self._log_signal_received("SIGINT", signum)
+            if callable(original_sigint):
+                original_sigint(signum, frame)
+            elif original_sigint == signal.SIG_DFL:
+                raise KeyboardInterrupt
+
+        signal.signal(signal.SIGTERM, sigterm_handler)
+        signal.signal(signal.SIGINT, sigint_handler)
+
+    def _log_signal_received(self, signal_name: str, signum: int) -> None:
+        """Log when a shutdown signal is received, including current job info."""
+        memory_mb = self._get_memory_usage_mb()
+        LOGGER.warning(
+            f"Received {signal_name} signal - shutting down",
+            extra={
+                "signal": signal_name,
+                "signal_number": signum,
+                "hostname": socket.gethostname(),
+                "current_job_correlation_id": self._current_job_correlation_id,
+                "job_elapsed_seconds": (
+                    time.time() - self._current_job_start_time
+                    if self._current_job_start_time
+                    else None
+                ),
+                "memory_usage_mb": memory_mb,
+                "has_active_task": self._current_task is not None and not self._current_task.done(),
+            },
+        )
+
+    def _get_memory_usage_mb(self) -> float:
+        """Get current memory usage in MB using the resource module."""
+        usage = resource.getrusage(resource.RUSAGE_SELF)
+        # On macOS, ru_maxrss is in bytes; on Linux, it's in KB
+        if sys.platform == "darwin":
+            return usage.ru_maxrss / (1024 * 1024)
+        return usage.ru_maxrss / 1024
+
+    def _start_heartbeat(self, correlation_id: str, org_id: UUID, workspace_id: UUID) -> None:
+        """Start a background thread that logs periodic heartbeats during job processing."""
+        self._current_job_correlation_id = correlation_id
+        self._current_job_start_time = time.time()
+        self._heartbeat_stop_event = threading.Event()
+
+        def heartbeat_loop() -> None:
+            stop_event = self._heartbeat_stop_event
+            if stop_event is None:
+                return
+            while not stop_event.wait(timeout=self._HEARTBEAT_INTERVAL_SECONDS):
+                if stop_event.is_set():
+                    break
+                elapsed = time.time() - (self._current_job_start_time or time.time())
+                memory_mb = self._get_memory_usage_mb()
+                LOGGER.info(
+                    "Job heartbeat - still processing",
+                    extra={
+                        "correlation_id": correlation_id,
+                        "organization_id": str(org_id),
+                        "workspace_id": str(workspace_id),
+                        "hostname": socket.gethostname(),
+                        "elapsed_seconds": round(elapsed, 1),
+                        "memory_usage_mb": round(memory_mb, 1),
+                    },
+                )
+
+        self._heartbeat_thread = threading.Thread(target=heartbeat_loop, daemon=True)
+        self._heartbeat_thread.start()
+
+    def _stop_heartbeat(self) -> None:
+        """Stop the heartbeat thread."""
+        if self._heartbeat_stop_event:
+            self._heartbeat_stop_event.set()
+        if self._heartbeat_thread and self._heartbeat_thread.is_alive():
+            self._heartbeat_thread.join(timeout=2)
+        self._heartbeat_thread = None
+        self._heartbeat_stop_event = None
+        self._current_job_correlation_id = None
+        self._current_job_start_time = None
+
     @classmethod
     def get_current_gx_agent_version(cls) -> str:
         version: str = metadata_version(cls._PYPI_GX_AGENT_PACKAGE_NAME)
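
Two details in the block above are worth calling out. First, _get_memory_usage_mb must branch on platform because resource.getrusage reports ru_maxrss in kilobytes on Linux but in bytes on macOS. Second, the heartbeat loop uses threading.Event.wait(timeout=...) as an interruptible sleep: _stop_heartbeat can wake the thread immediately rather than waiting out the remainder of the 60-second interval. A minimal standalone sketch of that pattern (illustrative names, shortened interval):

    import threading
    import time

    def start_heartbeat(interval_seconds: float) -> tuple[threading.Event, threading.Thread]:
        stop = threading.Event()
        started = time.time()

        def loop() -> None:
            # Event.wait returns False on timeout (keep looping) and True as
            # soon as stop.set() is called, ending the loop without delay.
            while not stop.wait(timeout=interval_seconds):
                print(f"heartbeat: {time.time() - started:.1f}s elapsed")

        thread = threading.Thread(target=loop, daemon=True)
        thread.start()
        return stop, thread

    stop, thread = start_heartbeat(interval_seconds=1.0)
    time.sleep(3.5)   # prints roughly three heartbeats
    stop.set()        # wakes the loop immediately
    thread.join(timeout=2)
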
@@ -252,8 +348,26 @@
         Args:
             event_context: An Event with related properties and actions.
         """
+        # Track how many times this correlation_id has been seen BY THIS POD (for local diagnostics)
+        # Note: event_context.redelivered is set by RabbitMQ and indicates cross-pod redelivery
+        local_delivery_count = self._correlation_ids.get(event_context.correlation_id, 0)
+
         if self._reject_correlation_id(event_context.correlation_id) is True:
-            # this event has been redelivered too many times - remove it from circulation
+            # this event has been redelivered too many times to THIS pod - remove it from circulation
+            LOGGER.error(
+                "Message redelivered too many times to this pod, removing from queue",
+                extra={
+                    "event_type": event_context.event.type,
+                    "correlation_id": event_context.correlation_id,
+                    "organization_id": self.get_organization_id(event_context),
+                    "workspace_id": str(self.get_workspace_id(event_context)),
+                    "schedule_id": event_context.event.schedule_id
+                    if isinstance(event_context.event, ScheduledEventBase)
+                    else None,
+                    "local_delivery_count": local_delivery_count,
+                    "redelivered": event_context.redelivered,
+                },
+            )
             event_context.processed_with_failures()
             return
         elif self._can_accept_new_task() is not True:
@@ -263,9 +377,11 @@
                     "event_type": event_context.event.type,
                     "correlation_id": event_context.correlation_id,
                     "organization_id": self.get_organization_id(event_context),
+                    "workspace_id": str(self.get_workspace_id(event_context)),
                     "schedule_id": event_context.event.schedule_id
                    if isinstance(event_context.event, ScheduledEventBase)
                    else None,
+                    "redelivered": event_context.redelivered,
                 },
             )
             # request that this message is redelivered later
@@ -274,6 +390,21 @@
             self._redeliver_msg_task = loop.create_task(event_context.redeliver_message())
             return
 
+        if event_context.redelivered:
+            LOGGER.warning(
+                "Accepting redelivered message - another consumer failed to acknowledge",
+                extra={
+                    "event_type": event_context.event.type,
+                    "correlation_id": event_context.correlation_id,
+                    "organization_id": self.get_organization_id(event_context),
+                    "workspace_id": str(self.get_workspace_id(event_context)),
+                    "schedule_id": event_context.event.schedule_id
+                    if isinstance(event_context.event, ScheduledEventBase)
+                    else None,
+                    "redelivered": event_context.redelivered,
+                },
+            )
+
         self._current_task = self._executor.submit(
             self._handle_event,
             event_context=event_context,
@@ -287,8 +418,23 @@
         self._current_task.add_done_callback(on_exit_callback)
 
     def get_data_context(self, event_context: EventContext) -> CloudDataContext:
-        """Helper method to get a DataContext Agent. Overridden in GX-Runner."""
-        return self._context
+        """Create a new CloudDataContext for each job using the event's workspace_id."""
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="You are using great_expectations version")
+            workspace_id = self.get_workspace_id(event_context)
+
+            LOGGER.debug("Loading a DataContext - this might take a moment.")
+
+            context: CloudDataContext = get_context(
+                cloud_mode=True,
+                user_agent_str=self.user_agent_str,
+                cloud_workspace_id=str(workspace_id),
+            )
+            self._configure_progress_bars(data_context=context)
+
+            LOGGER.debug("DataContext is ready.")
+
+            return context
 
     def get_organization_id(self, event_context: EventContext) -> UUID:
         """Helper method to get the organization ID. Overridden in GX-Runner."""
@@ -298,6 +444,13 @@
         """Helper method to get the auth key. Overridden in GX-Runner."""
         return self._get_config().gx_cloud_access_token
 
+    def get_workspace_id(self, event_context: EventContext) -> UUID:
+        """Helper method to get the workspace ID from the event."""
+        workspace_id: UUID | None = getattr(event_context.event, "workspace_id", None)
+        if workspace_id is None:
+            raise GXAgentError()
+        return workspace_id
+
     def _set_sentry_tags(self, even_context: EventContext) -> None:
         """Used by GX-Runner to set tags for Sentry logging. No-op in the Agent."""
         pass
@@ -320,27 +473,38 @@
         )
 
         org_id = self.get_organization_id(event_context)
+        workspace_id = self.get_workspace_id(event_context)
         base_url = self._get_config().gx_cloud_base_url
         auth_key = self.get_auth_key()
 
         if isinstance(event_context.event, ScheduledEventBase):
-            self._create_scheduled_job_and_set_started(event_context, org_id)
+            self._create_scheduled_job_and_set_started(event_context, org_id, workspace_id)
         else:
             self._update_status(
-                correlation_id=event_context.correlation_id, status=JobStarted(), org_id=org_id
+                correlation_id=event_context.correlation_id,
+                status=JobStarted(),
+                org_id=org_id,
+                workspace_id=workspace_id,
             )
+        memory_mb = self._get_memory_usage_mb()
         LOGGER.info(
             "Starting job",
             extra={
                 "event_type": event_context.event.type,
                 "correlation_id": event_context.correlation_id,
                 "organization_id": str(org_id),
+                "workspace_id": str(workspace_id),
                 "schedule_id": event_context.event.schedule_id
                 if isinstance(event_context.event, ScheduledEventBase)
                 else None,
+                "hostname": socket.gethostname(),
+                "redelivered": event_context.redelivered,
+                "memory_usage_mb": round(memory_mb, 1),
             },
         )
 
+        self._start_heartbeat(event_context.correlation_id, org_id, workspace_id)
+
         self._set_sentry_tags(event_context)
 
         handler = EventHandler(context=data_context)
@@ -350,7 +514,7 @@
             id=event_context.correlation_id,
             base_url=base_url,
             auth_key=auth_key,
-            organization_id=org_id,
+            domain_context=DomainContext(organization_id=org_id, workspace_id=workspace_id),
         )
         return result
 
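
The DomainContext passed to handle_event above is new in this release; its definition lives in great_expectations_cloud/agent/models.py (+13 lines, not shown in the hunks here). Judging from the call sites (it is constructed with organization_id and workspace_id keywords and read via domain_context.organization_id), a plausible minimal shape would be the following. This is a hypothetical reconstruction, not the actual definition:

    from uuid import UUID

    from pydantic import v1 as pydantic_v1  # the package imports pydantic.v1 elsewhere

    class DomainContext(pydantic_v1.BaseModel):
        # Hypothetical sketch of the model added in models.py.
        organization_id: UUID
        workspace_id: UUID

    ctx = DomainContext(
        organization_id=UUID("00000000-0000-0000-0000-000000000001"),
        workspace_id=UUID("00000000-0000-0000-0000-000000000002"),
    )
    assert str(ctx.workspace_id).endswith("2")
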
@@ -365,10 +529,26 @@
         """
         # warning: this method will not be executed in the main thread
 
+        self._stop_heartbeat()
+
         org_id = self.get_organization_id(event_context)
+        workspace_id = self.get_workspace_id(event_context)
+
+        memory_mb = self._get_memory_usage_mb()
+        LOGGER.debug(
+            "Job thread exiting",
+            extra={
+                "correlation_id": event_context.correlation_id,
+                "hostname": socket.gethostname(),
+                "has_exception": future.exception() is not None,
+                "cancelled": future.cancelled(),
+                "memory_usage_mb": round(memory_mb, 1),
+            },
+        )
 
         # get results or errors from the thread
         error = future.exception()
+
         if error is None:
             result: ActionResult = future.result()
 
@@ -385,6 +565,7 @@
                     "event_type": event_context.event.type,
                     "id": event_context.correlation_id,
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
                     "schedule_id": event_context.event.schedule_id
                     if isinstance(event_context.event, ScheduledEventBase)
                     else None,
@@ -405,26 +586,35 @@
                         result.job_duration.total_seconds() if result.job_duration else None
                     ),
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
                     "schedule_id": event_context.event.schedule_id
                     if isinstance(event_context.event, ScheduledEventBase)
                     else None,
+                    "hostname": socket.gethostname(),
                 },
             )
         else:
             status = build_failed_job_completed_status(error)
             LOGGER.info(traceback.format_exc())
-            LOGGER.info(
+            LOGGER.warning(
                 "Job completed with error",
                 extra={
                     "event_type": event_context.event.type,
                     "correlation_id": event_context.correlation_id,
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
+                    "hostname": socket.gethostname(),
+                    "error_type": type(error).__name__,
+                    "error_message": str(error)[:500],  # Truncate to avoid huge logs
                 },
             )
 
         try:
             self._update_status(
-                correlation_id=event_context.correlation_id, status=status, org_id=org_id
+                correlation_id=event_context.correlation_id,
+                status=status,
+                org_id=org_id,
+                workspace_id=workspace_id,
             )
         except Exception:
             LOGGER.exception(
@@ -433,6 +623,7 @@
                     "correlation_id": event_context.correlation_id,
                     "status": str(status),
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
                 },
             )
             # We do not want to cause an infinite loop of errors
@@ -552,7 +743,9 @@
             )
         )
 
-    def _update_status(self, correlation_id: str, status: JobStatus, org_id: UUID) -> None:
+    def _update_status(
+        self, correlation_id: str, status: JobStatus, org_id: UUID, workspace_id: UUID
+    ) -> None:
         """Update GX Cloud on the status of a job.
 
         Args:
@@ -565,11 +758,12 @@
                 "correlation_id": correlation_id,
                 "status": str(status),
                 "organization_id": str(org_id),
+                "workspace_id": str(workspace_id),
             },
         )
         agent_sessions_url = urljoin(
             self._get_config().gx_cloud_base_url,
-            f"/api/v1/organizations/{org_id}/agent-jobs/{correlation_id}",
+            f"/api/v1/organizations/{org_id}/workspaces/{workspace_id}/agent-jobs/{correlation_id}",
         )
         with create_session(access_token=self.get_auth_key()) as session:
             data = UpdateJobStatusRequest(data=status).json()
@@ -580,6 +774,7 @@
                     "correlation_id": correlation_id,
                     "status": str(status),
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
                 },
             )
             GXAgent._log_http_error(
@@ -587,7 +782,7 @@
         )
 
     def _create_scheduled_job_and_set_started(
-        self, event_context: EventContext, org_id: UUID
+        self, event_context: EventContext, org_id: UUID, workspace_id: UUID
     ) -> None:
         """Create a job in GX Cloud for scheduled events.
 
@@ -609,13 +804,14 @@
                 "correlation_id": str(event_context.correlation_id),
                 "event_type": str(event_context.event.type),
                 "organization_id": str(org_id),
+                "workspace_id": str(workspace_id),
                 "schedule_id": str(event_context.event.schedule_id),
             },
         )
 
         agent_sessions_url = urljoin(
             self._get_config().gx_cloud_base_url,
-            f"/api/v1/organizations/{org_id}/agent-jobs",
+            f"/api/v1/organizations/{org_id}/workspaces/{workspace_id}/agent-jobs",
         )
         data = CreateScheduledJobAndSetJobStarted(
             type="run_scheduled_checkpoint.received",
@@ -629,6 +825,31 @@
         with create_session(access_token=self.get_auth_key()) as session:
             payload = CreateScheduledJobAndSetJobStartedRequest(data=data).json()
             response = session.post(agent_sessions_url, data=payload)
+
+            if response.status_code == HTTPStatus.BAD_REQUEST:
+                try:
+                    response_body = response.json()
+                except Exception:
+                    response_body = response.text
+                LOGGER.warning(
+                    "Job already exists - this message was likely redelivered by RabbitMQ "
+                    "after another runner already claimed it. Continuing to process anyway "
+                    "as a safety measure in case the original runner failed.",
+                    extra={
+                        "correlation_id": str(event_context.correlation_id),
+                        "event_type": str(event_context.event.type),
+                        "organization_id": str(org_id),
+                        "schedule_id": str(event_context.event.schedule_id),
+                        "workspace_id": str(workspace_id),
+                        "response_status": response.status_code,
+                        "response_body": response_body,
+                    },
+                )
+                # Note: We intentionally continue processing instead of NACKing.
+                # This ensures job completion even if the first runner fails.
+                # TODO: Once we add inProgress timeout in Mercury, we can
+                # safely NACK here to prevent duplicate processing.
+
            LOGGER.info(
                "Created scheduled job and set started",
                extra={
@@ -636,6 +857,8 @@
                     "event_type": str(event_context.event.type),
                     "organization_id": str(org_id),
                     "schedule_id": str(event_context.event.schedule_id),
+                    "workspace_id": str(workspace_id),
+                    "response_status": response.status_code,
                 },
             )
             GXAgent._log_http_error(
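
The HTTPStatus.BAD_REQUEST branch above makes scheduled-job creation tolerant of RabbitMQ redelivery: a 400 from the agent-jobs endpoint is read as "another runner already created this job", logged with whatever body the server returned, and processing continues. The defensive body parsing matters because the error response is not guaranteed to be JSON. A small hypothetical helper showing the same parse-then-fall-back pattern with requests:

    from http import HTTPStatus

    import requests

    def describe_conflict(response: requests.Response) -> str | None:
        """Return a printable body for a 400 response, else None (hypothetical helper)."""
        if response.status_code != HTTPStatus.BAD_REQUEST:
            return None
        try:
            # Prefer structured detail when the body parses as JSON.
            return str(response.json())
        except ValueError:
            # Fall back to raw text for HTML or plain-text error pages.
            return response.text
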
@@ -658,7 +881,9 @@
         """
         Sets headers on all stores in the data context.
         """
-        from great_expectations.data_context.store.gx_cloud_store_backend import GXCloudStoreBackend  # noqa: I001, PLC0415
+        from great_expectations.data_context.store.gx_cloud_store_backend import (  # noqa: PLC0415
+            GXCloudStoreBackend,
+        )
 
         # OSS doesn't use the same session for all requests, so we need to set the header for each store
         stores = list(data_context.stores.values())
@@ -686,8 +911,6 @@
         Note: the Agent-Job-Id header value will be set for all GX Cloud request until this method is
         called again.
         """
-        from great_expectations import __version__  # noqa: PLC0415
-        from great_expectations.core import http  # noqa: PLC0415
 
         header_name = self.get_header_name()
         user_agent_header_value = self.user_agent_str
great_expectations_cloud/agent/event_handler.py

@@ -15,6 +15,7 @@ from pydantic import v1 as pydantic_v1
 from great_expectations_cloud.agent.actions.unknown import UnknownEventAction
 from great_expectations_cloud.agent.exceptions import GXAgentError
 from great_expectations_cloud.agent.models import (
+    DomainContext,
     Event,
     EventType,
     UnknownEvent,
@@ -67,11 +68,11 @@ class EventHandler:
         self._context = context
 
     def get_event_action(
-        self, event: Event, base_url: str, auth_key: str, organization_id: UUID
+        self, event: Event, base_url: str, auth_key: str, domain_context: DomainContext
     ) -> AgentAction[Any]:
         """Get the action that should be run for the given event."""
 
-        if not self._check_event_organization_id(event, organization_id):
+        if not self._check_event_organization_id(event, domain_context.organization_id):
             # Making message more generic
             raise GXAgentError("Unable to process job. Invalid input.")  # noqa: TRY003
 
@@ -84,17 +85,17 @@
         return action_class(
             context=self._context,
             base_url=base_url,
-            organization_id=organization_id,
+            domain_context=domain_context,
             auth_key=auth_key,
         )
 
-    def handle_event(  # Refactor opportunity
-        self, event: Event, id: str, base_url: str, auth_key: str, organization_id: UUID
+    def handle_event(
+        self, event: Event, id: str, base_url: str, auth_key: str, domain_context: DomainContext
     ) -> ActionResult:
-        start_time = datetime.now(tz=timezone.utc)
         """Transform an Event into an ActionResult."""
+        start_time = datetime.now(tz=timezone.utc)
         action = self.get_event_action(
-            event=event, base_url=base_url, auth_key=auth_key, organization_id=organization_id
+            event=event, base_url=base_url, auth_key=auth_key, domain_context=domain_context
         )
         LOGGER.info(f"Handling event: {event.type} -> {action.__class__.__name__}")
         action_result = action.run(event=event, id=id)
great_expectations_cloud/agent/message_service/subscriber.py

@@ -27,6 +27,7 @@ class OnMessagePayload:
     correlation_id: str
     delivery_tag: int
     body: bytes
+    redelivered: bool = False  # Set by RabbitMQ when message is redelivered
 
 
 class OnMessageFn(Protocol):
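
Because the new redelivered field defaults to False, existing OnMessagePayload construction sites keep working unchanged; only the RabbitMQ client (next hunks) opts in by passing the broker's flag through. A standalone sketch of how a consumer can branch on it (illustrative values):

    from dataclasses import dataclass

    @dataclass
    class OnMessagePayload:
        correlation_id: str
        delivery_tag: int
        body: bytes
        redelivered: bool = False  # default keeps older callers source-compatible

    payload = OnMessagePayload("abc-123", delivery_tag=1, body=b"{}", redelivered=True)
    if payload.redelivered:
        # A previous consumer received this message but never acked it.
        print("processing a redelivered message")
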
great_expectations_cloud/agent/message_service/asyncio_rabbit_mq_client.py

@@ -174,8 +175,12 @@ class AsyncRabbitMQClient:
         # param on_message is provided by the caller as an argument to AsyncRabbitMQClient.run
         correlation_id = header_frame.correlation_id
         delivery_tag = method_frame.delivery_tag
+        redelivered = method_frame.redelivered  # RabbitMQ sets this flag on redelivery
         payload = OnMessagePayload(
-            correlation_id=correlation_id, delivery_tag=delivery_tag, body=body
+            correlation_id=correlation_id,
+            delivery_tag=delivery_tag,
+            body=body,
+            redelivered=redelivered,
         )
         return on_message(payload)
 
@@ -190,10 +195,13 @@
     def _on_consumer_canceled(self, method_frame: Basic.Cancel) -> None:
         """Callback invoked when the broker cancels the client's connection."""
         if self._channel is not None:
-            LOGGER.info(
-                "Consumer was cancelled remotely, shutting down",
+            LOGGER.warning(
+                "Consumer was cancelled remotely by RabbitMQ - this may indicate DAT timeout",
                 extra={
-                    "method_frame": method_frame,
+                    "consumer_tag": method_frame.consumer_tag
+                    if hasattr(method_frame, "consumer_tag")
+                    else None,
+                    "was_consuming": self.was_consuming,
                 },
             )
             self._channel.close()
@@ -232,11 +240,28 @@
             self._reconnect()
         self._log_pika_exception("Connection open failed", reason)
 
-    def _on_connection_closed(
-        self, connection: AsyncioConnection, _unused_reason: pika.Exception
-    ) -> None:
+    def _on_connection_closed(self, connection: AsyncioConnection, reason: pika.Exception) -> None:
         """Callback invoked after the broker closes the connection"""
-        LOGGER.debug("Connection to RabbitMQ has been closed")
+        if isinstance(reason, (ConnectionClosed, ChannelClosed)):
+            LOGGER.warning(
+                "Connection to RabbitMQ has been closed",
+                extra={
+                    "reply_code": reason.reply_code,
+                    "reply_text": reason.reply_text,
+                    "was_consuming": self.was_consuming,
+                    "is_closing": self._closing,
+                },
+            )
+        else:
+            LOGGER.warning(
+                "Connection to RabbitMQ has been closed",
+                extra={
+                    "reason": str(reason),
+                    "reason_type": type(reason).__name__,
+                    "was_consuming": self.was_consuming,
+                    "is_closing": self._closing,
+                },
+            )
         self._channel = None
         self._is_unrecoverable = True
         if self._closing: