great-expectations-cloud 20250902.0.dev1__py3-none-any.whl → 20260120.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of great-expectations-cloud might be problematic.

Files changed (19)
  1. great_expectations_cloud/agent/actions/agent_action.py +3 -3
  2. great_expectations_cloud/agent/actions/draft_datasource_config_action.py +2 -2
  3. great_expectations_cloud/agent/actions/generate_data_quality_check_expectations_action.py +22 -14
  4. great_expectations_cloud/agent/actions/list_asset_names.py +4 -5
  5. great_expectations_cloud/agent/actions/run_checkpoint.py +64 -3
  6. great_expectations_cloud/agent/actions/run_metric_list_action.py +3 -3
  7. great_expectations_cloud/agent/actions/run_scheduled_checkpoint.py +28 -5
  8. great_expectations_cloud/agent/actions/run_window_checkpoint.py +2 -4
  9. great_expectations_cloud/agent/actions/utils.py +13 -4
  10. great_expectations_cloud/agent/agent.py +280 -43
  11. great_expectations_cloud/agent/event_handler.py +8 -7
  12. great_expectations_cloud/agent/message_service/asyncio_rabbit_mq_client.py +36 -8
  13. great_expectations_cloud/agent/message_service/subscriber.py +4 -0
  14. great_expectations_cloud/agent/models.py +23 -2
  15. {great_expectations_cloud-20250902.0.dev1.dist-info → great_expectations_cloud-20260120.0.dev0.dist-info}/METADATA +5 -5
  16. {great_expectations_cloud-20250902.0.dev1.dist-info → great_expectations_cloud-20260120.0.dev0.dist-info}/RECORD +19 -19
  17. {great_expectations_cloud-20250902.0.dev1.dist-info → great_expectations_cloud-20260120.0.dev0.dist-info}/WHEEL +1 -1
  18. {great_expectations_cloud-20250902.0.dev1.dist-info → great_expectations_cloud-20260120.0.dev0.dist-info}/entry_points.txt +0 -0
  19. {great_expectations_cloud-20250902.0.dev1.dist-info → great_expectations_cloud-20260120.0.dev0.dist-info/licenses}/LICENSE +0 -0
great_expectations_cloud/agent/agent.py
@@ -2,13 +2,20 @@ from __future__ import annotations
 
 import asyncio
 import logging
+import os
+import resource
+import signal
+import socket
 import sys
+import threading
+import time
 import traceback
 import warnings
 from collections import defaultdict
 from concurrent.futures import Future
 from concurrent.futures.thread import ThreadPoolExecutor
 from functools import partial
+from http import HTTPStatus
 from importlib.metadata import version as metadata_version
 from typing import TYPE_CHECKING, Any, Callable, Final, Literal
 from urllib.parse import urljoin, urlparse
@@ -16,13 +23,12 @@ from uuid import UUID
 
 import orjson
 import requests
+from great_expectations import __version__, get_context
+from great_expectations.core import http
 from great_expectations.core.http import create_session
 from great_expectations.data_context.cloud_constants import CLOUD_DEFAULT_BASE_URL
-from great_expectations.data_context.data_context.context_factory import get_context
 from great_expectations.data_context.types.base import ProgressBarsConfig
-from pika.adapters.utils.connection_workflow import (
-    AMQPConnectorException,
-)
+from pika.adapters.utils.connection_workflow import AMQPConnectorException
 from pika.exceptions import (
     AMQPConnectionError,
     AMQPError,
@@ -45,9 +51,7 @@ from great_expectations_cloud.agent.config import (
     generate_config_validation_error_text,
 )
 from great_expectations_cloud.agent.constants import USER_AGENT_HEADER, HeaderName
-from great_expectations_cloud.agent.event_handler import (
-    EventHandler,
-)
+from great_expectations_cloud.agent.event_handler import EventHandler
 from great_expectations_cloud.agent.exceptions import (
     GXAgentConfigError,
     GXAgentError,
@@ -67,6 +71,7 @@ from great_expectations_cloud.agent.models import (
     AgentBaseExtraForbid,
     CreateScheduledJobAndSetJobStarted,
     CreateScheduledJobAndSetJobStartedRequest,
+    DomainContext,
     JobCompleted,
     JobStarted,
     JobStatus,
@@ -138,6 +143,9 @@ class GXAgent:
     _PYPI_GX_AGENT_PACKAGE_NAME = "great_expectations_cloud"
     _PYPI_GREAT_EXPECTATIONS_PACKAGE_NAME = "great_expectations"
 
+    # Heartbeat interval in seconds (log progress every 60 seconds during job processing)
+    _HEARTBEAT_INTERVAL_SECONDS = 60
+
     def __init__(self: Self):
         self._config = self._create_config()
 
@@ -150,19 +158,6 @@ class GXAgent:
                 "great_expectations_version": great_expectations_version,
             },
         )
-        LOGGER.debug("Loading a DataContext - this might take a moment.")
-
-        with warnings.catch_warnings():
-            # suppress warnings about GX version
-            warnings.filterwarnings("ignore", message="You are using great_expectations version")
-            self._context: CloudDataContext = get_context(
-                cloud_mode=True,
-                user_agent_str=self.user_agent_str,
-            )
-        self._configure_progress_bars(data_context=self._context)
-        LOGGER.debug("DataContext is ready.")
-
-        self._set_http_session_headers(data_context=self._context)
 
         # Create a thread pool with a single worker, so we can run long-lived
         # GX processes and maintain our connection to the broker. Note that
@@ -174,6 +169,15 @@ class GXAgent:
         self._correlation_ids: defaultdict[str, int] = defaultdict(lambda: 0)
         self._listen_tries = 0
 
+        # Heartbeat tracking
+        self._heartbeat_stop_event: threading.Event | None = None
+        self._heartbeat_thread: threading.Thread | None = None
+        self._current_job_correlation_id: str | None = None
+        self._current_job_start_time: float | None = None
+
+        # Install signal handlers for graceful shutdown logging
+        self._install_signal_handlers()
+
     def run(self) -> None:
         """Open a connection to GX Cloud."""
 
@@ -218,7 +222,7 @@
             LOGGER.exception("The connection to GX Cloud has encountered an error.")
         except GXAgentUnrecoverableConnectionError:
             LOGGER.exception("The connection to GX Cloud has encountered an unrecoverable error.")
-            sys.exit(1)
+            os.kill(os.getpid(), signal.SIGTERM)
         except (
             AuthenticationError,
             ProbableAuthenticationError,
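Swapping `sys.exit(1)` for `os.kill(os.getpid(), signal.SIGTERM)` routes even unrecoverable broker errors through the SIGTERM handler installed in `__init__`, so the structured shutdown log (job id, elapsed time, memory) still fires. A standalone sketch of the difference (POSIX only; the handler body here is illustrative, not the agent's):

import os
import signal

def handler(signum: int, frame) -> None:
    # In the agent, this is where job context (correlation id, elapsed time,
    # memory usage) is logged before exiting.
    print(f"received signal {signum}, logging shutdown context")
    raise SystemExit(128 + signum)

signal.signal(signal.SIGTERM, handler)

# sys.exit(1) raises SystemExit directly and never triggers signal handlers.
# Signaling our own PID runs the handler first, then exits 143 (128 + 15).
os.kill(os.getpid(), signal.SIGTERM)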
@@ -233,6 +237,98 @@
             if subscriber is not None:
                 subscriber.close()
 
+    def _install_signal_handlers(self) -> None:
+        """Install signal handlers to log when the process receives shutdown signals."""
+        original_sigterm = signal.getsignal(signal.SIGTERM)
+        original_sigint = signal.getsignal(signal.SIGINT)
+
+        def sigterm_handler(signum: int, frame: Any) -> None:
+            self._log_signal_received("SIGTERM", signum)
+            if callable(original_sigterm):
+                original_sigterm(signum, frame)
+            elif original_sigterm == signal.SIG_DFL:
+                raise SystemExit(128 + signum)
+
+        def sigint_handler(signum: int, frame: Any) -> None:
+            self._log_signal_received("SIGINT", signum)
+            if callable(original_sigint):
+                original_sigint(signum, frame)
+            elif original_sigint == signal.SIG_DFL:
+                raise KeyboardInterrupt
+
+        signal.signal(signal.SIGTERM, sigterm_handler)
+        signal.signal(signal.SIGINT, sigint_handler)
+
+    def _log_signal_received(self, signal_name: str, signum: int) -> None:
+        """Log when a shutdown signal is received, including current job info."""
+        memory_mb = self._get_memory_usage_mb()
+        LOGGER.warning(
+            f"Received {signal_name} signal - shutting down",
+            extra={
+                "signal": signal_name,
+                "signal_number": signum,
+                "hostname": socket.gethostname(),
+                "current_job_correlation_id": self._current_job_correlation_id,
+                "job_elapsed_seconds": (
+                    time.time() - self._current_job_start_time
+                    if self._current_job_start_time
+                    else None
+                ),
+                "memory_usage_mb": memory_mb,
+                "has_active_task": self._current_task is not None and not self._current_task.done(),
+            },
+        )
+
+    def _get_memory_usage_mb(self) -> float:
+        """Get current memory usage in MB using the resource module."""
+        usage = resource.getrusage(resource.RUSAGE_SELF)
+        # On macOS, ru_maxrss is in bytes; on Linux, it's in KB
+        if sys.platform == "darwin":
+            return usage.ru_maxrss / (1024 * 1024)
+        return usage.ru_maxrss / 1024
+
+    def _start_heartbeat(self, correlation_id: str, org_id: UUID, workspace_id: UUID) -> None:
+        """Start a background thread that logs periodic heartbeats during job processing."""
+        self._current_job_correlation_id = correlation_id
+        self._current_job_start_time = time.time()
+        self._heartbeat_stop_event = threading.Event()
+
+        def heartbeat_loop() -> None:
+            stop_event = self._heartbeat_stop_event
+            if stop_event is None:
+                return
+            while not stop_event.wait(timeout=self._HEARTBEAT_INTERVAL_SECONDS):
+                if stop_event.is_set():
+                    break
+                elapsed = time.time() - (self._current_job_start_time or time.time())
+                memory_mb = self._get_memory_usage_mb()
+                LOGGER.debug(
+                    "job.heartbeat",
+                    extra={
+                        "correlation_id": correlation_id,
+                        "organization_id": str(org_id),
+                        "workspace_id": str(workspace_id),
+                        "hostname": socket.gethostname(),
+                        "elapsed_seconds": round(elapsed, 1),
+                        "memory_usage_mb": round(memory_mb, 1),
+                    },
+                )
+
+        self._heartbeat_thread = threading.Thread(target=heartbeat_loop, daemon=True)
+        self._heartbeat_thread.start()
+
+    def _stop_heartbeat(self) -> None:
+        """Stop the heartbeat thread."""
+        if self._heartbeat_stop_event:
+            self._heartbeat_stop_event.set()
+        if self._heartbeat_thread and self._heartbeat_thread.is_alive():
+            self._heartbeat_thread.join(timeout=2)
+        self._heartbeat_thread = None
+        self._heartbeat_stop_event = None
+        self._current_job_correlation_id = None
+        self._current_job_start_time = None
+
     @classmethod
     def get_current_gx_agent_version(cls) -> str:
         version: str = metadata_version(cls._PYPI_GX_AGENT_PACKAGE_NAME)
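The `_start_heartbeat`/`_stop_heartbeat` pair above follows the standard stoppable-thread idiom: `Event.wait(timeout=...)` doubles as an interruptible sleep, returning `False` on timeout (emit a beat) and `True` as soon as the event is set (stop at once instead of waiting out the interval). A condensed, standalone sketch of the same pattern, not the agent's exact code:

import threading
import time

def start_heartbeat(interval_s: float) -> tuple[threading.Thread, threading.Event]:
    stop = threading.Event()
    started = time.time()

    def loop() -> None:
        # wait() returns False on timeout -> emit a beat; True once set -> stop.
        while not stop.wait(timeout=interval_s):
            print(f"heartbeat: {time.time() - started:.1f}s elapsed")

    thread = threading.Thread(target=loop, daemon=True)  # daemon: never blocks exit
    thread.start()
    return thread, stop

thread, stop = start_heartbeat(interval_s=1.0)
time.sleep(3.5)         # roughly three beats
stop.set()              # wakes wait() immediately
thread.join(timeout=2)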
@@ -252,8 +348,26 @@ class GXAgent:
         Args:
             event_context: An Event with related properties and actions.
         """
+        # Track how many times this correlation_id has been seen BY THIS POD (for local diagnostics)
+        # Note: event_context.redelivered is set by RabbitMQ and indicates cross-pod redelivery
+        local_delivery_count = self._correlation_ids.get(event_context.correlation_id, 0)
+
         if self._reject_correlation_id(event_context.correlation_id) is True:
-            # this event has been redelivered too many times - remove it from circulation
+            # this event has been redelivered too many times to THIS pod - remove it from circulation
+            LOGGER.error(
+                "Message redelivered too many times to this pod, removing from queue",
+                extra={
+                    "event_type": event_context.event.type,
+                    "correlation_id": event_context.correlation_id,
+                    "organization_id": self.get_organization_id(event_context),
+                    "workspace_id": str(self.get_workspace_id(event_context)),
+                    "schedule_id": event_context.event.schedule_id
+                    if isinstance(event_context.event, ScheduledEventBase)
+                    else None,
+                    "local_delivery_count": local_delivery_count,
+                    "redelivered": event_context.redelivered,
+                },
+            )
             event_context.processed_with_failures()
             return
         elif self._can_accept_new_task() is not True:
@@ -263,9 +377,11 @@
                     "event_type": event_context.event.type,
                     "correlation_id": event_context.correlation_id,
                     "organization_id": self.get_organization_id(event_context),
+                    "workspace_id": str(self.get_workspace_id(event_context)),
                     "schedule_id": event_context.event.schedule_id
                     if isinstance(event_context.event, ScheduledEventBase)
                     else None,
+                    "redelivered": event_context.redelivered,
                 },
             )
             # request that this message is redelivered later
@@ -274,6 +390,20 @@
             self._redeliver_msg_task = loop.create_task(event_context.redeliver_message())
             return
 
+        if event_context.redelivered:
+            LOGGER.warning(
+                "rabbitmq.message.redelivered",
+                extra={
+                    "event_type": event_context.event.type,
+                    "correlation_id": event_context.correlation_id,
+                    "organization_id": self.get_organization_id(event_context),
+                    "workspace_id": str(self.get_workspace_id(event_context)),
+                    "schedule_id": event_context.event.schedule_id
+                    if isinstance(event_context.event, ScheduledEventBase)
+                    else None,
+                },
+            )
+
         self._current_task = self._executor.submit(
             self._handle_event,
             event_context=event_context,
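`event_context.redelivered` surfaces RabbitMQ's `redelivered` flag, which the broker sets when it re-queues a message that was delivered but never acked (for example, because the consuming pod died). For reference, this is how the flag appears in a plain pika consumer, independent of the agent's Subscriber wrapper (queue name hypothetical):

import pika

def on_message(channel, method, properties, body) -> None:
    # method is a Basic.Deliver; redelivered=True means a prior delivery went unacked.
    if method.redelivered:
        print(f"redelivered message, delivery_tag={method.delivery_tag}")
    channel.basic_ack(delivery_tag=method.delivery_tag)

connection = pika.BlockingConnection(pika.ConnectionParameters("localhost"))
channel = connection.channel()
channel.basic_consume(queue="agent-jobs", on_message_callback=on_message)
channel.start_consuming()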
@@ -287,8 +417,23 @@
         self._current_task.add_done_callback(on_exit_callback)
 
     def get_data_context(self, event_context: EventContext) -> CloudDataContext:
-        """Helper method to get a DataContext Agent. Overridden in GX-Runner."""
-        return self._context
+        """Create a new CloudDataContext for each job using the event's workspace_id."""
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="You are using great_expectations version")
+            workspace_id = self.get_workspace_id(event_context)
+
+            LOGGER.debug("Loading a DataContext - this might take a moment.")
+
+            context: CloudDataContext = get_context(
+                cloud_mode=True,
+                user_agent_str=self.user_agent_str,
+                cloud_workspace_id=str(workspace_id),
+            )
+            self._configure_progress_bars(data_context=context)
+
+            LOGGER.debug("DataContext is ready.")
+
+            return context
 
     def get_organization_id(self, event_context: EventContext) -> UUID:
         """Helper method to get the organization ID. Overridden in GX-Runner."""
@@ -298,6 +443,13 @@
         """Helper method to get the auth key. Overridden in GX-Runner."""
         return self._get_config().gx_cloud_access_token
 
+    def get_workspace_id(self, event_context: EventContext) -> UUID:
+        """Helper method to get the workspace ID from the event."""
+        workspace_id: UUID | None = getattr(event_context.event, "workspace_id", None)
+        if workspace_id is None:
+            raise GXAgentError()
+        return workspace_id
+
     def _set_sentry_tags(self, even_context: EventContext) -> None:
         """Used by GX-Runner to set tags for Sentry logging. No-op in the Agent."""
         pass
@@ -320,27 +472,38 @@
         )
 
         org_id = self.get_organization_id(event_context)
+        workspace_id = self.get_workspace_id(event_context)
         base_url = self._get_config().gx_cloud_base_url
         auth_key = self.get_auth_key()
 
         if isinstance(event_context.event, ScheduledEventBase):
-            self._create_scheduled_job_and_set_started(event_context, org_id)
+            self._create_scheduled_job_and_set_started(event_context, org_id, workspace_id)
         else:
             self._update_status(
-                correlation_id=event_context.correlation_id, status=JobStarted(), org_id=org_id
+                correlation_id=event_context.correlation_id,
+                status=JobStarted(),
+                org_id=org_id,
+                workspace_id=workspace_id,
             )
+        memory_mb = self._get_memory_usage_mb()
         LOGGER.info(
-            "Starting job",
+            "job.started",
            extra={
                 "event_type": event_context.event.type,
                 "correlation_id": event_context.correlation_id,
                 "organization_id": str(org_id),
+                "workspace_id": str(workspace_id),
                 "schedule_id": event_context.event.schedule_id
                 if isinstance(event_context.event, ScheduledEventBase)
                 else None,
+                "hostname": socket.gethostname(),
+                "redelivered": event_context.redelivered,
+                "memory_usage_mb": round(memory_mb, 1),
             },
         )
 
+        self._start_heartbeat(event_context.correlation_id, org_id, workspace_id)
+
         self._set_sentry_tags(event_context)
 
         handler = EventHandler(context=data_context)
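The renamed messages ("Starting job" → "job.started") and the growing `extra` dicts move the logs toward machine-parseable events. The agent's log configuration is not part of this diff, but the mechanism is plain stdlib logging: `extra` keys become attributes on the `LogRecord`, where a formatter (or a JSON logging handler) can pick them up:

import logging

# The format string references the extra key, so every record logged through
# this config must supply it; fine for a one-off demo.
logging.basicConfig(format="%(message)s correlation_id=%(correlation_id)s", level=logging.INFO)
logger = logging.getLogger("demo")
logger.info("job.started", extra={"correlation_id": "abc-123"})  # hypothetical id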
@@ -350,7 +513,7 @@
             id=event_context.correlation_id,
             base_url=base_url,
             auth_key=auth_key,
-            organization_id=org_id,
+            domain_context=DomainContext(organization_id=org_id, workspace_id=workspace_id),
         )
         return result
 
@@ -365,10 +528,30 @@
         """
         # warning: this method will not be executed in the main thread
 
+        # Calculate job duration before stopping heartbeat (which clears start time)
+        job_elapsed_time = (
+            time.time() - self._current_job_start_time if self._current_job_start_time else None
+        )
+        self._stop_heartbeat()
+
         org_id = self.get_organization_id(event_context)
+        workspace_id = self.get_workspace_id(event_context)
+
+        memory_mb = self._get_memory_usage_mb()
+        LOGGER.debug(
+            "job.thread_exiting",
+            extra={
+                "correlation_id": event_context.correlation_id,
+                "hostname": socket.gethostname(),
+                "has_exception": future.exception() is not None,
+                "cancelled": future.cancelled(),
+                "memory_usage_mb": round(memory_mb, 1),
+            },
+        )
 
         # get results or errors from the thread
         error = future.exception()
+
         if error is None:
             result: ActionResult = future.result()
 
@@ -379,15 +562,21 @@
                 error_stack_trace="The version of the GX Agent you are using does not support this functionality. Please upgrade to the most recent image tagged with `stable`.",
                 processed_by=self._get_processed_by(),
             )
-            LOGGER.error(
-                "Job completed with error. Ensure agent is up-to-date.",
+            LOGGER.warning(
+                "job.completed",
                 extra={
                     "event_type": event_context.event.type,
-                    "id": event_context.correlation_id,
+                    "correlation_id": event_context.correlation_id,
+                    "job_duration": job_elapsed_time,
+                    "success": False,
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
                     "schedule_id": event_context.event.schedule_id
                     if isinstance(event_context.event, ScheduledEventBase)
                     else None,
+                    "hostname": socket.gethostname(),
+                    "error_type": "UnknownEvent",
+                    "error_message": "Agent does not support this event type. Upgrade required.",
                 },
             )
         else:
@@ -397,34 +586,49 @@
                 processed_by=self._get_processed_by(),
             )
             LOGGER.info(
-                "Completed job",
+                "job.completed",
                 extra={
                     "event_type": event_context.event.type,
                     "correlation_id": event_context.correlation_id,
                     "job_duration": (
                         result.job_duration.total_seconds() if result.job_duration else None
                     ),
+                    "success": True,
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
                     "schedule_id": event_context.event.schedule_id
                     if isinstance(event_context.event, ScheduledEventBase)
                     else None,
+                    "hostname": socket.gethostname(),
                 },
             )
         else:
-            status = build_failed_job_completed_status(error)
+            status = build_failed_job_completed_status(error, processed_by=self._get_processed_by())
             LOGGER.info(traceback.format_exc())
-            LOGGER.info(
-                "Job completed with error",
+            LOGGER.warning(
+                "job.completed",
                 extra={
                     "event_type": event_context.event.type,
                     "correlation_id": event_context.correlation_id,
+                    "job_duration": job_elapsed_time,
+                    "success": False,
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
+                    "schedule_id": event_context.event.schedule_id
+                    if isinstance(event_context.event, ScheduledEventBase)
+                    else None,
+                    "hostname": socket.gethostname(),
+                    "error_type": type(error).__name__,
+                    "error_message": str(error)[:500],  # Truncate to avoid huge logs
                 },
             )
 
         try:
             self._update_status(
-                correlation_id=event_context.correlation_id, status=status, org_id=org_id
+                correlation_id=event_context.correlation_id,
+                status=status,
+                org_id=org_id,
+                workspace_id=workspace_id,
             )
         except Exception:
             LOGGER.exception(
@@ -433,6 +637,7 @@
                     "correlation_id": event_context.correlation_id,
                     "status": str(status),
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
                 },
             )
             # We do not want to cause an infinite loop of errors
@@ -552,7 +757,9 @@
             )
         )
 
-    def _update_status(self, correlation_id: str, status: JobStatus, org_id: UUID) -> None:
+    def _update_status(
+        self, correlation_id: str, status: JobStatus, org_id: UUID, workspace_id: UUID
+    ) -> None:
         """Update GX Cloud on the status of a job.
 
         Args:
@@ -565,11 +772,12 @@
                 "correlation_id": correlation_id,
                 "status": str(status),
                 "organization_id": str(org_id),
+                "workspace_id": str(workspace_id),
             },
         )
         agent_sessions_url = urljoin(
             self._get_config().gx_cloud_base_url,
-            f"/api/v1/organizations/{org_id}/agent-jobs/{correlation_id}",
+            f"/api/v1/organizations/{org_id}/workspaces/{workspace_id}/agent-jobs/{correlation_id}",
         )
         with create_session(access_token=self.get_auth_key()) as session:
             data = UpdateJobStatusRequest(data=status).json()
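Both agent-jobs endpoints are now workspace-scoped. One subtlety worth knowing: because the path handed to `urljoin` is root-relative (it starts with `/`), it replaces any path on the base URL rather than appending to it. A quick illustration (all IDs hypothetical):

from urllib.parse import urljoin

base_url = "https://api.greatexpectations.io/some/prefix"
org_id = "0b4b1b11-1111-4111-8111-111111111111"
workspace_id = "2c6d2d22-2222-4222-8222-222222222222"
correlation_id = "3e8f3f33-3333-4333-8333-333333333333"

url = urljoin(
    base_url,
    f"/api/v1/organizations/{org_id}/workspaces/{workspace_id}/agent-jobs/{correlation_id}",
)
# "/some/prefix" is replaced, not extended:
# https://api.greatexpectations.io/api/v1/organizations/<org>/workspaces/<ws>/agent-jobs/<id>
print(url)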
@@ -580,6 +788,7 @@
                     "correlation_id": correlation_id,
                     "status": str(status),
                     "organization_id": str(org_id),
+                    "workspace_id": str(workspace_id),
                 },
             )
             GXAgent._log_http_error(
@@ -587,7 +796,7 @@
         )
 
     def _create_scheduled_job_and_set_started(
-        self, event_context: EventContext, org_id: UUID
+        self, event_context: EventContext, org_id: UUID, workspace_id: UUID
     ) -> None:
         """Create a job in GX Cloud for scheduled events.
@@ -609,13 +818,14 @@
                 "correlation_id": str(event_context.correlation_id),
                 "event_type": str(event_context.event.type),
                 "organization_id": str(org_id),
+                "workspace_id": str(workspace_id),
                 "schedule_id": str(event_context.event.schedule_id),
             },
         )
 
         agent_sessions_url = urljoin(
             self._get_config().gx_cloud_base_url,
-            f"/api/v1/organizations/{org_id}/agent-jobs",
+            f"/api/v1/organizations/{org_id}/workspaces/{workspace_id}/agent-jobs",
         )
         data = CreateScheduledJobAndSetJobStarted(
             type="run_scheduled_checkpoint.received",
@@ -629,6 +839,31 @@
         with create_session(access_token=self.get_auth_key()) as session:
             payload = CreateScheduledJobAndSetJobStartedRequest(data=data).json()
             response = session.post(agent_sessions_url, data=payload)
+
+            if response.status_code == HTTPStatus.BAD_REQUEST:
+                try:
+                    response_body = response.json()
+                except Exception:
+                    response_body = response.text
+                LOGGER.warning(
+                    "Job already exists - this message was likely redelivered by RabbitMQ "
+                    "after another runner already claimed it. Continuing to process anyway "
+                    "as a safety measure in case the original runner failed.",
+                    extra={
+                        "correlation_id": str(event_context.correlation_id),
+                        "event_type": str(event_context.event.type),
+                        "organization_id": str(org_id),
+                        "schedule_id": str(event_context.event.schedule_id),
+                        "workspace_id": str(workspace_id),
+                        "response_status": response.status_code,
+                        "response_body": response_body,
+                    },
+                )
+                # Note: We intentionally continue processing instead of NACKing.
+                # This ensures job completion even if the first runner fails.
+                # TODO: Once we add inProgress timeout in Mercury, we can
+                # safely NACK here to prevent duplicate processing.
+
         LOGGER.info(
             "Created scheduled job and set started",
             extra={
@@ -636,6 +871,8 @@
                 "event_type": str(event_context.event.type),
                 "organization_id": str(org_id),
                 "schedule_id": str(event_context.event.schedule_id),
+                "workspace_id": str(workspace_id),
+                "response_status": response.status_code,
             },
         )
         GXAgent._log_http_error(
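The 400 handling above treats "job already exists" as evidence of a RabbitMQ redelivery and deliberately continues rather than NACKing, trading possible duplicate work for a guarantee that the job completes even if the first runner died after claiming it. Reduced to its essentials, the pattern looks like this (function and logger names illustrative, not the agent's API):

from http import HTTPStatus
import logging

import requests

logger = logging.getLogger(__name__)

def create_job_idempotently(session: requests.Session, url: str, payload: str) -> requests.Response:
    """POST a create; on 400, assume the row already exists and keep going."""
    response = session.post(url, data=payload)
    if response.status_code == HTTPStatus.BAD_REQUEST:
        try:
            detail = response.json()
        except ValueError:  # body was not JSON
            detail = response.text
        logger.warning("job already exists; continuing: %s", str(detail)[:500])
    return response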
@@ -658,7 +895,9 @@
         """
         Sets headers on all stores in the data context.
         """
-        from great_expectations.data_context.store.gx_cloud_store_backend import GXCloudStoreBackend  # noqa: I001, PLC0415
+        from great_expectations.data_context.store.gx_cloud_store_backend import (  # noqa: PLC0415
+            GXCloudStoreBackend,
+        )
 
         # OSS doesn't use the same session for all requests, so we need to set the header for each store
         stores = list(data_context.stores.values())
@@ -686,8 +925,6 @@
         Note: the Agent-Job-Id header value will be set for all GX Cloud request until this method is
         called again.
         """
-        from great_expectations import __version__  # noqa: PLC0415
-        from great_expectations.core import http  # noqa: PLC0415
 
         header_name = self.get_header_name()
         user_agent_header_value = self.user_agent_str
great_expectations_cloud/agent/event_handler.py
@@ -15,6 +15,7 @@ from pydantic import v1 as pydantic_v1
 from great_expectations_cloud.agent.actions.unknown import UnknownEventAction
 from great_expectations_cloud.agent.exceptions import GXAgentError
 from great_expectations_cloud.agent.models import (
+    DomainContext,
     Event,
     EventType,
     UnknownEvent,
@@ -67,11 +68,11 @@ class EventHandler:
         self._context = context
 
     def get_event_action(
-        self, event: Event, base_url: str, auth_key: str, organization_id: UUID
+        self, event: Event, base_url: str, auth_key: str, domain_context: DomainContext
     ) -> AgentAction[Any]:
         """Get the action that should be run for the given event."""
 
-        if not self._check_event_organization_id(event, organization_id):
+        if not self._check_event_organization_id(event, domain_context.organization_id):
             # Making message more generic
             raise GXAgentError("Unable to process job. Invalid input.")  # noqa: TRY003
 
@@ -84,17 +85,17 @@
         return action_class(
             context=self._context,
             base_url=base_url,
-            organization_id=organization_id,
+            domain_context=domain_context,
             auth_key=auth_key,
         )
 
-    def handle_event(  # Refactor opportunity
-        self, event: Event, id: str, base_url: str, auth_key: str, organization_id: UUID
+    def handle_event(
+        self, event: Event, id: str, base_url: str, auth_key: str, domain_context: DomainContext
     ) -> ActionResult:
-        start_time = datetime.now(tz=timezone.utc)
         """Transform an Event into an ActionResult."""
+        start_time = datetime.now(tz=timezone.utc)
         action = self.get_event_action(
-            event=event, base_url=base_url, auth_key=auth_key, organization_id=organization_id
+            event=event, base_url=base_url, auth_key=auth_key, domain_context=domain_context
         )
         LOGGER.info(f"Handling event: {event.type} -> {action.__class__.__name__}")
         action_result = action.run(event=event, id=id)
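`DomainContext`, threaded through both files above, is defined in `models.py` (item 14 in the file list), which this diff does not expand. From its call sites (`DomainContext(organization_id=org_id, workspace_id=workspace_id)` in agent.py and `domain_context.organization_id` in event_handler.py), a minimal plausible shape would be the following hypothetical reconstruction, not the actual definition:

from uuid import UUID

from pydantic import v1 as pydantic_v1  # event_handler.py already imports pydantic.v1


class DomainContext(pydantic_v1.BaseModel):
    """Hypothetical sketch: bundles the org/workspace pair that now scopes every agent job."""

    organization_id: UUID
    workspace_id: UUID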