dagster-cloud 1.8.2__py3-none-any.whl → 1.12.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. dagster_cloud/__init__.py +3 -3
  2. dagster_cloud/agent/__init__.py +4 -4
  3. dagster_cloud/agent/cli/__init__.py +56 -17
  4. dagster_cloud/agent/dagster_cloud_agent.py +360 -172
  5. dagster_cloud/agent/instrumentation/__init__.py +0 -0
  6. dagster_cloud/agent/instrumentation/constants.py +2 -0
  7. dagster_cloud/agent/instrumentation/run_launch.py +23 -0
  8. dagster_cloud/agent/instrumentation/schedule.py +34 -0
  9. dagster_cloud/agent/instrumentation/sensor.py +34 -0
  10. dagster_cloud/anomaly_detection/__init__.py +2 -2
  11. dagster_cloud/anomaly_detection/defs.py +17 -12
  12. dagster_cloud/anomaly_detection/types.py +3 -3
  13. dagster_cloud/api/dagster_cloud_api.py +209 -293
  14. dagster_cloud/auth/constants.py +21 -5
  15. dagster_cloud/batching/__init__.py +1 -0
  16. dagster_cloud/batching/batcher.py +210 -0
  17. dagster_cloud/dagster_insights/__init__.py +12 -6
  18. dagster_cloud/dagster_insights/bigquery/bigquery_utils.py +3 -2
  19. dagster_cloud/dagster_insights/bigquery/dbt_wrapper.py +39 -12
  20. dagster_cloud/dagster_insights/bigquery/insights_bigquery_resource.py +8 -6
  21. dagster_cloud/dagster_insights/insights_utils.py +18 -8
  22. dagster_cloud/dagster_insights/metrics_utils.py +12 -12
  23. dagster_cloud/dagster_insights/snowflake/dagster_snowflake_insights.py +5 -12
  24. dagster_cloud/dagster_insights/snowflake/dbt_wrapper.py +34 -8
  25. dagster_cloud/dagster_insights/snowflake/definitions.py +38 -12
  26. dagster_cloud/dagster_insights/snowflake/insights_snowflake_resource.py +11 -23
  27. dagster_cloud/definitions/__init__.py +0 -0
  28. dagster_cloud/definitions/job_selection.py +36 -0
  29. dagster_cloud/execution/cloud_run_launcher/k8s.py +1 -1
  30. dagster_cloud/execution/cloud_run_launcher/process.py +3 -3
  31. dagster_cloud/execution/monitoring/__init__.py +27 -33
  32. dagster_cloud/execution/utils/process.py +3 -3
  33. dagster_cloud/instance/__init__.py +125 -38
  34. dagster_cloud/instrumentation/__init__.py +32 -0
  35. dagster_cloud/metadata/source_code.py +13 -8
  36. dagster_cloud/metrics/__init__.py +0 -0
  37. dagster_cloud/metrics/tracer.py +59 -0
  38. dagster_cloud/opentelemetry/__init__.py +0 -0
  39. dagster_cloud/opentelemetry/config/__init__.py +73 -0
  40. dagster_cloud/opentelemetry/config/exporter.py +81 -0
  41. dagster_cloud/opentelemetry/config/log_record_processor.py +40 -0
  42. dagster_cloud/opentelemetry/config/logging_handler.py +14 -0
  43. dagster_cloud/opentelemetry/config/meter_provider.py +9 -0
  44. dagster_cloud/opentelemetry/config/metric_reader.py +39 -0
  45. dagster_cloud/opentelemetry/controller.py +319 -0
  46. dagster_cloud/opentelemetry/enum.py +58 -0
  47. dagster_cloud/opentelemetry/factories/__init__.py +1 -0
  48. dagster_cloud/opentelemetry/factories/logs.py +113 -0
  49. dagster_cloud/opentelemetry/factories/metrics.py +121 -0
  50. dagster_cloud/opentelemetry/metrics/__init__.py +0 -0
  51. dagster_cloud/opentelemetry/metrics/meter.py +140 -0
  52. dagster_cloud/opentelemetry/observers/__init__.py +0 -0
  53. dagster_cloud/opentelemetry/observers/dagster_exception_handler.py +40 -0
  54. dagster_cloud/opentelemetry/observers/execution_observer.py +178 -0
  55. dagster_cloud/pex/grpc/__generated__/multi_pex_api_pb2.pyi +175 -0
  56. dagster_cloud/pex/grpc/__init__.py +2 -2
  57. dagster_cloud/pex/grpc/client.py +4 -4
  58. dagster_cloud/pex/grpc/compile.py +2 -2
  59. dagster_cloud/pex/grpc/server/__init__.py +2 -2
  60. dagster_cloud/pex/grpc/server/cli/__init__.py +31 -19
  61. dagster_cloud/pex/grpc/server/manager.py +60 -42
  62. dagster_cloud/pex/grpc/server/registry.py +28 -21
  63. dagster_cloud/pex/grpc/server/server.py +23 -14
  64. dagster_cloud/pex/grpc/types.py +5 -5
  65. dagster_cloud/py.typed +0 -0
  66. dagster_cloud/secrets/__init__.py +1 -1
  67. dagster_cloud/secrets/loader.py +3 -3
  68. dagster_cloud/serverless/__init__.py +1 -1
  69. dagster_cloud/serverless/io_manager.py +36 -53
  70. dagster_cloud/storage/client.py +54 -17
  71. dagster_cloud/storage/compute_logs/__init__.py +3 -1
  72. dagster_cloud/storage/compute_logs/compute_log_manager.py +22 -17
  73. dagster_cloud/storage/defs_state/__init__.py +3 -0
  74. dagster_cloud/storage/defs_state/queries.py +15 -0
  75. dagster_cloud/storage/defs_state/storage.py +113 -0
  76. dagster_cloud/storage/event_logs/__init__.py +3 -1
  77. dagster_cloud/storage/event_logs/queries.py +102 -4
  78. dagster_cloud/storage/event_logs/storage.py +266 -73
  79. dagster_cloud/storage/event_logs/utils.py +88 -7
  80. dagster_cloud/storage/runs/__init__.py +1 -1
  81. dagster_cloud/storage/runs/queries.py +17 -2
  82. dagster_cloud/storage/runs/storage.py +88 -42
  83. dagster_cloud/storage/schedules/__init__.py +1 -1
  84. dagster_cloud/storage/schedules/storage.py +6 -8
  85. dagster_cloud/storage/tags.py +66 -1
  86. dagster_cloud/util/__init__.py +10 -12
  87. dagster_cloud/util/errors.py +49 -64
  88. dagster_cloud/version.py +1 -1
  89. dagster_cloud/workspace/config_schema/__init__.py +55 -13
  90. dagster_cloud/workspace/docker/__init__.py +76 -25
  91. dagster_cloud/workspace/docker/utils.py +1 -1
  92. dagster_cloud/workspace/ecs/__init__.py +1 -1
  93. dagster_cloud/workspace/ecs/client.py +51 -33
  94. dagster_cloud/workspace/ecs/launcher.py +76 -22
  95. dagster_cloud/workspace/ecs/run_launcher.py +3 -3
  96. dagster_cloud/workspace/ecs/utils.py +14 -5
  97. dagster_cloud/workspace/kubernetes/__init__.py +1 -1
  98. dagster_cloud/workspace/kubernetes/launcher.py +61 -29
  99. dagster_cloud/workspace/kubernetes/utils.py +34 -22
  100. dagster_cloud/workspace/user_code_launcher/__init__.py +5 -3
  101. dagster_cloud/workspace/user_code_launcher/process.py +16 -14
  102. dagster_cloud/workspace/user_code_launcher/user_code_launcher.py +552 -172
  103. dagster_cloud/workspace/user_code_launcher/utils.py +105 -1
  104. {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/METADATA +48 -42
  105. dagster_cloud-1.12.6.dist-info/RECORD +134 -0
  106. {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/WHEEL +1 -1
  107. dagster_cloud-1.8.2.dist-info/RECORD +0 -100
  108. {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/top_level.txt +0 -0
@@ -2,33 +2,49 @@ import logging
2
2
  import os
3
3
  import sys
4
4
  import time
5
- from collections import deque
5
+ from collections import defaultdict, deque
6
+ from collections.abc import Iterator
6
7
  from concurrent.futures import Future, ThreadPoolExecutor
7
8
  from contextlib import ExitStack
8
9
  from pathlib import Path
9
- from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Tuple, Union, cast
10
+ from typing import TYPE_CHECKING, Any, Optional, Union, cast
10
11
 
11
12
  import dagster._check as check
12
13
  from dagster import DagsterInstance
13
14
  from dagster._core.launcher.base import LaunchRunContext
14
- from dagster._core.remote_representation import CodeLocationOrigin
15
- from dagster._core.remote_representation.origin import RegisteredCodeLocationOrigin
15
+ from dagster._core.remote_origin import CodeLocationOrigin, RegisteredCodeLocationOrigin
16
16
  from dagster._core.utils import FuturesAwareThreadPoolExecutor
17
17
  from dagster._grpc.client import DagsterGrpcClient
18
18
  from dagster._grpc.types import CancelExecutionRequest
19
19
  from dagster._serdes import deserialize_value, serialize_value
20
20
  from dagster._time import get_current_datetime, get_current_timestamp
21
+ from dagster._utils.cached_method import cached_method
21
22
  from dagster._utils.container import retrieve_containerized_utilization_metrics
22
- from dagster._utils.error import SerializableErrorInfo, serializable_error_info_from_exc_info
23
+ from dagster._utils.error import (
24
+ SerializableErrorInfo,
25
+ serializable_error_info_from_exc_info,
26
+ truncate_serialized_error,
27
+ )
23
28
  from dagster._utils.interrupts import raise_interrupts_as
24
29
  from dagster._utils.merger import merge_dicts
25
30
  from dagster._utils.typed_dict import init_optional_typeddict
26
31
  from dagster_cloud_cli.core.errors import DagsterCloudHTTPError, raise_http_error
27
- from dagster_cloud_cli.core.workspace import CodeLocationDeployData, get_instance_ref_for_user_code
32
+ from dagster_cloud_cli.core.workspace import CodeLocationDeployData
28
33
 
34
+ from dagster_cloud.agent.instrumentation.constants import DAGSTER_CLOUD_AGENT_METRIC_PREFIX
35
+ from dagster_cloud.agent.instrumentation.run_launch import extract_run_attributes
36
+ from dagster_cloud.agent.instrumentation.schedule import inspect_schedule_result
37
+ from dagster_cloud.agent.instrumentation.sensor import inspect_sensor_result
38
+ from dagster_cloud.agent.queries import (
39
+ ADD_AGENT_HEARTBEATS_MUTATION,
40
+ DEPLOYMENTS_QUERY,
41
+ GET_USER_CLOUD_REQUESTS_QUERY,
42
+ WORKSPACE_ENTRIES_QUERY,
43
+ )
29
44
  from dagster_cloud.api.dagster_cloud_api import (
30
45
  AgentHeartbeat,
31
46
  AgentUtilizationMetrics,
47
+ BatchDagsterCloudUploadApiResponse,
32
48
  DagsterCloudApi,
33
49
  DagsterCloudApiErrorResponse,
34
50
  DagsterCloudApiGrpcResponse,
@@ -40,21 +56,16 @@ from dagster_cloud.api.dagster_cloud_api import (
40
56
  DagsterCloudUploadApiResponse,
41
57
  TimestampedError,
42
58
  )
59
+ from dagster_cloud.batching import Batcher
43
60
  from dagster_cloud.instance import DagsterCloudAgentInstance
44
- from dagster_cloud.util.errors import truncate_serialized_error
61
+ from dagster_cloud.opentelemetry.observers.execution_observer import observe_execution
62
+ from dagster_cloud.util import SERVER_HANDLE_TAG, compressed_namedtuple_upload_file, is_isolated_run
63
+ from dagster_cloud.version import __version__
45
64
  from dagster_cloud.workspace.user_code_launcher import (
46
65
  DagsterCloudUserCodeLauncher,
47
66
  UserCodeLauncherEntry,
48
67
  )
49
-
50
- from ..util import SERVER_HANDLE_TAG, compressed_namedtuple_upload_file, is_isolated_run
51
- from ..version import __version__
52
- from .queries import (
53
- ADD_AGENT_HEARTBEATS_MUTATION,
54
- DEPLOYMENTS_QUERY,
55
- GET_USER_CLOUD_REQUESTS_QUERY,
56
- WORKSPACE_ENTRIES_QUERY,
57
- )
68
+ from dagster_cloud.workspace.user_code_launcher.utils import get_instance_ref_for_user_code
58
69
 
59
70
  if TYPE_CHECKING:
60
71
  import datetime
@@ -73,6 +84,11 @@ DEFAULT_PENDING_REQUESTS_LIMIT = 100
73
84
 
74
85
  SLEEP_INTERVAL_SECONDS = float(os.getenv("DAGSTER_CLOUD_AGENT_SLEEP_INTERVAL_SECONDS", "0.5"))
75
86
 
87
+
88
+ def UPLOAD_API_RESPONSE_BATCHING_ENABLED():
89
+ return os.getenv("DAGSTER_CLOUD_AGENT_UPLOAD_API_RESPONSE_BATCHING_ENABLED") == "true"
90
+
91
+
76
92
  DEPLOYMENT_INFO_QUERY = """
77
93
  query DeploymentInfo {
78
94
  deploymentInfo {
@@ -96,10 +112,22 @@ LIVENESS_CHECK_INTERVAL_SECONDS = float(
96
112
 
97
113
 
98
114
  class DagsterCloudAgent:
99
- def __init__(self, pending_requests_limit: int = DEFAULT_PENDING_REQUESTS_LIMIT):
115
+ def __init__(
116
+ self,
117
+ instance: DagsterCloudAgentInstance,
118
+ pending_requests_limit: int = DEFAULT_PENDING_REQUESTS_LIMIT,
119
+ ):
100
120
  self._logger = logging.getLogger("dagster_cloud.agent")
121
+ self._instance: DagsterCloudAgentInstance = instance
101
122
 
102
- self._logger.info("Starting Dagster Cloud agent...")
123
+ self._batcher: defaultdict[
124
+ str, Batcher[tuple[str, DagsterCloudUploadApiResponse], None]
125
+ ] = defaultdict(self._batcher_factory)
126
+
127
+ if self._logger.isEnabledFor(logging.DEBUG):
128
+ self._logger.info("Starting Dagster Cloud agent with debug logging...")
129
+ else:
130
+ self._logger.info("Starting Dagster Cloud agent...")
103
131
 
104
132
  self._exit_stack = ExitStack()
105
133
  self._iteration = 0
@@ -110,7 +138,7 @@ class DagsterCloudAgent:
110
138
  thread_name_prefix="dagster_cloud_agent_worker",
111
139
  )
112
140
  )
113
- self._request_ids_to_futures: Dict[str, Future] = {}
141
+ self._request_ids_to_futures: dict[str, Future] = {}
114
142
  self._utilization_metrics = init_optional_typeddict(AgentUtilizationMetrics)
115
143
 
116
144
  self._last_heartbeat_time: Optional[datetime.datetime] = None
@@ -121,26 +149,38 @@ class DagsterCloudAgent:
121
149
  maxlen=AGENT_HEARTBEAT_ERROR_LIMIT
122
150
  ) # (SerializableErrorInfo, timestamp) tuples
123
151
 
124
- self._pending_requests: List[Dict[str, Any]] = []
125
- self._locations_with_pending_requests: Set[Tuple[str, str, bool]] = set()
126
- self._ready_requests: List[Dict[str, Any]] = []
152
+ self._pending_requests: list[dict[str, Any]] = []
153
+ self._locations_with_pending_requests: set[tuple[str, str, bool]] = set()
154
+ self._ready_requests: list[dict[str, Any]] = []
127
155
 
128
- self._location_query_times: Dict[Tuple[str, str, bool], float] = {}
156
+ self._location_query_times: dict[tuple[str, str, bool], float] = {}
129
157
  self._pending_requests_limit = check.int_param(
130
158
  pending_requests_limit, "pending_requests_limit"
131
159
  )
132
- self._active_deployments: Set[Tuple[str, bool]] = ( # deployment_name, is_branch_deployment
160
+ self._active_deployments: set[tuple[str, bool]] = ( # deployment_name, is_branch_deployment
133
161
  set()
134
162
  )
135
163
 
136
164
  self._last_liveness_check_time = None
137
165
 
166
+ self._warned_about_long_in_progress_reconcile = False
167
+
138
168
  def __enter__(self):
139
169
  return self
140
170
 
141
171
  def __exit__(self, _exception_type, _exception_value, _traceback):
142
172
  self._exit_stack.close()
143
173
 
174
+ def _batcher_factory(
175
+ self,
176
+ ) -> Batcher[tuple[str, DagsterCloudUploadApiResponse], None]:
177
+ return Batcher(
178
+ "upload_api_response",
179
+ self._batch_upload_api_response,
180
+ max_wait_ms=50,
181
+ max_batch_size=32,
182
+ )
183
+
144
184
  @property
145
185
  def _active_deployment_names(self):
146
186
  return [deployment[0] for deployment in self._active_deployments]
@@ -149,14 +189,15 @@ class DagsterCloudAgent:
149
189
  def _active_full_deployment_names(self):
150
190
  return [deployment[0] for deployment in self._active_deployments if not deployment[1]]
151
191
 
152
- def _check_initial_deployment_names(self, instance: DagsterCloudAgentInstance):
153
- if instance.deployment_names:
154
- result = instance.organization_scoped_graphql_client().execute(
155
- DEPLOYMENTS_QUERY, variable_values={"deploymentNames": instance.deployment_names}
192
+ def _check_initial_deployment_names(self):
193
+ if self._instance.deployment_names:
194
+ result = self._instance.organization_scoped_graphql_client().execute(
195
+ DEPLOYMENTS_QUERY,
196
+ variable_values={"deploymentNames": self._instance.deployment_names},
156
197
  )
157
198
  deployments = result["data"]["deployments"]
158
199
  existing_deployment_names = {deployment["deploymentName"] for deployment in deployments}
159
- requested_deployment_names = set(instance.deployment_names)
200
+ requested_deployment_names = set(self._instance.deployment_names)
160
201
  missing_deployment_names = requested_deployment_names.difference(
161
202
  existing_deployment_names
162
203
  )
@@ -171,13 +212,40 @@ class DagsterCloudAgent:
171
212
  def _update_agent_resource_limits(
172
213
  self, user_code_launcher: DagsterCloudUserCodeLauncher
173
214
  ) -> None:
174
- # If the env var DAGSTER_CLOUD_AGENT_MEMORY_LIMIT is set, use that as the memory limit.
175
- memory_limit = os.getenv("DAGSTER_CLOUD_AGENT_MEMORY_LIMIT")
176
- # If the env var DAGSTER_CLOUD_AGENT_CPU_LIMIT is set, use that as the cpu limit.
177
- cpu_limit = os.getenv("DAGSTER_CLOUD_AGENT_CPU_LIMIT")
215
+ # The agent should have environment variables defining its resource requests and limits.
216
+ # However, the agent may be running in a container with resource limits that are different from those values.
217
+ # For example, on k8s there are ways to effect change on the cpu limit, like mutating admission webhooks.
218
+ # Since the effective cgroup limits take precedence over the host's actual resources when it comes to actual behavior for
219
+ # throttling and oom kills, we attempt to obtain these and fallback on the environment variables.
220
+ container_utilization_metrics = self._utilization_metrics.get("container_utilization", {})
221
+ memory_limit = container_utilization_metrics.get("memory_limit")
222
+ if not memory_limit:
223
+ memory_limit = os.getenv("DAGSTER_CLOUD_AGENT_MEMORY_LIMIT")
224
+ self._logger.info(
225
+ "Cannot obtain cgroup memory limit, using environment value: "
226
+ f"DAGSTER_CLOUD_AGENT_MEMORY_LIMIT={memory_limit}"
227
+ )
228
+
229
+ cpu_cfs_period_us = container_utilization_metrics.get("cpu_cfs_period_us")
230
+ cpu_cfs_quota_us = container_utilization_metrics.get("cpu_cfs_quota_us")
231
+
232
+ cpu_limit = None
233
+ if cpu_cfs_quota_us and cpu_cfs_period_us:
234
+ cpu_limit = (
235
+ 1000.0 * cpu_cfs_quota_us
236
+ ) / cpu_cfs_period_us # cpu_limit expressed in milliseconds of cpu
237
+
238
+ if not cpu_limit:
239
+ cpu_limit = os.getenv("DAGSTER_CLOUD_AGENT_CPU_LIMIT")
240
+ self._logger.info(
241
+ "Cannot obtain CPU CFS values, using environment value: "
242
+ f"DAGSTER_CLOUD_AGENT_CPU_LIMIT={cpu_limit}"
243
+ )
244
+
178
245
  if not user_code_launcher.user_code_deployment_type.supports_utilization_metrics:
179
246
  self._logger.info(
180
- f"Cannot interpret resource limits for agent type {user_code_launcher.user_code_deployment_type.value}. Skipping utilization metrics retrieval."
247
+ f"Cannot interpret resource limits for agent type {user_code_launcher.user_code_deployment_type.value}."
248
+ "Skipping utilization metrics retrieval."
181
249
  )
182
250
  return
183
251
 
@@ -185,6 +253,7 @@ class DagsterCloudAgent:
185
253
  "cpu_limit": cpu_limit,
186
254
  "memory_limit": memory_limit,
187
255
  }
256
+
188
257
  cpu_request = os.getenv("DAGSTER_CLOUD_AGENT_CPU_REQUEST")
189
258
  memory_request = os.getenv("DAGSTER_CLOUD_AGENT_MEMORY_REQUEST")
190
259
  if cpu_request:
@@ -192,7 +261,6 @@ class DagsterCloudAgent:
192
261
  if memory_request:
193
262
  limits["memory_request"] = memory_request
194
263
 
195
- # At this point, the only agent types possible are serverless, ecs, and k8s, all of which are supported. The linter isn't smart enough to realize this, so we disable it.
196
264
  self._utilization_metrics["resource_limits"][
197
265
  user_code_launcher.user_code_deployment_type.value
198
266
  ] = limits # type: ignore
@@ -216,16 +284,15 @@ class DagsterCloudAgent:
216
284
 
217
285
  def run_loop(
218
286
  self,
219
- instance: DagsterCloudAgentInstance,
220
287
  user_code_launcher,
221
288
  agent_uuid,
222
289
  ):
223
290
  heartbeat_interval_seconds = AGENT_HEARTBEAT_INTERVAL_SECONDS
224
291
 
225
292
  if (
226
- not instance.includes_branch_deployments
227
- and not instance.deployment_names
228
- and not instance.include_all_serverless_deployments
293
+ not self._instance.includes_branch_deployments
294
+ and not self._instance.deployment_names
295
+ and not self._instance.include_all_serverless_deployments
229
296
  ):
230
297
  self._logger.info(
231
298
  "Deployment name was not set - checking to see if it can be fetched from the"
@@ -233,20 +300,34 @@ class DagsterCloudAgent:
233
300
  )
234
301
  # Fetch the deployment name from the server if it isn't set (only true
235
302
  # for old agents, and only will work if there's a single deployment in the org)
236
- result = instance.graphql_client.execute(DEPLOYMENT_INFO_QUERY)
303
+ result = self._instance.graphql_client.execute(DEPLOYMENT_INFO_QUERY)
237
304
  deployment_name = result["data"]["deploymentInfo"]["deploymentName"]
238
- instance = self._exit_stack.enter_context(
239
- DagsterInstance.from_ref(instance.ref_for_deployment(deployment_name)) # type: ignore # (instance subclass)
305
+ self._instance = self._exit_stack.enter_context(
306
+ DagsterInstance.from_ref(self._instance.ref_for_deployment(deployment_name)) # type: ignore # (instance subclass)
240
307
  )
241
308
 
242
- self._check_initial_deployment_names(instance)
309
+ self._check_initial_deployment_names()
310
+
311
+ serving = []
312
+ queues = list(filter(None, self._instance.agent_queues_config.queues))
313
+ if queues:
314
+ serving.append(f"queues{queues}")
315
+ if self._instance.deployment_names:
316
+ serving.append(f"deployments{self._instance.deployment_names}")
317
+ if self._instance.include_all_serverless_deployments:
318
+ serving.append("all serverless deployments")
319
+ if self._instance.includes_branch_deployments:
320
+ serving.append("branch deployments")
321
+
322
+ self._logger.info(f"Agent is serving: {', '.join(serving)}")
243
323
 
244
324
  self._check_update_workspace(
245
- instance, user_code_launcher, upload_all=user_code_launcher.upload_snapshots_on_startup
325
+ user_code_launcher,
326
+ upload_all=user_code_launcher.upload_snapshots_on_startup,
246
327
  )
247
328
 
248
329
  self._logger.info(
249
- f"Will start polling for requests from {instance.dagster_cloud_url} once user code has"
330
+ f"Will start polling for requests from {self._instance.dagster_cloud_url} once user code has"
250
331
  " been loaded."
251
332
  )
252
333
 
@@ -254,7 +335,7 @@ class DagsterCloudAgent:
254
335
 
255
336
  while True:
256
337
  try:
257
- for error in self.run_iteration(instance, user_code_launcher):
338
+ for error in self.run_iteration(user_code_launcher):
258
339
  if error:
259
340
  self._logger.error(str(error))
260
341
  self._errors.appendleft(
@@ -283,16 +364,18 @@ class DagsterCloudAgent:
283
364
 
284
365
  if user_code_launcher.ready_to_serve_requests:
285
366
  try:
286
- self._check_add_heartbeat(instance, agent_uuid, heartbeat_interval_seconds)
367
+ self._check_add_heartbeat(agent_uuid, heartbeat_interval_seconds)
287
368
  except Exception:
288
369
  self._logger.exception("Failed to add heartbeat")
289
370
 
371
+ self._check_for_long_running_reconcile(user_code_launcher)
372
+
290
373
  # Check for any received interrupts
291
374
  with raise_interrupts_as(KeyboardInterrupt):
292
375
  pass
293
376
 
294
377
  try:
295
- self._check_update_workspace(instance, user_code_launcher, upload_all=False)
378
+ self._check_update_workspace(user_code_launcher, upload_all=False)
296
379
 
297
380
  except Exception:
298
381
  self._logger.error(
@@ -329,7 +412,27 @@ class DagsterCloudAgent:
329
412
  self._logger.error(f"Failed to write liveness sentinel and disabling it: {e}")
330
413
  self._last_liveness_check_time = False
331
414
 
332
- def _check_update_workspace(self, instance, user_code_launcher, upload_all):
415
+ def _check_for_long_running_reconcile(self, user_code_launcher):
416
+ """Detect from the main thread if the background reconcile thread is running behind or has gotten stuck."""
417
+ in_progress_reconcile_start_time = user_code_launcher.in_progress_reconcile_start_time
418
+
419
+ reconcile_start_time_warning = int(
420
+ os.getenv("DAGSTER_CLOUD_AGENT_RECONCILE_START_TIME_WARNING", "3600")
421
+ )
422
+
423
+ if (
424
+ in_progress_reconcile_start_time is not None
425
+ and (time.time() - in_progress_reconcile_start_time) >= reconcile_start_time_warning
426
+ ):
427
+ if not self._warned_about_long_in_progress_reconcile:
428
+ self._logger.warning(
429
+ f"Agent has been redeploying code servers for more than {reconcile_start_time_warning} seconds. This may indicate the background thread that performs the redeploys is stuck."
430
+ )
431
+ self._warned_about_long_in_progress_reconcile = True
432
+ else:
433
+ self._warned_about_long_in_progress_reconcile = False
434
+
435
+ def _check_update_workspace(self, user_code_launcher, upload_all):
333
436
  curr_time = get_current_datetime()
334
437
 
335
438
  if (
@@ -340,11 +443,10 @@ class DagsterCloudAgent:
340
443
  return
341
444
 
342
445
  self._last_workspace_check_time = curr_time
343
- self._query_for_workspace_updates(instance, user_code_launcher, upload_all=upload_all)
446
+ self._query_for_workspace_updates(user_code_launcher, upload_all=upload_all)
344
447
 
345
448
  def _check_add_heartbeat(
346
449
  self,
347
- instance: DagsterCloudAgentInstance,
348
450
  agent_uuid,
349
451
  heartbeat_interval_seconds,
350
452
  ):
@@ -357,19 +459,22 @@ class DagsterCloudAgent:
357
459
  return
358
460
 
359
461
  errors = [
360
- TimestampedError(timestamp.timestamp(), error)
462
+ TimestampedError(
463
+ timestamp=timestamp.timestamp(),
464
+ error=error,
465
+ )
361
466
  for (error, timestamp) in self._errors
362
467
  if timestamp.timestamp() > curr_time.timestamp() - 60 * 60 * 24
363
468
  ]
364
469
 
365
- run_worker_statuses_dict = instance.user_code_launcher.get_cloud_run_worker_statuses(
470
+ run_worker_statuses_dict = self._instance.user_code_launcher.get_cloud_run_worker_statuses(
366
471
  self._active_deployment_names
367
472
  )
368
473
 
369
- code_server_heartbeats_dict = instance.user_code_launcher.get_grpc_server_heartbeats()
474
+ code_server_heartbeats_dict = self._instance.user_code_launcher.get_grpc_server_heartbeats()
370
475
 
371
476
  agent_image_tag = os.getenv("DAGSTER_CLOUD_AGENT_IMAGE_TAG")
372
- if instance.user_code_launcher.agent_metrics_enabled:
477
+ if self._instance.user_code_launcher.agent_metrics_enabled:
373
478
  num_running_requests = self._utilization_metrics["request_utilization"][
374
479
  "num_running_requests"
375
480
  ]
@@ -386,10 +491,10 @@ class DagsterCloudAgent:
386
491
  deployment_name: AgentHeartbeat(
387
492
  timestamp=curr_time.timestamp(),
388
493
  agent_id=agent_uuid,
389
- agent_label=instance.dagster_cloud_api_agent_label,
494
+ agent_label=self._instance.dagster_cloud_api_agent_label,
390
495
  agent_type=(
391
- type(instance.user_code_launcher).__name__
392
- if instance.user_code_launcher
496
+ type(self._instance.user_code_launcher).__name__
497
+ if self._instance.user_code_launcher
393
498
  else None
394
499
  ),
395
500
  metadata=merge_dicts(
@@ -397,13 +502,13 @@ class DagsterCloudAgent:
397
502
  {"image_tag": agent_image_tag} if agent_image_tag else {},
398
503
  {
399
504
  "utilization_metrics": self._utilization_metrics
400
- if instance.user_code_launcher.agent_metrics_enabled
505
+ if self._instance.user_code_launcher.agent_metrics_enabled
401
506
  else {}
402
507
  },
403
508
  ),
404
509
  run_worker_statuses=run_worker_statuses_dict[deployment_name],
405
510
  code_server_heartbeats=code_server_heartbeats_dict.get(deployment_name, []),
406
- agent_queues_config=instance.agent_queues_config,
511
+ agent_queues_config=self._instance.agent_queues_config,
407
512
  )
408
513
  for deployment_name in self._active_deployment_names
409
514
  }
@@ -418,7 +523,7 @@ class DagsterCloudAgent:
418
523
 
419
524
  serialized_errors = [serialize_value(error) for error in errors]
420
525
  try:
421
- instance.organization_scoped_graphql_client().execute(
526
+ self._instance.organization_scoped_graphql_client().execute(
422
527
  ADD_AGENT_HEARTBEATS_MUTATION,
423
528
  variable_values={
424
529
  "serializedAgentHeartbeats": serialized_agent_heartbeats,
@@ -444,15 +549,15 @@ class DagsterCloudAgent:
444
549
  for deployment_name, heartbeat in heartbeats.items()
445
550
  ]
446
551
 
447
- instance.organization_scoped_graphql_client().execute(
552
+ self._instance.organization_scoped_graphql_client().execute(
448
553
  ADD_AGENT_HEARTBEATS_MUTATION,
449
554
  variable_values={
450
555
  "serializedAgentHeartbeats": serialized_agent_heartbeats,
451
556
  "serializedErrors": [
452
557
  serialize_value(
453
558
  TimestampedError(
454
- curr_time.timestamp(),
455
- SerializableErrorInfo(
559
+ timestamp=curr_time.timestamp(),
560
+ error=SerializableErrorInfo(
456
561
  error_message,
457
562
  stack=[],
458
563
  cls_name=None,
@@ -471,26 +576,24 @@ class DagsterCloudAgent:
471
576
  return self._executor
472
577
 
473
578
  @property
474
- def request_ids_to_futures(self) -> Dict[str, Future]:
579
+ def request_ids_to_futures(self) -> dict[str, Future]:
475
580
  return self._request_ids_to_futures
476
581
 
477
582
  def _upload_outdated_workspace_entries(
478
583
  self,
479
- instance: DagsterCloudAgentInstance,
480
584
  deployment_name: str,
481
585
  is_branch_deployment: bool,
482
586
  user_code_launcher: DagsterCloudUserCodeLauncher,
483
587
  ):
484
- result = instance.graphql_client_for_deployment(deployment_name).execute(
588
+ result = self._instance.graphql_client_for_deployment(deployment_name).execute(
485
589
  WORKSPACE_ENTRIES_QUERY,
486
590
  variable_values={
487
591
  "deploymentNames": [deployment_name],
488
592
  "includeAllServerlessDeployments": False,
489
- "agentQueues": instance.agent_queues_config.queues,
593
+ "agentQueues": self._instance.agent_queues_config.queues,
490
594
  },
491
595
  )
492
596
  entries = result["data"]["deployments"][0]["workspaceEntries"]
493
- now = time.time()
494
597
 
495
598
  upload_metadata = {}
496
599
 
@@ -501,10 +604,11 @@ class DagsterCloudAgent:
501
604
  )
502
605
  if entry["hasOutdatedData"]:
503
606
  # Spin up a server for this location and upload its metadata to Cloud
504
- # (Bump the TTL counter as well to leave the server up)
607
+ # (Bump the TTL counter as well to leave the server up - ensure that a slightly
608
+ # different timestamp is chosen for each location to break ties)
505
609
  self._location_query_times[
506
610
  (deployment_name, location_name, is_branch_deployment)
507
- ] = now
611
+ ] = time.time()
508
612
  upload_metadata[(deployment_name, location_name)] = UserCodeLauncherEntry(
509
613
  code_location_deploy_data=code_location_deploy_data,
510
614
  update_timestamp=float(entry["metadataTimestamp"]),
@@ -516,16 +620,14 @@ class DagsterCloudAgent:
516
620
  # branch deployments always have TTLs, other deployments only if you asked for it specifically
517
621
  return is_branch_deployment or user_code_launcher.server_ttl_enabled_for_full_deployments
518
622
 
519
- def _get_ttl_seconds(self, instance, is_branch_deployment):
623
+ def _get_ttl_seconds(self, is_branch_deployment):
520
624
  return (
521
- instance.user_code_launcher.branch_deployment_ttl_seconds
625
+ self._instance.user_code_launcher.branch_deployment_ttl_seconds
522
626
  if is_branch_deployment
523
- else instance.user_code_launcher.full_deployment_ttl_seconds
627
+ else self._instance.user_code_launcher.full_deployment_ttl_seconds
524
628
  )
525
629
 
526
- def _get_locations_with_ttl_to_query(
527
- self, instance, user_code_launcher
528
- ) -> List[Tuple[str, str]]:
630
+ def _get_locations_with_ttl_to_query(self, user_code_launcher) -> list[tuple[str, str]]:
529
631
  now = time.time()
530
632
 
531
633
  # For the deployments with TTLs, decide which locations to consider
@@ -533,13 +635,13 @@ class DagsterCloudAgent:
533
635
  # - a) There's a pending request in the queue for it
534
636
  # - b) Its TTL hasn't expired since the last time somebody asked for it
535
637
  # Always include locations in a), and add locations from b) until you hit a limit
536
- location_candidates: Set[Tuple[str, str, float]] = {
638
+ location_candidates: set[tuple[str, str, float]] = {
537
639
  (deployment, location, -1.0) # Score below 0 so that they're at the front of the list
538
640
  for deployment, location, is_branch_deployment in self._locations_with_pending_requests
539
641
  if self._has_ttl(user_code_launcher, is_branch_deployment)
540
642
  }
541
643
 
542
- num_locations_to_query = instance.user_code_launcher.server_ttl_max_servers
644
+ num_locations_to_query = self._instance.user_code_launcher.server_ttl_max_servers
543
645
 
544
646
  if len(location_candidates) > num_locations_to_query:
545
647
  self._logger.warning(
@@ -561,7 +663,7 @@ class DagsterCloudAgent:
561
663
 
562
664
  time_since_last_query = now - query_time
563
665
 
564
- if time_since_last_query >= self._get_ttl_seconds(instance, is_branch_deployment):
666
+ if time_since_last_query >= self._get_ttl_seconds(is_branch_deployment):
565
667
  continue
566
668
 
567
669
  location_candidates.add((deployment_name, location, time_since_last_query))
@@ -583,13 +685,10 @@ class DagsterCloudAgent:
583
685
 
584
686
  def _query_for_workspace_updates(
585
687
  self,
586
- instance: DagsterCloudAgentInstance,
587
688
  user_code_launcher: DagsterCloudUserCodeLauncher,
588
689
  upload_all: bool,
589
690
  ):
590
- locations_with_ttl_to_query = self._get_locations_with_ttl_to_query(
591
- instance, user_code_launcher
592
- )
691
+ locations_with_ttl_to_query = self._get_locations_with_ttl_to_query(user_code_launcher)
593
692
 
594
693
  deployments_to_query = {key[0] for key in locations_with_ttl_to_query}
595
694
 
@@ -600,23 +699,23 @@ class DagsterCloudAgent:
600
699
  self._logger.debug(f"Querying for the following locations with TTL: {locations_str}")
601
700
 
602
701
  # If you have specified a non-branch deployment and no TTL, always consider it
603
- if instance.deployment_names:
604
- deployments_to_query = deployments_to_query.union(set(instance.deployment_names))
702
+ if self._instance.deployment_names:
703
+ deployments_to_query = deployments_to_query.union(set(self._instance.deployment_names))
605
704
 
606
705
  # Create mapping of
607
706
  # - location name => deployment metadata
608
- deployment_map: Dict[Tuple[str, str], UserCodeLauncherEntry] = {}
609
- all_locations: Set[Tuple[str, str]] = set()
707
+ deployment_map: dict[tuple[str, str], UserCodeLauncherEntry] = {}
708
+ all_locations: set[tuple[str, str]] = set()
610
709
 
611
710
  self._active_deployments = set()
612
711
 
613
- if deployments_to_query or instance.include_all_serverless_deployments:
614
- result = instance.organization_scoped_graphql_client().execute(
712
+ if deployments_to_query or self._instance.include_all_serverless_deployments:
713
+ result = self._instance.organization_scoped_graphql_client().execute(
615
714
  WORKSPACE_ENTRIES_QUERY,
616
715
  variable_values={
617
716
  "deploymentNames": list(deployments_to_query),
618
- "includeAllServerlessDeployments": instance.include_all_serverless_deployments,
619
- "agentQueues": instance.agent_queues_config.queues,
717
+ "includeAllServerlessDeployments": self._instance.include_all_serverless_deployments,
718
+ "agentQueues": self._instance.agent_queues_config.queues,
620
719
  },
621
720
  )
622
721
 
@@ -645,7 +744,7 @@ class DagsterCloudAgent:
645
744
  # only include the locations within locations_with_ttl_to_query.
646
745
  if not self._has_ttl(
647
746
  user_code_launcher, is_branch_deployment
648
- ) or location_key in cast(Set[Tuple[str, str]], locations_with_ttl_to_query):
747
+ ) or location_key in cast("set[tuple[str, str]]", locations_with_ttl_to_query):
649
748
  deployment_map[location_key] = UserCodeLauncherEntry(
650
749
  code_location_deploy_data=code_location_deploy_data,
651
750
  update_timestamp=float(entry["metadataTimestamp"]),
@@ -718,17 +817,15 @@ class DagsterCloudAgent:
718
817
  else:
719
818
  return None
720
819
 
721
- def _get_user_code_instance_ref(
722
- self, instance: DagsterCloudAgentInstance, deployment_name: str
723
- ):
724
- return get_instance_ref_for_user_code(instance.ref_for_deployment(deployment_name))
820
+ @cached_method
821
+ def _get_user_code_instance_ref(self, deployment_name: str):
822
+ return get_instance_ref_for_user_code(self._instance.ref_for_deployment(deployment_name))
725
823
 
726
824
  def _handle_api_request(
727
825
  self,
728
826
  request: DagsterCloudApiRequest,
729
827
  deployment_name: str,
730
828
  is_branch_deployment: bool,
731
- instance: DagsterCloudAgentInstance,
732
829
  user_code_launcher: DagsterCloudUserCodeLauncher,
733
830
  ) -> Union[DagsterCloudApiSuccess, DagsterCloudApiGrpcResponse]:
734
831
  api_name = request.request_api
@@ -743,107 +840,153 @@ class DagsterCloudAgent:
743
840
  # Dagster Cloud has requested that we upload new metadata for any out of date locations in
744
841
  # the workspace
745
842
  self._upload_outdated_workspace_entries(
746
- instance, deployment_name, is_branch_deployment, user_code_launcher
843
+ deployment_name, is_branch_deployment, user_code_launcher
747
844
  )
748
845
  return DagsterCloudApiSuccess()
749
846
  elif api_name == DagsterCloudApi.GET_EXTERNAL_EXECUTION_PLAN:
750
847
  client = self._get_grpc_client(
751
- user_code_launcher, deployment_name, cast(str, location_name)
848
+ user_code_launcher, deployment_name, cast("str", location_name)
752
849
  )
753
850
  serialized_snapshot_or_error = client.execution_plan_snapshot(
754
851
  execution_plan_snapshot_args=request.request_args._replace(
755
- instance_ref=self._get_user_code_instance_ref(instance, deployment_name)
852
+ instance_ref=self._get_user_code_instance_ref(deployment_name)
756
853
  )
757
854
  )
758
- return DagsterCloudApiGrpcResponse(serialized_snapshot_or_error)
855
+ return DagsterCloudApiGrpcResponse(
856
+ serialized_response_or_error=serialized_snapshot_or_error
857
+ )
759
858
 
760
859
  elif api_name == DagsterCloudApi.GET_SUBSET_EXTERNAL_PIPELINE_RESULT:
761
860
  client = self._get_grpc_client(
762
- user_code_launcher, deployment_name, cast(str, location_name)
861
+ user_code_launcher, deployment_name, cast("str", location_name)
763
862
  )
764
863
 
765
864
  serialized_subset_result_or_error = client.external_pipeline_subset(
766
865
  pipeline_subset_snapshot_args=request.request_args
767
866
  )
768
867
 
769
- return DagsterCloudApiGrpcResponse(serialized_subset_result_or_error)
868
+ return DagsterCloudApiGrpcResponse(
869
+ serialized_response_or_error=serialized_subset_result_or_error
870
+ )
770
871
  elif api_name == DagsterCloudApi.GET_EXTERNAL_PARTITION_CONFIG:
771
872
  client = self._get_grpc_client(
772
- user_code_launcher, deployment_name, cast(str, location_name)
873
+ user_code_launcher, deployment_name, cast("str", location_name)
773
874
  )
774
875
  serialized_partition_config_or_error = client.external_partition_config(
775
876
  partition_args=request.request_args,
776
877
  )
777
- return DagsterCloudApiGrpcResponse(serialized_partition_config_or_error)
878
+ return DagsterCloudApiGrpcResponse(
879
+ serialized_response_or_error=serialized_partition_config_or_error
880
+ )
778
881
  elif api_name == DagsterCloudApi.GET_EXTERNAL_PARTITION_TAGS:
779
882
  client = self._get_grpc_client(
780
- user_code_launcher, deployment_name, cast(str, location_name)
883
+ user_code_launcher, deployment_name, cast("str", location_name)
781
884
  )
782
885
  serialized_partition_tags_or_error = client.external_partition_tags(
783
886
  partition_args=request.request_args,
784
887
  )
785
- return DagsterCloudApiGrpcResponse(serialized_partition_tags_or_error)
888
+ return DagsterCloudApiGrpcResponse(
889
+ serialized_response_or_error=serialized_partition_tags_or_error
890
+ )
786
891
  elif api_name == DagsterCloudApi.GET_EXTERNAL_PARTITION_NAMES:
787
892
  client = self._get_grpc_client(
788
- user_code_launcher, deployment_name, cast(str, location_name)
893
+ user_code_launcher, deployment_name, cast("str", location_name)
789
894
  )
790
895
  serialized_partition_names_or_error = client.external_partition_names(
791
896
  partition_names_args=request.request_args,
792
897
  )
793
- return DagsterCloudApiGrpcResponse(serialized_partition_names_or_error)
898
+ return DagsterCloudApiGrpcResponse(
899
+ serialized_response_or_error=serialized_partition_names_or_error
900
+ )
794
901
  elif api_name == DagsterCloudApi.GET_EXTERNAL_PARTITION_SET_EXECUTION_PARAM_DATA:
795
902
  client = self._get_grpc_client(
796
- user_code_launcher, deployment_name, cast(str, location_name)
903
+ user_code_launcher, deployment_name, cast("str", location_name)
797
904
  )
798
905
  serialized_partition_execution_params_or_error = (
799
906
  client.external_partition_set_execution_params(
800
907
  partition_set_execution_param_args=request.request_args
801
908
  )
802
909
  )
803
- return DagsterCloudApiGrpcResponse(serialized_partition_execution_params_or_error)
910
+ return DagsterCloudApiGrpcResponse(
911
+ serialized_response_or_error=serialized_partition_execution_params_or_error
912
+ )
804
913
  elif api_name == DagsterCloudApi.GET_EXTERNAL_SCHEDULE_EXECUTION_DATA:
805
914
  client = self._get_grpc_client(
806
- user_code_launcher, deployment_name, cast(str, location_name)
915
+ user_code_launcher, deployment_name, cast("str", location_name)
807
916
  )
808
917
 
809
918
  args = request.request_args._replace(
810
- instance_ref=self._get_user_code_instance_ref(instance, deployment_name)
919
+ instance_ref=self._get_user_code_instance_ref(deployment_name)
811
920
  )
812
921
 
813
- serialized_schedule_data_or_error = client.external_schedule_execution(
814
- external_schedule_execution_args=args,
815
- )
922
+ schedule_attributes = {
923
+ "schedule": args.schedule_name,
924
+ "repository": args.repository_origin.repository_name,
925
+ "location": args.repository_origin.code_location_origin.location_name,
926
+ "deployment": deployment_name,
927
+ }
816
928
 
817
- return DagsterCloudApiGrpcResponse(serialized_schedule_data_or_error)
929
+ with observe_execution(
930
+ opentelemetry=self._instance.opentelemetry,
931
+ event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.schedule.evaluation",
932
+ short_description="schedule evaluation requests",
933
+ attributes=schedule_attributes,
934
+ result_evaluator_callback=inspect_schedule_result,
935
+ ) as observer:
936
+ serialized_schedule_data_or_error = client.external_schedule_execution(
937
+ external_schedule_execution_args=args,
938
+ )
939
+ observer.evaluate_result(
940
+ serialized_data_or_error=serialized_schedule_data_or_error,
941
+ )
942
+ return DagsterCloudApiGrpcResponse(
943
+ serialized_response_or_error=serialized_schedule_data_or_error
944
+ )
818
945
 
819
946
  elif api_name == DagsterCloudApi.GET_EXTERNAL_SENSOR_EXECUTION_DATA:
820
947
  client = self._get_grpc_client(
821
- user_code_launcher, deployment_name, cast(str, location_name)
948
+ user_code_launcher, deployment_name, cast("str", location_name)
822
949
  )
823
950
 
824
951
  args = request.request_args._replace(
825
- instance_ref=self._get_user_code_instance_ref(instance, deployment_name)
952
+ instance_ref=self._get_user_code_instance_ref(deployment_name)
826
953
  )
827
954
 
828
- serialized_sensor_data_or_error = client.external_sensor_execution(
829
- sensor_execution_args=args,
830
- )
955
+ sensor_attributes = {
956
+ "sensor": args.sensor_name,
957
+ "repository": args.repository_origin.repository_name,
958
+ "location": args.repository_origin.code_location_origin.location_name,
959
+ "deployment": deployment_name,
960
+ }
831
961
 
832
- return DagsterCloudApiGrpcResponse(serialized_sensor_data_or_error)
962
+ with observe_execution(
963
+ opentelemetry=self._instance.opentelemetry,
964
+ event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.sensor.evaluation",
965
+ short_description="sensor evaluation requests",
966
+ attributes=sensor_attributes,
967
+ result_evaluator_callback=inspect_sensor_result,
968
+ ) as observer:
969
+ serialized_sensor_data_or_error = client.external_sensor_execution(
970
+ sensor_execution_args=args,
971
+ )
972
+ observer.evaluate_result(serialized_sensor_data_or_error)
973
+ return DagsterCloudApiGrpcResponse(
974
+ serialized_response_or_error=serialized_sensor_data_or_error
975
+ )
833
976
  elif api_name == DagsterCloudApi.GET_EXTERNAL_NOTEBOOK_DATA:
834
977
  client = self._get_grpc_client(
835
- user_code_launcher, deployment_name, cast(str, location_name)
978
+ user_code_launcher, deployment_name, cast("str", location_name)
836
979
  )
837
980
  response = client.external_notebook_data(request.request_args.notebook_path)
838
- return DagsterCloudApiGrpcResponse(response.decode())
981
+ return DagsterCloudApiGrpcResponse(serialized_response_or_error=response.decode())
839
982
  elif api_name == DagsterCloudApi.LAUNCH_RUN:
840
983
  run = request.request_args.dagster_run
841
984
 
842
985
  with DagsterInstance.from_ref(
843
- self._get_user_code_instance_ref(instance, deployment_name)
986
+ self._get_user_code_instance_ref(deployment_name)
844
987
  ) as scoped_instance:
845
988
  scoped_instance.report_engine_event(
846
- f"{instance.agent_display_name} is launching run {run.run_id}",
989
+ f"{self._instance.agent_display_name} is launching run {run.run_id}",
847
990
  run,
848
991
  cls=self.__class__,
849
992
  )
@@ -852,40 +995,50 @@ class DagsterCloudAgent:
852
995
  run.run_id,
853
996
  merge_dicts(
854
997
  (
855
- {"dagster/agent_label": instance.dagster_cloud_api_agent_label}
856
- if instance.dagster_cloud_api_agent_label
998
+ {"dagster/agent_label": self._instance.dagster_cloud_api_agent_label}
999
+ if self._instance.dagster_cloud_api_agent_label
857
1000
  else {}
858
1001
  ),
859
- {"dagster/agent_id": instance.instance_uuid},
1002
+ {"dagster/agent_id": self._instance.instance_uuid},
860
1003
  ),
861
1004
  )
862
1005
 
863
- launcher = scoped_instance.get_run_launcher_for_run(run) # type: ignore # (instance subclass)
1006
+ run_attributes = extract_run_attributes(deployment_name, run)
1007
+ with observe_execution(
1008
+ opentelemetry=self._instance.opentelemetry,
1009
+ event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.run.launches",
1010
+ short_description="run execution requests",
1011
+ attributes=run_attributes,
1012
+ ) as observer:
1013
+ launcher = scoped_instance.get_run_launcher_for_run(run) # type: ignore # (instance subclass)
864
1014
 
865
- if is_isolated_run(run):
866
- launcher.launch_run(LaunchRunContext(dagster_run=run, workspace=None))
867
- else:
868
- scoped_instance.report_engine_event(
869
- f"Launching {run.run_id} without an isolated run environment.",
870
- run,
871
- cls=self.__class__,
872
- )
1015
+ if is_isolated_run(run):
1016
+ launcher.launch_run(LaunchRunContext(dagster_run=run, workspace=None))
1017
+ else:
1018
+ scoped_instance.report_engine_event(
1019
+ f"Launching {run.run_id} without an isolated run environment.",
1020
+ run,
1021
+ cls=self.__class__,
1022
+ )
873
1023
 
874
- run_location_name = cast(
875
- str,
876
- run.external_job_origin.repository_origin.code_location_origin.location_name,
877
- )
1024
+ run_location_name = cast(
1025
+ "str",
1026
+ run.remote_job_origin.repository_origin.code_location_origin.location_name,
1027
+ )
878
1028
 
879
- server = user_code_launcher.get_grpc_server(deployment_name, run_location_name)
1029
+ server = user_code_launcher.get_grpc_server(
1030
+ deployment_name, run_location_name
1031
+ )
880
1032
 
881
- # Record the server handle that we launched it on to for run monitoring
882
- scoped_instance.add_run_tags(
883
- run.run_id, new_tags={SERVER_HANDLE_TAG: str(server.server_handle)}
884
- )
1033
+ # Record the server handle that we launched it on to for run monitoring
1034
+ scoped_instance.add_run_tags(
1035
+ run.run_id, new_tags={SERVER_HANDLE_TAG: str(server.server_handle)}
1036
+ )
885
1037
 
886
- launcher.launch_run_from_grpc_client(
887
- scoped_instance, run, server.server_endpoint.create_client()
888
- )
1038
+ launcher.launch_run_from_grpc_client(
1039
+ scoped_instance, run, server.server_endpoint.create_client()
1040
+ )
1041
+ observer.evaluate_result(run=run)
889
1042
 
890
1043
  return DagsterCloudApiSuccess()
891
1044
  elif api_name == DagsterCloudApi.TERMINATE_RUN:
@@ -896,18 +1049,18 @@ class DagsterCloudAgent:
896
1049
  run = request.request_args.dagster_run
897
1050
 
898
1051
  with DagsterInstance.from_ref(
899
- instance.ref_for_deployment(deployment_name)
1052
+ self._instance.ref_for_deployment(deployment_name)
900
1053
  ) as scoped_instance:
901
- if instance.is_using_isolated_agents:
1054
+ if self._instance.is_using_isolated_agents:
902
1055
  scoped_instance.report_engine_event(
903
- f"{instance.agent_display_name} received request to mark run as canceling",
1056
+ f"{self._instance.agent_display_name} received request to mark run as canceling",
904
1057
  run,
905
1058
  cls=self.__class__,
906
1059
  )
907
1060
  scoped_instance.report_run_canceling(run)
908
1061
  else:
909
1062
  scoped_instance.report_engine_event(
910
- f"{instance.agent_display_name} received request to terminate run",
1063
+ f"{self._instance.agent_display_name} received request to terminate run",
911
1064
  run,
912
1065
  cls=self.__class__,
913
1066
  )
@@ -916,8 +1069,8 @@ class DagsterCloudAgent:
916
1069
  launcher.terminate(run.run_id)
917
1070
  else:
918
1071
  run_location_name = cast(
919
- str,
920
- run.external_job_origin.repository_origin.code_location_origin.location_name,
1072
+ "str",
1073
+ run.remote_job_origin.repository_origin.code_location_origin.location_name,
921
1074
  )
922
1075
 
923
1076
  server = user_code_launcher.get_grpc_server(
@@ -929,15 +1082,20 @@ class DagsterCloudAgent:
929
1082
  client.cancel_execution(CancelExecutionRequest(run_id=run.run_id))
930
1083
 
931
1084
  return DagsterCloudApiSuccess()
932
-
1085
+ elif api_name in (
1086
+ DagsterCloudApi.CHECK_STEP_HEALTH,
1087
+ DagsterCloudApi.TERMINATE_STEP,
1088
+ DagsterCloudApi.LAUNCH_STEP,
1089
+ DagsterCloudApi.CHECK_RUN_HEALTH,
1090
+ DagsterCloudApi.LOAD_REPOSITORIES,
1091
+ ):
1092
+ check.failed(f"Unexpected deprecated request type {api_name}")
933
1093
  else:
934
1094
  check.assert_never(api_name)
935
- raise Exception(f"Unexpected dagster cloud api call {api_name}")
936
1095
 
937
1096
  def _process_api_request(
938
1097
  self,
939
- json_request: Dict,
940
- instance: DagsterCloudAgentInstance,
1098
+ json_request: dict,
941
1099
  user_code_launcher: DagsterCloudUserCodeLauncher,
942
1100
  submitted_to_executor_timestamp: float,
943
1101
  ) -> Optional[SerializableErrorInfo]:
@@ -956,7 +1114,7 @@ class DagsterCloudAgent:
956
1114
  )
957
1115
 
958
1116
  if request_api not in DagsterCloudApi.__members__:
959
- api_result = DagsterCloudApiUnknownCommandResponse(request_api)
1117
+ api_result = DagsterCloudApiUnknownCommandResponse(request_api=request_api)
960
1118
  self._logger.warning(
961
1119
  f"Ignoring request {json_request}: Unknown command. This is likely due to running an "
962
1120
  "older version of the agent."
@@ -966,7 +1124,7 @@ class DagsterCloudAgent:
966
1124
  request = deserialize_value(request_body, DagsterCloudApiRequest)
967
1125
  self._logger.info(f"Received request {request}.")
968
1126
  api_result = self._handle_api_request(
969
- request, deployment_name, is_branch_deployment, instance, user_code_launcher
1127
+ request, deployment_name, is_branch_deployment, user_code_launcher
970
1128
  )
971
1129
  except Exception:
972
1130
  error_info = serializable_error_info_from_exc_info(sys.exc_info())
@@ -995,13 +1153,28 @@ class DagsterCloudAgent:
995
1153
 
996
1154
  self._logger.info(f"Uploading response for request {request}.")
997
1155
 
998
- upload_api_response(instance, deployment_name, upload_response)
1156
+ if UPLOAD_API_RESPONSE_BATCHING_ENABLED():
1157
+ self._batcher[deployment_name].submit((deployment_name, upload_response))
1158
+ else:
1159
+ upload_api_response(self._instance, deployment_name, upload_response)
999
1160
 
1000
1161
  self._logger.info(f"Finished uploading response for request {request}.")
1001
1162
 
1002
1163
  return error_info
1003
1164
 
1004
- def _get_location_from_request(self, json_request: Dict[str, Any]) -> Optional[str]:
1165
+ def _batch_upload_api_response(
1166
+ self, upload_response_batch: list[tuple[str, DagsterCloudUploadApiResponse]]
1167
+ ) -> list[None]:
1168
+ deployment_names = set(deployment_name for deployment_name, _ in upload_response_batch)
1169
+ assert len(deployment_names) == 1
1170
+ batch_upload_api_response(
1171
+ self._instance,
1172
+ next(iter(deployment_names)),
1173
+ [resp for _, resp in upload_response_batch],
1174
+ )
1175
+ return [None for _ in upload_response_batch]
1176
+
1177
+ def _get_location_from_request(self, json_request: dict[str, Any]) -> Optional[str]:
1005
1178
  request_api = json_request["requestApi"]
1006
1179
  request_body = json_request["requestBody"]
1007
1180
  if request_api not in DagsterCloudApi.__members__:
@@ -1015,7 +1188,7 @@ class DagsterCloudAgent:
1015
1188
  return location_origin.location_name
1016
1189
 
1017
1190
  def run_iteration(
1018
- self, instance: DagsterCloudAgentInstance, user_code_launcher: DagsterCloudUserCodeLauncher
1191
+ self, user_code_launcher: DagsterCloudUserCodeLauncher
1019
1192
  ) -> Iterator[Optional[SerializableErrorInfo]]:
1020
1193
  if not user_code_launcher.ready_to_serve_requests:
1021
1194
  return
@@ -1024,12 +1197,12 @@ class DagsterCloudAgent:
1024
1197
 
1025
1198
  if num_pending_requests < self._pending_requests_limit:
1026
1199
  # limit (implicit default 10) applied separately for requests to branch deployments, and for each full deployment
1027
- result = instance.organization_scoped_graphql_client().execute(
1200
+ result = self._instance.organization_scoped_graphql_client().execute(
1028
1201
  GET_USER_CLOUD_REQUESTS_QUERY,
1029
1202
  {
1030
- "forBranchDeployments": instance.includes_branch_deployments,
1203
+ "forBranchDeployments": self._instance.includes_branch_deployments,
1031
1204
  "forFullDeployments": self._active_full_deployment_names,
1032
- "agentQueues": instance.agent_queues_config.queues,
1205
+ "agentQueues": self._instance.agent_queues_config.queues,
1033
1206
  },
1034
1207
  )
1035
1208
  json_requests = result["data"]["userCloudAgent"]["popUserCloudAgentRequests"]
@@ -1082,7 +1255,6 @@ class DagsterCloudAgent:
1082
1255
  future = self._executor.submit(
1083
1256
  self._process_api_request,
1084
1257
  json_request,
1085
- instance,
1086
1258
  user_code_launcher,
1087
1259
  submitted_to_executor_timestamp,
1088
1260
  )
@@ -1109,7 +1281,7 @@ class DagsterCloudAgent:
1109
1281
  if response:
1110
1282
  yield response
1111
1283
 
1112
- if instance.user_code_launcher.agent_metrics_enabled and (
1284
+ if self._instance.user_code_launcher.agent_metrics_enabled and (
1113
1285
  self._utilization_metrics["container_utilization"]["measurement_timestamp"] is None
1114
1286
  or (
1115
1287
  get_current_timestamp()
@@ -1123,6 +1295,22 @@ class DagsterCloudAgent:
1123
1295
  yield None
1124
1296
 
1125
1297
 
1298
+ def batch_upload_api_response(
1299
+ instance: DagsterCloudAgentInstance,
1300
+ deployment_name: str,
1301
+ batch: list[DagsterCloudUploadApiResponse],
1302
+ ):
1303
+ with compressed_namedtuple_upload_file(BatchDagsterCloudUploadApiResponse(batch=batch)) as f:
1304
+ resp = instance.requests_managed_retries_session.put(
1305
+ instance.dagster_cloud_upload_api_response_url,
1306
+ headers=instance.headers_for_deployment(deployment_name),
1307
+ files={"api_response_batch.tmp": f},
1308
+ timeout=instance.dagster_cloud_api_timeout,
1309
+ proxies=instance.dagster_cloud_api_proxies,
1310
+ )
1311
+ raise_http_error(resp)
1312
+
1313
+
1126
1314
  def upload_api_response(
1127
1315
  instance: DagsterCloudAgentInstance,
1128
1316
  deployment_name: str,