dagster-cloud 1.8.2__py3-none-any.whl → 1.12.6__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registries. It is provided for informational purposes only.
Files changed (108)
  1. dagster_cloud/__init__.py +3 -3
  2. dagster_cloud/agent/__init__.py +4 -4
  3. dagster_cloud/agent/cli/__init__.py +56 -17
  4. dagster_cloud/agent/dagster_cloud_agent.py +360 -172
  5. dagster_cloud/agent/instrumentation/__init__.py +0 -0
  6. dagster_cloud/agent/instrumentation/constants.py +2 -0
  7. dagster_cloud/agent/instrumentation/run_launch.py +23 -0
  8. dagster_cloud/agent/instrumentation/schedule.py +34 -0
  9. dagster_cloud/agent/instrumentation/sensor.py +34 -0
  10. dagster_cloud/anomaly_detection/__init__.py +2 -2
  11. dagster_cloud/anomaly_detection/defs.py +17 -12
  12. dagster_cloud/anomaly_detection/types.py +3 -3
  13. dagster_cloud/api/dagster_cloud_api.py +209 -293
  14. dagster_cloud/auth/constants.py +21 -5
  15. dagster_cloud/batching/__init__.py +1 -0
  16. dagster_cloud/batching/batcher.py +210 -0
  17. dagster_cloud/dagster_insights/__init__.py +12 -6
  18. dagster_cloud/dagster_insights/bigquery/bigquery_utils.py +3 -2
  19. dagster_cloud/dagster_insights/bigquery/dbt_wrapper.py +39 -12
  20. dagster_cloud/dagster_insights/bigquery/insights_bigquery_resource.py +8 -6
  21. dagster_cloud/dagster_insights/insights_utils.py +18 -8
  22. dagster_cloud/dagster_insights/metrics_utils.py +12 -12
  23. dagster_cloud/dagster_insights/snowflake/dagster_snowflake_insights.py +5 -12
  24. dagster_cloud/dagster_insights/snowflake/dbt_wrapper.py +34 -8
  25. dagster_cloud/dagster_insights/snowflake/definitions.py +38 -12
  26. dagster_cloud/dagster_insights/snowflake/insights_snowflake_resource.py +11 -23
  27. dagster_cloud/definitions/__init__.py +0 -0
  28. dagster_cloud/definitions/job_selection.py +36 -0
  29. dagster_cloud/execution/cloud_run_launcher/k8s.py +1 -1
  30. dagster_cloud/execution/cloud_run_launcher/process.py +3 -3
  31. dagster_cloud/execution/monitoring/__init__.py +27 -33
  32. dagster_cloud/execution/utils/process.py +3 -3
  33. dagster_cloud/instance/__init__.py +125 -38
  34. dagster_cloud/instrumentation/__init__.py +32 -0
  35. dagster_cloud/metadata/source_code.py +13 -8
  36. dagster_cloud/metrics/__init__.py +0 -0
  37. dagster_cloud/metrics/tracer.py +59 -0
  38. dagster_cloud/opentelemetry/__init__.py +0 -0
  39. dagster_cloud/opentelemetry/config/__init__.py +73 -0
  40. dagster_cloud/opentelemetry/config/exporter.py +81 -0
  41. dagster_cloud/opentelemetry/config/log_record_processor.py +40 -0
  42. dagster_cloud/opentelemetry/config/logging_handler.py +14 -0
  43. dagster_cloud/opentelemetry/config/meter_provider.py +9 -0
  44. dagster_cloud/opentelemetry/config/metric_reader.py +39 -0
  45. dagster_cloud/opentelemetry/controller.py +319 -0
  46. dagster_cloud/opentelemetry/enum.py +58 -0
  47. dagster_cloud/opentelemetry/factories/__init__.py +1 -0
  48. dagster_cloud/opentelemetry/factories/logs.py +113 -0
  49. dagster_cloud/opentelemetry/factories/metrics.py +121 -0
  50. dagster_cloud/opentelemetry/metrics/__init__.py +0 -0
  51. dagster_cloud/opentelemetry/metrics/meter.py +140 -0
  52. dagster_cloud/opentelemetry/observers/__init__.py +0 -0
  53. dagster_cloud/opentelemetry/observers/dagster_exception_handler.py +40 -0
  54. dagster_cloud/opentelemetry/observers/execution_observer.py +178 -0
  55. dagster_cloud/pex/grpc/__generated__/multi_pex_api_pb2.pyi +175 -0
  56. dagster_cloud/pex/grpc/__init__.py +2 -2
  57. dagster_cloud/pex/grpc/client.py +4 -4
  58. dagster_cloud/pex/grpc/compile.py +2 -2
  59. dagster_cloud/pex/grpc/server/__init__.py +2 -2
  60. dagster_cloud/pex/grpc/server/cli/__init__.py +31 -19
  61. dagster_cloud/pex/grpc/server/manager.py +60 -42
  62. dagster_cloud/pex/grpc/server/registry.py +28 -21
  63. dagster_cloud/pex/grpc/server/server.py +23 -14
  64. dagster_cloud/pex/grpc/types.py +5 -5
  65. dagster_cloud/py.typed +0 -0
  66. dagster_cloud/secrets/__init__.py +1 -1
  67. dagster_cloud/secrets/loader.py +3 -3
  68. dagster_cloud/serverless/__init__.py +1 -1
  69. dagster_cloud/serverless/io_manager.py +36 -53
  70. dagster_cloud/storage/client.py +54 -17
  71. dagster_cloud/storage/compute_logs/__init__.py +3 -1
  72. dagster_cloud/storage/compute_logs/compute_log_manager.py +22 -17
  73. dagster_cloud/storage/defs_state/__init__.py +3 -0
  74. dagster_cloud/storage/defs_state/queries.py +15 -0
  75. dagster_cloud/storage/defs_state/storage.py +113 -0
  76. dagster_cloud/storage/event_logs/__init__.py +3 -1
  77. dagster_cloud/storage/event_logs/queries.py +102 -4
  78. dagster_cloud/storage/event_logs/storage.py +266 -73
  79. dagster_cloud/storage/event_logs/utils.py +88 -7
  80. dagster_cloud/storage/runs/__init__.py +1 -1
  81. dagster_cloud/storage/runs/queries.py +17 -2
  82. dagster_cloud/storage/runs/storage.py +88 -42
  83. dagster_cloud/storage/schedules/__init__.py +1 -1
  84. dagster_cloud/storage/schedules/storage.py +6 -8
  85. dagster_cloud/storage/tags.py +66 -1
  86. dagster_cloud/util/__init__.py +10 -12
  87. dagster_cloud/util/errors.py +49 -64
  88. dagster_cloud/version.py +1 -1
  89. dagster_cloud/workspace/config_schema/__init__.py +55 -13
  90. dagster_cloud/workspace/docker/__init__.py +76 -25
  91. dagster_cloud/workspace/docker/utils.py +1 -1
  92. dagster_cloud/workspace/ecs/__init__.py +1 -1
  93. dagster_cloud/workspace/ecs/client.py +51 -33
  94. dagster_cloud/workspace/ecs/launcher.py +76 -22
  95. dagster_cloud/workspace/ecs/run_launcher.py +3 -3
  96. dagster_cloud/workspace/ecs/utils.py +14 -5
  97. dagster_cloud/workspace/kubernetes/__init__.py +1 -1
  98. dagster_cloud/workspace/kubernetes/launcher.py +61 -29
  99. dagster_cloud/workspace/kubernetes/utils.py +34 -22
  100. dagster_cloud/workspace/user_code_launcher/__init__.py +5 -3
  101. dagster_cloud/workspace/user_code_launcher/process.py +16 -14
  102. dagster_cloud/workspace/user_code_launcher/user_code_launcher.py +552 -172
  103. dagster_cloud/workspace/user_code_launcher/utils.py +105 -1
  104. {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/METADATA +48 -42
  105. dagster_cloud-1.12.6.dist-info/RECORD +134 -0
  106. {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/WHEEL +1 -1
  107. dagster_cloud-1.8.2.dist-info/RECORD +0 -100
  108. {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/top_level.txt +0 -0
@@ -2,6 +2,7 @@
 # ruff: noqa: PLE1205
 import asyncio
 import functools
+import hashlib
 import json
 import logging
 import os
@@ -11,57 +12,66 @@ import threading
 import time
 import zlib
 from abc import abstractmethod, abstractproperty
+from collections import defaultdict
+from collections.abc import Collection, Mapping, Sequence
 from concurrent.futures import ThreadPoolExecutor, as_completed, wait
 from contextlib import AbstractContextManager
-from typing import (
-    Any,
-    Callable,
-    Collection,
-    DefaultDict,
-    Dict,
-    Generic,
-    List,
-    Mapping,
-    NamedTuple,
-    Optional,
-    Sequence,
-    Set,
-    Tuple,
-    TypeVar,
-    Union,
-    cast,
-)
+from io import BytesIO
+from typing import Any, Callable, Generic, NamedTuple, Optional, TypeVar, Union, cast

 import dagster._check as check
 import grpc
 from dagster import BoolSource, Field, IntSource
-from dagster._api.list_repositories import sync_list_repositories_grpc
+from dagster._api.list_repositories import gen_list_repositories_grpc
 from dagster._core.definitions.selector import JobSelector
 from dagster._core.errors import DagsterUserCodeUnreachableError
 from dagster._core.instance import MayHaveInstanceWeakref
 from dagster._core.launcher import RunLauncher
-from dagster._core.remote_representation import RemoteRepositoryOrigin
-from dagster._core.remote_representation.origin import (
+from dagster._core.remote_origin import (
     CodeLocationOrigin,
     RegisteredCodeLocationOrigin,
+    RemoteRepositoryOrigin,
+)
+from dagster._core.remote_representation.external_data import (
+    extract_serialized_job_snap_from_serialized_job_data_snap,
 )
 from dagster._grpc.client import DagsterGrpcClient
-from dagster._grpc.types import GetCurrentImageResult
-from dagster._serdes import deserialize_value, serialize_value, whitelist_for_serdes
+from dagster._grpc.types import GetCurrentImageResult, ListRepositoriesResponse
+from dagster._serdes import (
+    deserialize_value,
+    pack_value,
+    serialize_value,
+    unpack_value,
+    whitelist_for_serdes,
+)
 from dagster._time import get_current_timestamp
-from dagster._utils.error import SerializableErrorInfo, serializable_error_info_from_exc_info
+from dagster._utils.error import (
+    SerializableErrorInfo,
+    serializable_error_info_from_exc_info,
+    truncate_serialized_error,
+)
 from dagster._utils.merger import merge_dicts
 from dagster._utils.typed_dict import init_optional_typeddict
 from dagster_cloud_cli.core.errors import raise_http_error
 from dagster_cloud_cli.core.workspace import CodeLocationDeployData
 from typing_extensions import Self, TypeAlias

+from dagster_cloud.agent.instrumentation.constants import DAGSTER_CLOUD_AGENT_METRIC_PREFIX
 from dagster_cloud.agent.queries import GET_AGENTS_QUERY
 from dagster_cloud.api.dagster_cloud_api import (
+    CheckSnapshotResult,
+    ConfirmUploadResult,
+    DagsterCloudCodeLocationManifest,
+    DagsterCloudCodeLocationUpdateResponse,
+    DagsterCloudCodeLocationUpdateResult,
+    DagsterCloudRepositoryManifest,
     DagsterCloudUploadLocationData,
     DagsterCloudUploadRepositoryData,
     DagsterCloudUploadWorkspaceEntry,
     DagsterCloudUploadWorkspaceResponse,
+    FileFormat,
+    SnapshotType,
+    StoredSnapshot,
     UserCodeDeploymentType,
 )
 from dagster_cloud.execution.monitoring import (
@@ -75,6 +85,8 @@ from dagster_cloud.execution.monitoring import (
     start_run_worker_monitoring_thread,
 )
 from dagster_cloud.instance import DagsterCloudAgentInstance
+from dagster_cloud.opentelemetry.controller import OpenTelemetryController
+from dagster_cloud.opentelemetry.observers.execution_observer import observe_execution
 from dagster_cloud.pex.grpc.client import MultiPexGrpcClient
 from dagster_cloud.pex.grpc.types import (
     CreatePexServerArgs,
@@ -83,7 +95,6 @@ from dagster_cloud.pex.grpc.types import (
     ShutdownPexServerArgs,
 )
 from dagster_cloud.util import diff_serializable_namedtuple_map
-from dagster_cloud.util.errors import truncate_serialized_error

 DEFAULT_SERVER_PROCESS_STARTUP_TIMEOUT = 180
 DEFAULT_MAX_TTL_SERVERS = 25
@@ -155,7 +166,7 @@ class UserCodeLauncherEntry(
         code_location_deploy_data,
         update_timestamp,
     ):
-        return super(UserCodeLauncherEntry, cls).__new__(
+        return super().__new__(
             cls,
             check.inst_param(
                 code_location_deploy_data, "code_location_deploy_data", CodeLocationDeployData
@@ -234,10 +245,13 @@ SHARED_USER_CODE_LAUNCHER_CONFIG = {
         BoolSource,
         is_required=False,
         default_value=True,
-        description=(
-            "Do not include full job snapshots in the workspace "
-            "snapshot, upload them separately if they have not been previously uploaded."
-        ),
+        description=("Deprecated - no longer used"),
+    ),
+    "direct_snapshot_uploads": Field(
+        BoolSource,
+        is_required=False,
+        default_value=True,
+        description=("Opt-out for uploading definition snapshots directly to blob storage."),
     ),
     "upload_snapshots_on_startup": Field(
         BoolSource,
@@ -260,8 +274,8 @@ SHARED_USER_CODE_LAUNCHER_CONFIG = {
     ),
 }

-DeploymentAndLocation: TypeAlias = Tuple[str, str]
-UserCodeLauncherEntryMap: TypeAlias = Dict[DeploymentAndLocation, UserCodeLauncherEntry]
+DeploymentAndLocation: TypeAlias = tuple[str, str]
+UserCodeLauncherEntryMap: TypeAlias = dict[DeploymentAndLocation, UserCodeLauncherEntry]


 class ServerEndpoint(
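
A change that recurs throughout this file is dropping the deprecated typing aliases (Tuple, Dict, List, Set, DefaultDict) in favor of PEP 585 builtin generics, with the ABCs (Collection, Mapping, Sequence) now imported from collections.abc. A minimal before/after sketch (illustrative, not from the package):

    # 1.8.2-era annotations: aliases imported from typing
    from typing import Dict, List, Optional, Tuple

    EndpointOld = Tuple[str, int]
    ServersOld = Dict[str, List[EndpointOld]]

    # 1.12.6-era annotations: builtin generics; Optional/Union still come from typing
    from typing import Optional

    Endpoint = tuple[str, int]
    Servers = dict[str, list[Endpoint]]
    maybe_port: Optional[int] = None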
@@ -271,12 +285,12 @@ class ServerEndpoint(
            ("host", str),
            ("port", Optional[int]),
            ("socket", Optional[str]),
-            ("metadata", Optional[List[Tuple[str, str]]]),
+            ("metadata", Optional[list[tuple[str, str]]]),
        ],
    )
 ):
     def __new__(cls, host, port, socket, metadata=None):
-        return super(ServerEndpoint, cls).__new__(
+        return super().__new__(
             cls,
             check.str_param(host, "host"),
             check.opt_int_param(port, "port"),
@@ -292,7 +306,7 @@ class ServerEndpoint(
     def create_multipex_client(self) -> MultiPexGrpcClient:
         return MultiPexGrpcClient(port=self.port, socket=self.socket, host=self.host)

-    def with_metadata(self, metadata: Optional[List[Tuple[str, str]]]):
+    def with_metadata(self, metadata: Optional[list[tuple[str, str]]]):
         return self._replace(metadata=metadata)


@@ -312,7 +326,7 @@ class DagsterCloudGrpcServer(
         server_endpoint: ServerEndpoint,
         code_location_deploy_data: CodeLocationDeployData,
     ):
-        return super(DagsterCloudGrpcServer, cls).__new__(
+        return super().__new__(
             cls,
             server_handle,
             check.inst_param(server_endpoint, "server_endpoint", ServerEndpoint),
@@ -322,57 +336,72 @@ class DagsterCloudGrpcServer(
         )


+_SUPPORTED_FILE_FORMATS = [FileFormat.GZIPPED_JSON, FileFormat.JSON]
+
+
+def _file_for_format(obj_bytes: bytes, fmt: str):
+    if fmt == FileFormat.JSON:
+        return BytesIO(obj_bytes)
+    elif fmt == FileFormat.GZIPPED_JSON:
+        return BytesIO(zlib.compress(obj_bytes))
+    else:
+        check.failed(f"Unexpected file format {fmt}")
+
+
 class DagsterCloudUserCodeLauncher(
     AbstractContextManager, MayHaveInstanceWeakref[DagsterCloudAgentInstance], Generic[ServerHandle]
 ):
     def __init__(
         self,
         server_ttl: Optional[dict] = None,
-        defer_job_snapshots: bool = True,
         server_process_startup_timeout=None,
         upload_snapshots_on_startup: bool = True,
         requires_healthcheck: bool = False,
         code_server_metrics: Optional[Mapping[str, Any]] = None,
         agent_metrics: Optional[Mapping[str, Any]] = None,
+        direct_snapshot_uploads: bool = False,
+        # ignored old setting, allowed to flow through to avoid breakage
+        defer_job_snapshots: bool = True,
     ):
-        self._grpc_servers: Dict[
+        self._grpc_servers: dict[
             DeploymentAndLocation, Union[DagsterCloudGrpcServer, SerializableErrorInfo]
         ] = {}
-        self._first_unavailable_times: Dict[DeploymentAndLocation, float] = {}
+        self._first_unavailable_times: dict[DeploymentAndLocation, float] = {}

-        self._pending_delete_grpc_server_handles: Set[ServerHandle] = set()
+        self._pending_delete_grpc_server_handles: set[ServerHandle] = set()
         self._grpc_servers_lock = threading.Lock()
-        self._per_location_metrics: Dict[
+        self._per_location_metrics: dict[
             DeploymentAndLocation, CloudCodeServerUtilizationMetrics
-        ] = DefaultDict(lambda: init_optional_typeddict(CloudCodeServerUtilizationMetrics))
+        ] = defaultdict(lambda: init_optional_typeddict(CloudCodeServerUtilizationMetrics))

-        self._multipex_servers: Dict[DeploymentAndLocation, DagsterCloudGrpcServer] = {}
+        self._multipex_servers: dict[DeploymentAndLocation, DagsterCloudGrpcServer] = {}

         self._server_ttl_config = check.opt_dict_param(server_ttl, "server_ttl")
-        self._defer_job_snapshots = defer_job_snapshots
+        self._direct_snapshot_uploads = direct_snapshot_uploads
         self.upload_snapshots_on_startup = check.bool_param(
             upload_snapshots_on_startup, "upload_snapshots_on_startup"
         )
         self._requires_healthcheck = check.bool_param(requires_healthcheck, "requires_healthcheck")

         # periodically reconciles to make desired = actual
-        self._desired_entries: Dict[DeploymentAndLocation, UserCodeLauncherEntry] = {}
-        self._actual_entries: Dict[DeploymentAndLocation, UserCodeLauncherEntry] = {}
+        self._desired_entries: dict[DeploymentAndLocation, UserCodeLauncherEntry] = {}
+        self._actual_entries: dict[DeploymentAndLocation, UserCodeLauncherEntry] = {}
         self._last_refreshed_actual_entries = 0
         self._last_cleaned_up_dangling_code_servers = 0
         self._metadata_lock = threading.Lock()

-        self._upload_locations: Set[DeploymentAndLocation] = set()
+        self._upload_locations: set[DeploymentAndLocation] = set()

         self._logger = logging.getLogger("dagster_cloud.user_code_launcher")
         self._event_logger = logging.getLogger("cloud-events")
         self._started: bool = False
         self._run_worker_monitoring_thread = None
         self._run_worker_monitoring_thread_shutdown_event = None
-        self._run_worker_deployments_to_check: Set[str] = set()
-        self._run_worker_statuses_dict: Dict[str, List[CloudRunWorkerStatus]] = {}
+        self._run_worker_deployments_to_check: set[str] = set()
+        self._run_worker_statuses_dict: dict[str, list[CloudRunWorkerStatus]] = {}
         self._run_worker_monitoring_lock = threading.Lock()

+        self._in_progress_reconcile_start_time = time.time()
         self._reconcile_count = 0
         self._reconcile_grpc_metadata_shutdown_event = threading.Event()
         self._reconcile_grpc_metadata_thread = None
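
The new module-level _file_for_format helper wraps serialized snapshot bytes in a file-like object in whichever encoding the server negotiates; note that the GZIPPED_JSON branch uses zlib.compress, so the body is zlib-framed rather than gzip-framed. A standalone sketch of the same idea, where the format string values are stand-ins rather than the package's FileFormat enum:

    import zlib
    from io import BytesIO

    JSON, GZIPPED_JSON = "json", "gzipped-json"  # assumed placeholder values

    def file_for_format(obj_bytes: bytes, fmt: str) -> BytesIO:
        if fmt == JSON:
            return BytesIO(obj_bytes)  # raw JSON bytes, uploaded as-is
        if fmt == GZIPPED_JSON:
            return BytesIO(zlib.compress(obj_bytes))  # smaller body on the wire
        raise ValueError(f"Unexpected file format {fmt}")

    payload = b'{"pipeline": "etl"}'
    assert zlib.decompress(file_for_format(payload, GZIPPED_JSON).read()) == payload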
@@ -390,7 +419,7 @@ class DagsterCloudUserCodeLauncher(
         self._agent_metrics_config = agent_metrics
         super().__init__()

-    def get_active_grpc_server_handles(self) -> List[ServerHandle]:
+    def get_active_grpc_server_handles(self) -> list[ServerHandle]:
         with self._grpc_servers_lock:
             return [
                 s.server_handle
@@ -398,7 +427,7 @@ class DagsterCloudUserCodeLauncher(
                 if not isinstance(s, SerializableErrorInfo)
             ] + list(self._pending_delete_grpc_server_handles)

-    def get_active_agent_ids(self) -> Optional[Set[str]]:
+    def get_active_agent_ids(self) -> Optional[set[str]]:
         try:
             result = self._instance.organization_scoped_graphql_client().execute(
                 GET_AGENTS_QUERY,
@@ -569,6 +598,193 @@
         workspace_entry: DagsterCloudUploadWorkspaceEntry,
         server_or_error: Union[DagsterCloudGrpcServer, SerializableErrorInfo],
     ) -> None:
+        if self._direct_snapshot_uploads:
+            self._update_workspace_entry_direct_upload(
+                deployment_name,
+                workspace_entry,
+                server_or_error,
+            )
+        else:
+            self._update_workspace_entry_server_upload(
+                deployment_name,
+                workspace_entry,
+                server_or_error,
+            )
+
+    def _ensure_snapshot_uploaded(
+        self,
+        deployment_name: str,
+        snapshot_type: str,
+        serialized_object: str,
+    ) -> StoredSnapshot:
+        object_bytes = serialized_object.encode("utf-8")
+        sha1 = hashlib.sha1(object_bytes).hexdigest()
+        byte_count = len(object_bytes)
+        response = self._instance.requests_managed_retries_session.get(
+            self._instance.dagster_cloud_check_snapshot_url,
+            headers=self._instance.headers_for_deployment(deployment_name),
+            params={
+                "type": snapshot_type,
+                "sha1": sha1,
+                "size": byte_count,
+                "formats": _SUPPORTED_FILE_FORMATS,
+            },
+            timeout=self._instance.dagster_cloud_api_timeout,
+            proxies=self._instance.dagster_cloud_api_proxies,
+        )
+        raise_http_error(response)
+
+        result = unpack_value(response.json(), CheckSnapshotResult)
+
+        if not result.stored_snapshot:
+            upload_data = check.not_none(
+                result.upload_data,
+                "upload_data expected when stored_snapshot is None",
+            )
+            file = _file_for_format(object_bytes, upload_data.format)
+            response = self._instance.requests_managed_retries_session.put(
+                url=upload_data.presigned_put_url,
+                data=file,
+                timeout=self._instance.dagster_cloud_api_timeout,
+            )
+            raise_http_error(response)
+
+            response = self._instance.requests_managed_retries_session.put(
+                self._instance.dagster_cloud_confirm_upload_url,
+                headers=self._instance.headers_for_deployment(deployment_name),
+                json=pack_value(upload_data),
+                timeout=self._instance.dagster_cloud_api_timeout,
+                proxies=self._instance.dagster_cloud_api_proxies,
+            )
+            raise_http_error(response)
+            result = unpack_value(response.json(), ConfirmUploadResult)
+            return result.stored_snapshot
+
+        return result.stored_snapshot
+
+    def _update_workspace_entry_direct_upload(
+        self,
+        deployment_name: str,
+        workspace_entry: DagsterCloudUploadWorkspaceEntry,
+        server_or_error: Union[DagsterCloudGrpcServer, SerializableErrorInfo],
+    ) -> None:
+        # updated scheme, uploading definitions to blob storage via signed urls
+        error_snap = None
+        manifest = None
+        if workspace_entry.serialized_error_info:
+            error_snap = self._ensure_snapshot_uploaded(
+                deployment_name,
+                SnapshotType.ERROR,
+                serialize_value(workspace_entry.serialized_error_info),
+            )
+        elif isinstance(server_or_error, SerializableErrorInfo):
+            error_snap = self._ensure_snapshot_uploaded(
+                deployment_name,
+                SnapshotType.ERROR,
+                serialize_value(server_or_error),
+            )
+        elif workspace_entry.upload_location_data:
+            repos = []
+            for repo_data in workspace_entry.upload_location_data.upload_repository_datas:
+                stored_snapshot = self._ensure_snapshot_uploaded(
+                    deployment_name,
+                    SnapshotType.REPOSITORY,
+                    repo_data.serialized_repository_data,
+                )
+                repos.append(
+                    DagsterCloudRepositoryManifest(
+                        name=repo_data.repository_name,
+                        code_pointer=repo_data.code_pointer,
+                        stored_snapshot=stored_snapshot,
+                    )
+                )
+
+            manifest = DagsterCloudCodeLocationManifest(
+                repositories=repos,
+                executable_path=workspace_entry.upload_location_data.executable_path,
+                container_image=workspace_entry.upload_location_data.container_image,
+                dagster_library_versions=workspace_entry.upload_location_data.dagster_library_versions,
+                code_location_deploy_data=workspace_entry.code_location_deploy_data,
+            )
+        else:
+            check.failed(
+                "Expected DagsterCloudUploadWorkspaceEntry to have either location data or error, had neither."
+            )
+
+        result = DagsterCloudCodeLocationUpdateResult(
+            location_name=workspace_entry.location_name,
+            error_snapshot=error_snap,
+            manifest=manifest,
+        )
+
+        res = self._instance.requests_managed_retries_session.put(
+            self._instance.dagster_cloud_code_location_update_result_url,
+            headers=self._instance.headers_for_deployment(deployment_name),
+            json=pack_value(result),
+            timeout=self._instance.dagster_cloud_api_timeout,
+            proxies=self._instance.dagster_cloud_api_proxies,
+        )
+        raise_http_error(res)
+        first_response = unpack_value(res.json(), DagsterCloudCodeLocationUpdateResponse)
+        if first_response.updated:
+            self._logger.info(
+                "Code location update result for"
+                f" {deployment_name}:{workspace_entry.location_name} - {first_response.message}"
+            )
+            return
+
+        missing = check.not_none(
+            first_response.missing_job_snapshots,
+            "Expected missing_job_snapshots when updated is false.",
+        )
+        server = check.inst(
+            server_or_error,
+            DagsterCloudGrpcServer,
+            "Server should not be in error state if there are missing snapshots.",
+        )
+        self._logger.info(f"Uploading {len(missing)} job snapshots.")
+        with ThreadPoolExecutor() as executor:
+            _ = list(
+                executor.map(
+                    lambda job_selector: self.upload_job_snap_direct(
+                        deployment_name,
+                        job_selector,
+                        server,
+                    ),
+                    missing,
+                )
+            )
+        res = self._instance.requests_managed_retries_session.put(
+            self._instance.dagster_cloud_code_location_update_result_url,
+            headers=self._instance.headers_for_deployment(deployment_name),
+            json=pack_value(result),
+            timeout=self._instance.dagster_cloud_api_timeout,
+            proxies=self._instance.dagster_cloud_api_proxies,
+        )
+        raise_http_error(res)
+        second_response = unpack_value(res.json(), DagsterCloudCodeLocationUpdateResponse)
+        if not second_response.updated:
+            if second_response.missing_job_snapshots:
+                # this condition is expected to be extremely unlikely
+                raise Exception(
+                    "Code location update failed, job definitions changed while uploading:"
+                    f" {second_response.missing_job_snapshots}"
+                )
+            else:
+                raise Exception(f"Code location update failed: {second_response.message}")
+
+        self._logger.info(
+            "Code location update result for"
+            f" {deployment_name}:{workspace_entry.location_name} - {second_response.message}"
+        )
+
+    def _update_workspace_entry_server_upload(
+        self,
+        deployment_name: str,
+        workspace_entry: DagsterCloudUploadWorkspaceEntry,
+        server_or_error: Union[DagsterCloudGrpcServer, SerializableErrorInfo],
+    ) -> None:
+        # legacy scheme, uploading definitions blobs to web server
         with tempfile.TemporaryDirectory() as temp_dir:
             dst = os.path.join(temp_dir, "workspace_entry.tmp")
             with open(dst, "wb") as f:
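
The direct-upload path added above is a content-addressed, three-step handshake: hash the serialized snapshot, ask the server whether a snapshot with that sha1 and size is already stored, and only if it is not, PUT the bytes to a presigned blob-storage URL and confirm the upload. A condensed sketch of the same flow, where session, the URLs, and the response shapes are stand-ins rather than the real dagster_cloud client surface:

    import hashlib

    def ensure_uploaded(session, check_url, confirm_url, snapshot_type, serialized: str):
        body = serialized.encode("utf-8")
        digest = hashlib.sha1(body).hexdigest()

        # 1) dedupe check: does the server already have this exact snapshot?
        resp = session.get(
            check_url, params={"type": snapshot_type, "sha1": digest, "size": len(body)}
        )
        resp.raise_for_status()
        result = resp.json()
        if result.get("stored_snapshot"):
            return result["stored_snapshot"]  # nothing to upload

        # 2) upload the bytes straight to blob storage via the presigned URL
        upload = result["upload_data"]
        session.put(upload["presigned_put_url"], data=body).raise_for_status()

        # 3) confirm, so the server records the snapshot and hands back its handle
        resp = session.put(confirm_url, json=upload)
        resp.raise_for_status()
        return resp.json()["stored_snapshot"]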
@@ -647,7 +863,16 @@
                 f" {deployment_name}:{workspace_entry.location_name} {response.message}"
             )

-    def _get_upload_location_data(
+    async def gen_list_repositories_response(
+        self,
+        client: DagsterGrpcClient,
+    ) -> "ListRepositoriesResponse":
+        return await gen_list_repositories_grpc(
+            client,
+            timeout=int(os.getenv("DAGSTER_CLOUD_LIST_REPOSITORIES_GRPC_TIMEOUT", "180")),
+        )
+
+    async def _get_upload_location_data(
         self,
         deployment_name: str,
         location_name: str,
@@ -656,30 +881,40 @@
         location_origin = self._get_code_location_origin(location_name)
         client = server.server_endpoint.create_client()

-        list_repositories_response = sync_list_repositories_grpc(client)
+        list_repositories_response = await self.gen_list_repositories_response(client)

-        upload_repo_datas: List[DagsterCloudUploadRepositoryData] = []
+        upload_repo_datas: list[DagsterCloudUploadRepositoryData] = []

         for (
             repository_name,
             code_pointer,
         ) in list_repositories_response.repository_code_pointer_dict.items():
-            external_repository_chunks = list(
-                client.streaming_external_repository(
-                    external_repository_origin=RemoteRepositoryOrigin(
+            if os.getenv("DAGSTER_CLOUD_USE_STREAMING_EXTERNAL_REPOSITORY"):
+                external_repository_chunks = [
+                    chunk
+                    async for chunk in client.gen_streaming_external_repository(
+                        remote_repository_origin=RemoteRepositoryOrigin(
+                            location_origin,
+                            repository_name,
+                        ),
+                        defer_snapshots=True,
+                    )
+                ]
+
+                serialized_repository_data = "".join(
+                    [
+                        chunk["serialized_external_repository_chunk"]
+                        for chunk in external_repository_chunks
+                    ]
+                )
+            else:
+                serialized_repository_data = await client.gen_external_repository(
+                    remote_repository_origin=RemoteRepositoryOrigin(
                         location_origin,
                         repository_name,
                     ),
-                    defer_snapshots=self._defer_job_snapshots,
+                    defer_snapshots=True,
                 )
-            )
-
-            serialized_repository_data = "".join(
-                [
-                    chunk["serialized_external_repository_chunk"]
-                    for chunk in external_repository_chunks
-                ]
-            )

             # Don't deserialize in case there are breaking changes - let the server do it
             upload_repo_datas.append(
@@ -729,7 +964,27 @@
             deployment_name, errored_workspace_entry, server_or_error=error_info
         )

-    def _update_location_data(
+    async def _try_update_location_data(
+        self,
+        deployment_name: str,
+        location_name: str,
+        server_or_error: Union[DagsterCloudGrpcServer, SerializableErrorInfo],
+        metadata: CodeLocationDeployData,
+    ):
+        try:
+            await self._update_location_data(
+                deployment_name,
+                location_name,
+                server_or_error,
+                metadata,
+            )
+        except Exception:
+            self._logger.error(
+                f"Error while writing location data for {deployment_name}:{location_name}:"
+                f" {serializable_error_info_from_exc_info(sys.exc_info())}"
+            )
+
+    async def _update_location_data(
         self,
         deployment_name: str,
         location_name: str,
@@ -756,7 +1011,7 @@
         loaded_workspace_entry = DagsterCloudUploadWorkspaceEntry(
             location_name=location_name,
             code_location_deploy_data=metadata,
-            upload_location_data=self._get_upload_location_data(
+            upload_location_data=await self._get_upload_location_data(
                 deployment_name,
                 location_name,
                 server_or_error,
@@ -789,7 +1044,7 @@

     def _get_existing_pex_servers(
         self, deployment_name: str, location_name: str
-    ) -> List[PexServerHandle]:
+    ) -> list[PexServerHandle]:
         server = self._multipex_servers.get((deployment_name, location_name))

         if not server:
@@ -870,6 +1125,11 @@
     ):
         deployment_name, location_name = to_update_key

+        attributes = {
+            "deployment": deployment_name,
+            "location": location_name,
+        }
+
         code_location_deploy_data = desired_entry.code_location_deploy_data
         pex_metadata = code_location_deploy_data.pex_metadata
         deployment_info = (
@@ -882,13 +1142,19 @@
             self._logger.info(
                 f"Waiting for new grpc server for {deployment_name}:{location_name} for {deployment_info} to be ready..."
             )
-            await self._wait_for_new_server_ready(
-                deployment_name,
-                location_name,
-                desired_entry,
-                server_or_error.server_handle,
-                server_or_error.server_endpoint,
-            )
+            with observe_execution(
+                opentelemetry=self.opentelemetry,
+                event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.user_code.server.process_wait",
+                short_description="waiting for new server process to be ready",
+                attributes=attributes,
+            ):
+                await self._wait_for_new_server_ready(
+                    deployment_name,
+                    location_name,
+                    desired_entry,
+                    server_or_error.server_handle,
+                    server_or_error.server_endpoint,
+                )
         except Exception:
             error_info = serializable_error_info_from_exc_info(sys.exc_info())
             self._logger.error(
@@ -898,21 +1164,18 @@
             server_or_error = error_info

         if should_upload:
-            try:
-                self._update_location_data(
+            with observe_execution(
+                opentelemetry=self.opentelemetry,
+                event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.user_code.upload",
+                short_description="uploading user code data",
+                attributes=attributes,
+            ):
+                await self._try_update_location_data(
                     deployment_name,
                     location_name,
                     server_or_error,
                     desired_entry.code_location_deploy_data,
                 )
-            except Exception:
-                # If there was a failure uploading snapshots, log it but don't block other code locations
-                # from updating (and still use the new server to serve new requests)
-                error_info = serializable_error_info_from_exc_info(sys.exc_info())
-                self._logger.error(
-                    f"Error while writing location data for {deployment_name}:{location_name}:"
-                    f" {error_info}"
-                )

         # Once we've verified that the new server has uploaded its data successfully, swap in
         # the server to start serving new requests
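
Several hunks in this file wrap lifecycle steps (waiting for a new server process, uploading user code data, starting multipex and pex servers) in observe_execution, imported from the new dagster_cloud/opentelemetry/observers/execution_observer.py module (+178 lines in the file table above). Its implementation is not shown in this section; a context manager of roughly this shape can time a block and record success or failure under an event key (an illustrative pattern, not the actual implementation):

    import time
    from contextlib import contextmanager

    @contextmanager
    def observe(event_key: str, attributes: dict):
        start = time.monotonic()
        outcome = "success"
        try:
            yield
        except Exception:
            outcome = "error"
            raise
        finally:
            # the real module would presumably emit OpenTelemetry metrics/spans here
            elapsed = time.monotonic() - start
            print(f"{event_key} {outcome} in {elapsed:.3f}s {attributes}")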
@@ -986,7 +1249,7 @@
             self._pending_delete_grpc_server_handles.discard(server_handle)

     def _cleanup_servers(
-        self, active_agent_ids: Optional[Set[str]], include_own_servers: bool
+        self, active_agent_ids: Optional[set[str]], include_own_servers: bool
     ) -> None:
         """Remove all servers, across all deployments and locations."""
         with ThreadPoolExecutor() as executor:
@@ -1009,7 +1272,7 @@
                     self._logger.exception("Error cleaning up server")

     @abstractmethod
-    def _list_server_handles(self) -> List[ServerHandle]:
+    def _list_server_handles(self) -> list[ServerHandle]:
         """Return a list of all server handles across all deployments and locations."""

     @abstractmethod
@@ -1021,7 +1284,7 @@
         """Returns the update_timestamp value from the given code server."""

     def _can_cleanup_server(
-        self, handle: ServerHandle, active_agent_ids: Optional[Set[str]], include_own_servers: bool
+        self, handle: ServerHandle, active_agent_ids: Optional[set[str]], include_own_servers: bool
     ) -> bool:
         """Returns true if we can clean up the server identified by the handle without issues (server was started by this agent, or agent is no longer active)."""
         agent_id_for_server = self.get_agent_id_for_server(handle)
@@ -1057,7 +1320,7 @@
             return False

         return (active_agent_ids is not None) and (
-            agent_id_for_server not in cast(Set[str], active_agent_ids)
+            agent_id_for_server not in cast("set[str]", active_agent_ids)
         )

     def _graceful_cleanup_servers(self, include_own_servers: bool):  # ServerHandles
@@ -1066,7 +1329,7 @@
             return self._cleanup_servers(active_agent_ids, include_own_servers=include_own_servers)

         handles = self._list_server_handles()
-        servers_to_remove: List[ServerHandle] = []
+        servers_to_remove: list[ServerHandle] = []
         with self._grpc_servers_lock:
             for handle in handles:
                 if self._can_cleanup_server(
@@ -1090,7 +1353,7 @@
             self._reconcile_grpc_metadata_thread.join()

         if self._run_worker_monitoring_thread:
-            self._run_worker_monitoring_thread_shutdown_event.set()
+            self._run_worker_monitoring_thread_shutdown_event.set()  # pyright: ignore[reportOptionalMemberAccess]
             self._run_worker_monitoring_thread.join()

         if self._reconcile_location_utilization_metrics_thread:
@@ -1100,10 +1363,10 @@
         if self._started:
             self._graceful_cleanup_servers(include_own_servers=True)

-        super().__exit__(exception_value, exception_value, traceback)
+        super().__exit__(exception_value, exception_value, traceback)  # pyright: ignore[reportAbstractUsage]

     def add_upload_metadata(
-        self, upload_metadata: Dict[DeploymentAndLocation, UserCodeLauncherEntry]
+        self, upload_metadata: dict[DeploymentAndLocation, UserCodeLauncherEntry]
     ):
         """Add a set of locations to be uploaded in the next reconcilation loop."""
         with self._metadata_lock:
@@ -1112,7 +1375,7 @@

     def update_grpc_metadata(
         self,
-        desired_metadata: Dict[DeploymentAndLocation, UserCodeLauncherEntry],
+        desired_metadata: dict[DeploymentAndLocation, UserCodeLauncherEntry],
     ):
         check.dict_param(
             desired_metadata,
@@ -1152,7 +1415,7 @@
         endpoints_or_errors = self.get_grpc_endpoints()
         for (deployment_name, location_name), endpoint_or_error in endpoints_or_errors.items():
             if isinstance(endpoint_or_error, ServerEndpoint):
-                endpoint = cast(ServerEndpoint, endpoint_or_error)
+                endpoint = cast("ServerEndpoint", endpoint_or_error)
                 raw_metrics_str = (
                     endpoint.create_client().ping("").get("serialized_server_utilization_metrics")
                 )
@@ -1217,7 +1480,7 @@
             try:
                 self._graceful_cleanup_servers(include_own_servers=False)
             except:
-                self._logger.exception("Failed to clean up dangling code serverrs.")
+                self._logger.exception("Failed to clean up dangling code servers.")
             self._last_cleaned_up_dangling_code_servers = now

         if now - self._last_refreshed_actual_entries > ACTUAL_ENTRIES_REFRESH_INTERVAL:
@@ -1227,6 +1490,8 @@
                 self._logger.exception("Failed to refresh actual entries.")
             self._last_refreshed_actual_entries = now

+        self._in_progress_reconcile_start_time = time.time()
+
         self._reconcile(
             desired_entries,
             upload_locations,
@@ -1243,6 +1508,7 @@
                 f"Started polling for requests from {self._instance.dagster_cloud_url}"
             )

+        self._in_progress_reconcile_start_time = None
         self._reconcile_count += 1

     def _update_metrics_thread(self, shutdown_event):
@@ -1267,19 +1533,39 @@
         # thread-safe since reconcile_count is an integer
         return self._reconcile_count > 0

-    def _make_check_on_running_server_endpoint(self, server_endpoint: ServerEndpoint):
-        # Ensure that server_endpoint is bound correctly
-        return lambda: server_endpoint.create_client().ping("")
+    @property
+    def in_progress_reconcile_start_time(self) -> Optional[float]:
+        return self._in_progress_reconcile_start_time
+
+    def _make_check_on_running_server_endpoint(
+        self, server_endpoint: ServerEndpoint
+    ) -> Callable[[], Union[ListRepositoriesResponse, SerializableErrorInfo]]:
+        return lambda: deserialize_value(
+            server_endpoint.create_client().list_repositories(),
+            (ListRepositoriesResponse, SerializableErrorInfo),
+        )
+
+    def _trigger_recovery_server_restart(self, deployment_location: DeploymentAndLocation):
+        del self._actual_entries[deployment_location]
+
+        if deployment_location in self._first_unavailable_times:
+            del self._first_unavailable_times[deployment_location]
+
+        # redeploy the multipex server in this case as well to ensure a fresh start
+        # if it resource contrained (and ensure that we don't try to create the same
+        # PexServerHandle again and delete the code location in a loop)
+        if deployment_location in self._multipex_servers:
+            del self._multipex_servers[deployment_location]

     def _refresh_actual_entries(self) -> None:
-        for deployment_location, server in self._multipex_servers.items():
+        for deployment_location, multipex_server in self._multipex_servers.items():
             if deployment_location in self._actual_entries:
                 # If a multipex server exists, we query it over gRPC
                 # to make sure the pex server is still available.

                 # First verify that the multipex server is running
                 try:
-                    server.server_endpoint.create_multipex_client().ping("")
+                    multipex_server.server_endpoint.create_multipex_client().ping("")
                 except:
                     # If it isn't, this is expected if ECS is currently spinning up this service
                     # after it crashed. In this case, we want to wait for it to fully come up
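
_trigger_recovery_server_restart leans on the launcher's desired-versus-actual reconciliation: deleting the entry from _actual_entries (and forgetting the cached multipex server) makes the next reconcile diff see the location as missing, so a fresh code server is deployed. A toy illustration of that mechanism:

    desired = {("prod", "etl"): "v2"}
    actual = {("prod", "etl"): "v2"}  # reconciled: nothing to do

    def trigger_recovery(key):
        actual.pop(key, None)  # forget the (broken) running server

    trigger_recovery(("prod", "etl"))
    to_add = desired.keys() - actual.keys()
    assert to_add == {("prod", "etl")}  # next reconcile redeploys the location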
@@ -1291,14 +1577,20 @@
                     )
                     return
                 deployment_name, location_name = deployment_location
+
+                # If we expect there to be a running code location here but there is none,
                 if not self._get_existing_pex_servers(deployment_name, location_name):
-                    self._logger.warning(
-                        "Pex servers disappeared for %s:%s. Removing actual entries to"
-                        " activate reconciliation logic.",
-                        deployment_name,
-                        location_name,
-                    )
-                    del self._actual_entries[deployment_location]
+                    with self._grpc_servers_lock:
+                        grpc_server_or_error = self._grpc_servers.get(deployment_location)
+
+                    if isinstance(grpc_server_or_error, DagsterCloudGrpcServer):
+                        self._logger.warning(
+                            "Pex servers disappeared for running code location %s:%s. Removing actual entries to"
+                            " activate reconciliation logic and deploy a new code server and multipex server.",
+                            deployment_name,
+                            location_name,
+                        )
+                        self._trigger_recovery_server_restart(deployment_location)

         # Check to see if any servers have become unresponsive
         unavailable_server_timeout = int(
@@ -1332,8 +1624,6 @@
         ) as executor:
             futures = {}
             for deployment_location, endpoint_or_error in running_locations.items():
-                deployment_name, location_name = deployment_location
-
                 futures[
                     executor.submit(self._make_check_on_running_server_endpoint(endpoint_or_error))
                 ] = deployment_location
@@ -1343,15 +1633,22 @@

                 deployment_name, location_name = deployment_location
                 try:
-                    future.result()
-
+                    response_or_error = future.result()
                     # Successful ping resets the tracked last unavailable time for this code server, if set
                     self._first_unavailable_times.pop(deployment_location, None)
+                    if isinstance(response_or_error, SerializableErrorInfo):
+                        # This can happen if the server was previously healthy but restarted
+                        # and moved into an error state - attempt to recover
+                        self._logger.exception(
+                            f"Code server for {deployment_name}:{location_name} unexpectedly moved into an error state. Deploying a new code server. Observed error: \n{response_or_error.to_string()}"
+                        )
+                        self._trigger_recovery_server_restart(deployment_location)
                 except Exception as e:
                     if (
                         isinstance(e, DagsterUserCodeUnreachableError)
                         and isinstance(e.__cause__, grpc.RpcError)
-                        and cast(grpc.RpcError, e.__cause__).code() == grpc.StatusCode.UNAVAILABLE
+                        and cast("grpc.RpcError", e.__cause__).code()
+                        in {grpc.StatusCode.UNAVAILABLE, grpc.StatusCode.UNKNOWN}
                     ):
                         first_unavailable_time = self._first_unavailable_times.get(
                             deployment_location
@@ -1369,8 +1666,8 @@
                             self._logger.warning(
                                 f"Code server for {deployment_name}:{location_name} has been unresponsive for more than {unavailable_server_timeout} seconds. Deploying a new code server."
                             )
-                            del self._actual_entries[deployment_location]
-                            del self._first_unavailable_times[deployment_location]
+                            self._trigger_recovery_server_restart(deployment_location)
+
                     else:
                         self._logger.exception(
                             f"Code server for {deployment_name}:{location_name} health check failed, but the error did not indicate that the server was unavailable."
@@ -1406,15 +1703,15 @@

     def _deployments_and_locations_to_string(
         self,
-        deployments_and_locations: Set[DeploymentAndLocation],
-        entries: Dict[DeploymentAndLocation, UserCodeLauncherEntry],
+        deployments_and_locations: set[DeploymentAndLocation],
+        entries: dict[DeploymentAndLocation, UserCodeLauncherEntry],
     ):
         return (
             "{"
             + ", ".join(
                 sorted(
                     [
-                        f"({dep}, {loc}, {entries[(dep,loc)].update_timestamp})"
+                        f"({dep}, {loc}, {entries[(dep, loc)].update_timestamp})"
                         for dep, loc in deployments_and_locations
                     ]
                 )
@@ -1431,8 +1728,8 @@

     def _reconcile(
         self,
-        desired_entries: Dict[DeploymentAndLocation, UserCodeLauncherEntry],
-        upload_locations: Set[DeploymentAndLocation],
+        desired_entries: dict[DeploymentAndLocation, UserCodeLauncherEntry],
+        upload_locations: set[DeploymentAndLocation],
         check_on_pending_delete_servers: bool,
     ):
         if check_on_pending_delete_servers:
@@ -1443,7 +1740,11 @@
             for handle in handles:
                 self._graceful_remove_server_handle(handle)

-        diff = diff_serializable_namedtuple_map(desired_entries, self._actual_entries)
+        diff = diff_serializable_namedtuple_map(
+            desired_entries,
+            self._actual_entries,
+            update_key_fn=lambda entry: entry.update_timestamp,
+        )
         has_changes = diff.to_add or diff.to_update or diff.to_remove or upload_locations

         if not has_changes:
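
Passing update_key_fn makes the desired-versus-actual diff compare entries by their update_timestamp, so bumping a location's timestamp is what marks it for update. A toy stand-in for the add/update/remove split that diff_serializable_namedtuple_map computes (the real helper lives in dagster_cloud/util):

    def diff_map(desired: dict, actual: dict, update_key_fn):
        to_add = desired.keys() - actual.keys()
        to_remove = actual.keys() - desired.keys()
        to_update = {
            key
            for key in desired.keys() & actual.keys()
            if update_key_fn(desired[key]) != update_key_fn(actual[key])
        }
        return to_add, to_update, to_remove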
@@ -1469,26 +1770,26 @@
         to_update_keys = diff.to_add.union(diff.to_update)

         # Handles for all running standalone Dagster GRPC servers
-        existing_standalone_dagster_server_handles: Dict[
+        existing_standalone_dagster_server_handles: dict[
             DeploymentAndLocation, Collection[ServerHandle]
         ] = {}

         # Handles for all running Dagster multipex servers (which can each host multiple grpc subprocesses)
-        existing_multipex_server_handles: Dict[DeploymentAndLocation, Collection[ServerHandle]] = {}
+        existing_multipex_server_handles: dict[DeploymentAndLocation, Collection[ServerHandle]] = {}

         # For each location, all currently running pex servers on the current multipex server
-        existing_pex_server_handles: Dict[DeploymentAndLocation, List[PexServerHandle]] = {}
+        existing_pex_server_handles: dict[DeploymentAndLocation, list[PexServerHandle]] = {}

         # Dagster grpc servers created in this loop (including both standalone grpc servers
         # and pex servers on a multipex server) - or an error that explains why it couldn't load
-        new_dagster_servers: Dict[
+        new_dagster_servers: dict[
             DeploymentAndLocation, Union[DagsterCloudGrpcServer, SerializableErrorInfo]
         ] = {}

         # Multipex servers created in this loop (a new multipex server might not always
         # be created on each loop even if the code has changed, as long as the base image
         # is the same)
-        new_multipex_servers: Dict[DeploymentAndLocation, DagsterCloudGrpcServer] = {}
+        new_multipex_servers: dict[DeploymentAndLocation, DagsterCloudGrpcServer] = {}

         for to_update_key in to_update_keys:
             deployment_name, location_name = to_update_key
@@ -1534,16 +1835,29 @@
                     )
                     # confirm it's a valid image since _start_new_server_spinup will launch a container
                     self._check_for_image(desired_entry.code_location_deploy_data)
-                    multipex_server = self._start_new_server_spinup(
-                        deployment_name, location_name, desired_entry
-                    )
-                    self._multipex_servers[to_update_key] = multipex_server
-                    assert self._get_multipex_server(
-                        deployment_name,
-                        location_name,
-                        desired_entry.code_location_deploy_data,
-                    )
-                    new_multipex_servers[to_update_key] = multipex_server
+
+                    attributes = {
+                        "deployment": deployment_name,
+                        "location": location_name,
+                        "image": desired_entry.code_location_deploy_data.image,
+                        "python_version": desired_python_version,
+                    }
+                    with observe_execution(
+                        opentelemetry=self.opentelemetry,
+                        event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.user_code.multipex_server.start",
+                        short_description="starting new multipex server",
+                        attributes=attributes,
+                    ):
+                        multipex_server = self._start_new_server_spinup(
+                            deployment_name, location_name, desired_entry
+                        )
+                        self._multipex_servers[to_update_key] = multipex_server
+                        assert self._get_multipex_server(
+                            deployment_name,
+                            location_name,
+                            desired_entry.code_location_deploy_data,
+                        )
+                        new_multipex_servers[to_update_key] = multipex_server
                 else:
                     self._logger.info(
                         f"Found running multipex server for {multipex_server_repr}"
@@ -1569,8 +1883,7 @@
             deployment_name, location_name = to_update_key

             self._logger.info(
-                f"Waiting for new multipex server for {deployment_name}:{location_name} to be"
-                " ready"
+                f"Waiting for new multipex server for {deployment_name}:{location_name} to be ready"
             )
             tasks[to_update_key] = self._wait_for_new_multipex_server(
                 deployment_name,
@@ -1681,6 +1994,7 @@

         for server_handle in server_handles:
             try:
+                # TODO - telemetry of removing standalone servers
                 self._graceful_remove_server_handle(server_handle)
             except Exception:
                 self._logger.error(
@@ -1711,6 +2025,7 @@
             )

             try:
+                # TODO - telemetry of removing multipex server
                 self._graceful_remove_server_handle(multipex_server_handle)
             except Exception:
                 self._logger.error(
@@ -1728,6 +2043,7 @@
             )
             for pex_server_handle in pex_server_handles:
                 try:
+                    # TODO - telemetry of removing pex server
                     self._remove_pex_server_handle(
                         deployment_name,
                         location_name,
@@ -1752,6 +2068,7 @@
         for to_remove_key in diff.to_remove:
             deployment_name, location_name = to_remove_key
             try:
+                # TODO - telemetry of removing location's server
                 self._remove_server(deployment_name, location_name)
             except Exception:
                 self._logger.error(
@@ -1763,25 +2080,27 @@
                 del self._grpc_servers[to_remove_key]
                 del self._actual_entries[to_remove_key]

+            if to_remove_key in self._multipex_servers:
+                del self._multipex_servers[to_remove_key]
+
         # Upload any locations that were requested to be uploaded, but weren't updated
         # as part of this reconciliation loop
+
+        tasks = {}
         for location in upload_locations:
             with self._grpc_servers_lock:
                 server_or_error = self._grpc_servers[location]

             deployment_name, location_name = location
-            try:
-                self._update_location_data(
-                    deployment_name,
-                    location_name,
-                    server_or_error,
-                    self._actual_entries[location].code_location_deploy_data,
-                )
-            except Exception:
-                self._logger.error(
-                    f"Error while writing location data for {deployment_name}:{location_name}:"
-                    f" {serializable_error_info_from_exc_info(sys.exc_info())}"
-                )
+            tasks[location] = self._try_update_location_data(
+                deployment_name,
+                location_name,
+                server_or_error,
+                self._actual_entries[location].code_location_deploy_data,
+            )
+
+        if tasks:
+            results = asyncio.run(self._gather_tasks(tasks.values()))

         seconds = time.time() - start_time
         self._logger.info(f"Finished reconciling in {seconds} seconds.")
@@ -1854,6 +2173,11 @@
     def _start_new_dagster_server(
         self, deployment_name: str, location_name: str, desired_entry: UserCodeLauncherEntry
     ) -> DagsterCloudGrpcServer:
+        attributes = {
+            "deployment": deployment_name,
+            "location": location_name,
+        }
+
         if desired_entry.code_location_deploy_data.pex_metadata:
             multipex_server = self._get_multipex_server(
                 deployment_name, location_name, desired_entry.code_location_deploy_data
@@ -1861,26 +2185,45 @@
             assert multipex_server  # should have been started earlier or we should never reach here

-            self._create_pex_server(deployment_name, location_name, desired_entry, multipex_server)
+            if desired_entry.code_location_deploy_data.pex_metadata.python_version:
+                attributes["python_version"] = (
+                    desired_entry.code_location_deploy_data.pex_metadata.python_version
+                )

-            server_handle = multipex_server.server_handle
-            multipex_endpoint = multipex_server.server_endpoint
+            with observe_execution(
+                opentelemetry=self.opentelemetry,
+                event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.user_code.pex_server.start",
+                short_description="starting new pex server",
+                attributes=attributes,
+            ):
+                self._create_pex_server(
+                    deployment_name, location_name, desired_entry, multipex_server
+                )

-            # start a new pex server on the multipexer, which we can count on already existing
-            return DagsterCloudGrpcServer(
-                server_handle,
-                multipex_endpoint.with_metadata(
-                    [
-                        ("has_pex", "1"),
-                        ("deployment", deployment_name),
-                        ("location", location_name),
-                        ("timestamp", str(int(desired_entry.update_timestamp))),
-                    ],
-                ),
-                desired_entry.code_location_deploy_data,
-            )
+                server_handle = multipex_server.server_handle
+                multipex_endpoint = multipex_server.server_endpoint
+
+                # start a new pex server on the multipexer, which we can count on already existing
+                return DagsterCloudGrpcServer(
+                    server_handle,
+                    multipex_endpoint.with_metadata(
+                        [
+                            ("has_pex", "1"),
+                            ("deployment", deployment_name),
+                            ("location", location_name),
+                            ("timestamp", str(int(desired_entry.update_timestamp))),
+                        ],
+                    ),
+                    desired_entry.code_location_deploy_data,
+                )
         else:
-            return self._start_new_server_spinup(deployment_name, location_name, desired_entry)
+            with observe_execution(
+                opentelemetry=self.opentelemetry,
+                event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.user_code.code_server.start",
+                short_description="new code server spin up",
+                attributes=attributes,
+            ):
+                return self._start_new_server_spinup(deployment_name, location_name, desired_entry)

     def get_grpc_endpoint(
         self,
@@ -1924,12 +2267,12 @@

         return server

-    def get_grpc_server_heartbeats(self) -> Dict[str, List[CloudCodeServerHeartbeat]]:
+    def get_grpc_server_heartbeats(self) -> dict[str, list[CloudCodeServerHeartbeat]]:
         endpoint_or_errors = self.get_grpc_endpoints()
         with self._metadata_lock:
             desired_entries = set(self._desired_entries.keys())

-        heartbeats: Dict[str, List[CloudCodeServerHeartbeat]] = {}
+        heartbeats: dict[str, list[CloudCodeServerHeartbeat]] = {}
         for entry_key in desired_entries:
             deployment_name, location_name = entry_key
             endpoint_or_error = endpoint_or_errors.get(entry_key)
@@ -1979,7 +2322,7 @@

     def get_grpc_endpoints(
         self,
-    ) -> Dict[DeploymentAndLocation, Union[ServerEndpoint, SerializableErrorInfo]]:
+    ) -> dict[DeploymentAndLocation, Union[ServerEndpoint, SerializableErrorInfo]]:
         with self._grpc_servers_lock:
             return {
                 key: val if isinstance(val, SerializableErrorInfo) else val.server_endpoint
@@ -2011,7 +2354,7 @@
             client, timeout, additional_check, get_timeout_debug_info=get_timeout_debug_info
         )
         # Call a method that raises an exception if there was an error importing the code
-        sync_list_repositories_grpc(client)
+        await self.gen_list_repositories_response(client)

     async def _wait_for_server_process(
         self,
@@ -2070,6 +2413,7 @@
         response = client.external_job(
             RemoteRepositoryOrigin(location_origin, job_selector.repository_name),
             job_selector.job_name,
+            timeout=int(os.getenv("DAGSTER_CLOUD_EXTERNAL_JOB_GRPC_TIMEOUT", "180")),
         )
         if not response.serialized_job_data:
             error = (
@@ -2098,3 +2442,39 @@
             f" {job_selector.job_name}@{job_selector.repository_name} ({os.path.getsize(dst)} bytes)"
         )
         return response
+
+    @property
+    def opentelemetry(self) -> Optional[OpenTelemetryController]:
+        if not self.has_instance:
+            return None
+        else:
+            return self._instance.opentelemetry
+
+    def upload_job_snap_direct(
+        self,
+        deployment_name: str,
+        job_selector: JobSelector,
+        server: DagsterCloudGrpcServer,
+    ):
+        client = server.server_endpoint.create_client()
+        location_origin = self._get_code_location_origin(job_selector.location_name)
+        response = client.external_job(
+            RemoteRepositoryOrigin(location_origin, job_selector.repository_name),
+            job_selector.job_name,
+        )
+        if not response.serialized_job_data:
+            error = (
+                deserialize_value(response.serialized_error, SerializableErrorInfo)
+                if response.serialized_error
+                else "no captured error"
+            )
+            raise Exception(f"Error fetching job data in code server:\n{error}")
+
+        job_snapshot = extract_serialized_job_snap_from_serialized_job_data_snap(
+            response.serialized_job_data
+        )
+        return self._ensure_snapshot_uploaded(
+            deployment_name,
+            SnapshotType.JOB,
+            job_snapshot,
+        )