dagster-cloud 1.8.2__py3-none-any.whl → 1.12.6__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. The information is provided for informational purposes only.
- dagster_cloud/__init__.py +3 -3
- dagster_cloud/agent/__init__.py +4 -4
- dagster_cloud/agent/cli/__init__.py +56 -17
- dagster_cloud/agent/dagster_cloud_agent.py +360 -172
- dagster_cloud/agent/instrumentation/__init__.py +0 -0
- dagster_cloud/agent/instrumentation/constants.py +2 -0
- dagster_cloud/agent/instrumentation/run_launch.py +23 -0
- dagster_cloud/agent/instrumentation/schedule.py +34 -0
- dagster_cloud/agent/instrumentation/sensor.py +34 -0
- dagster_cloud/anomaly_detection/__init__.py +2 -2
- dagster_cloud/anomaly_detection/defs.py +17 -12
- dagster_cloud/anomaly_detection/types.py +3 -3
- dagster_cloud/api/dagster_cloud_api.py +209 -293
- dagster_cloud/auth/constants.py +21 -5
- dagster_cloud/batching/__init__.py +1 -0
- dagster_cloud/batching/batcher.py +210 -0
- dagster_cloud/dagster_insights/__init__.py +12 -6
- dagster_cloud/dagster_insights/bigquery/bigquery_utils.py +3 -2
- dagster_cloud/dagster_insights/bigquery/dbt_wrapper.py +39 -12
- dagster_cloud/dagster_insights/bigquery/insights_bigquery_resource.py +8 -6
- dagster_cloud/dagster_insights/insights_utils.py +18 -8
- dagster_cloud/dagster_insights/metrics_utils.py +12 -12
- dagster_cloud/dagster_insights/snowflake/dagster_snowflake_insights.py +5 -12
- dagster_cloud/dagster_insights/snowflake/dbt_wrapper.py +34 -8
- dagster_cloud/dagster_insights/snowflake/definitions.py +38 -12
- dagster_cloud/dagster_insights/snowflake/insights_snowflake_resource.py +11 -23
- dagster_cloud/definitions/__init__.py +0 -0
- dagster_cloud/definitions/job_selection.py +36 -0
- dagster_cloud/execution/cloud_run_launcher/k8s.py +1 -1
- dagster_cloud/execution/cloud_run_launcher/process.py +3 -3
- dagster_cloud/execution/monitoring/__init__.py +27 -33
- dagster_cloud/execution/utils/process.py +3 -3
- dagster_cloud/instance/__init__.py +125 -38
- dagster_cloud/instrumentation/__init__.py +32 -0
- dagster_cloud/metadata/source_code.py +13 -8
- dagster_cloud/metrics/__init__.py +0 -0
- dagster_cloud/metrics/tracer.py +59 -0
- dagster_cloud/opentelemetry/__init__.py +0 -0
- dagster_cloud/opentelemetry/config/__init__.py +73 -0
- dagster_cloud/opentelemetry/config/exporter.py +81 -0
- dagster_cloud/opentelemetry/config/log_record_processor.py +40 -0
- dagster_cloud/opentelemetry/config/logging_handler.py +14 -0
- dagster_cloud/opentelemetry/config/meter_provider.py +9 -0
- dagster_cloud/opentelemetry/config/metric_reader.py +39 -0
- dagster_cloud/opentelemetry/controller.py +319 -0
- dagster_cloud/opentelemetry/enum.py +58 -0
- dagster_cloud/opentelemetry/factories/__init__.py +1 -0
- dagster_cloud/opentelemetry/factories/logs.py +113 -0
- dagster_cloud/opentelemetry/factories/metrics.py +121 -0
- dagster_cloud/opentelemetry/metrics/__init__.py +0 -0
- dagster_cloud/opentelemetry/metrics/meter.py +140 -0
- dagster_cloud/opentelemetry/observers/__init__.py +0 -0
- dagster_cloud/opentelemetry/observers/dagster_exception_handler.py +40 -0
- dagster_cloud/opentelemetry/observers/execution_observer.py +178 -0
- dagster_cloud/pex/grpc/__generated__/multi_pex_api_pb2.pyi +175 -0
- dagster_cloud/pex/grpc/__init__.py +2 -2
- dagster_cloud/pex/grpc/client.py +4 -4
- dagster_cloud/pex/grpc/compile.py +2 -2
- dagster_cloud/pex/grpc/server/__init__.py +2 -2
- dagster_cloud/pex/grpc/server/cli/__init__.py +31 -19
- dagster_cloud/pex/grpc/server/manager.py +60 -42
- dagster_cloud/pex/grpc/server/registry.py +28 -21
- dagster_cloud/pex/grpc/server/server.py +23 -14
- dagster_cloud/pex/grpc/types.py +5 -5
- dagster_cloud/py.typed +0 -0
- dagster_cloud/secrets/__init__.py +1 -1
- dagster_cloud/secrets/loader.py +3 -3
- dagster_cloud/serverless/__init__.py +1 -1
- dagster_cloud/serverless/io_manager.py +36 -53
- dagster_cloud/storage/client.py +54 -17
- dagster_cloud/storage/compute_logs/__init__.py +3 -1
- dagster_cloud/storage/compute_logs/compute_log_manager.py +22 -17
- dagster_cloud/storage/defs_state/__init__.py +3 -0
- dagster_cloud/storage/defs_state/queries.py +15 -0
- dagster_cloud/storage/defs_state/storage.py +113 -0
- dagster_cloud/storage/event_logs/__init__.py +3 -1
- dagster_cloud/storage/event_logs/queries.py +102 -4
- dagster_cloud/storage/event_logs/storage.py +266 -73
- dagster_cloud/storage/event_logs/utils.py +88 -7
- dagster_cloud/storage/runs/__init__.py +1 -1
- dagster_cloud/storage/runs/queries.py +17 -2
- dagster_cloud/storage/runs/storage.py +88 -42
- dagster_cloud/storage/schedules/__init__.py +1 -1
- dagster_cloud/storage/schedules/storage.py +6 -8
- dagster_cloud/storage/tags.py +66 -1
- dagster_cloud/util/__init__.py +10 -12
- dagster_cloud/util/errors.py +49 -64
- dagster_cloud/version.py +1 -1
- dagster_cloud/workspace/config_schema/__init__.py +55 -13
- dagster_cloud/workspace/docker/__init__.py +76 -25
- dagster_cloud/workspace/docker/utils.py +1 -1
- dagster_cloud/workspace/ecs/__init__.py +1 -1
- dagster_cloud/workspace/ecs/client.py +51 -33
- dagster_cloud/workspace/ecs/launcher.py +76 -22
- dagster_cloud/workspace/ecs/run_launcher.py +3 -3
- dagster_cloud/workspace/ecs/utils.py +14 -5
- dagster_cloud/workspace/kubernetes/__init__.py +1 -1
- dagster_cloud/workspace/kubernetes/launcher.py +61 -29
- dagster_cloud/workspace/kubernetes/utils.py +34 -22
- dagster_cloud/workspace/user_code_launcher/__init__.py +5 -3
- dagster_cloud/workspace/user_code_launcher/process.py +16 -14
- dagster_cloud/workspace/user_code_launcher/user_code_launcher.py +552 -172
- dagster_cloud/workspace/user_code_launcher/utils.py +105 -1
- {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/METADATA +48 -42
- dagster_cloud-1.12.6.dist-info/RECORD +134 -0
- {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/WHEEL +1 -1
- dagster_cloud-1.8.2.dist-info/RECORD +0 -100
- {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/top_level.txt +0 -0
`dagster_cloud/workspace/user_code_launcher/user_code_launcher.py`:

```diff
@@ -2,6 +2,7 @@
 # ruff: noqa: PLE1205
 import asyncio
 import functools
+import hashlib
 import json
 import logging
 import os
@@ -11,57 +12,66 @@ import threading
 import time
 import zlib
 from abc import abstractmethod, abstractproperty
+from collections import defaultdict
+from collections.abc import Collection, Mapping, Sequence
 from concurrent.futures import ThreadPoolExecutor, as_completed, wait
 from contextlib import AbstractContextManager
-from
-
-    Callable,
-    Collection,
-    DefaultDict,
-    Dict,
-    Generic,
-    List,
-    Mapping,
-    NamedTuple,
-    Optional,
-    Sequence,
-    Set,
-    Tuple,
-    TypeVar,
-    Union,
-    cast,
-)
+from io import BytesIO
+from typing import Any, Callable, Generic, NamedTuple, Optional, TypeVar, Union, cast
 
 import dagster._check as check
 import grpc
 from dagster import BoolSource, Field, IntSource
-from dagster._api.list_repositories import
+from dagster._api.list_repositories import gen_list_repositories_grpc
 from dagster._core.definitions.selector import JobSelector
 from dagster._core.errors import DagsterUserCodeUnreachableError
 from dagster._core.instance import MayHaveInstanceWeakref
 from dagster._core.launcher import RunLauncher
-from dagster._core.
-from dagster._core.remote_representation.origin import (
+from dagster._core.remote_origin import (
     CodeLocationOrigin,
     RegisteredCodeLocationOrigin,
+    RemoteRepositoryOrigin,
+)
+from dagster._core.remote_representation.external_data import (
+    extract_serialized_job_snap_from_serialized_job_data_snap,
 )
 from dagster._grpc.client import DagsterGrpcClient
-from dagster._grpc.types import GetCurrentImageResult
-from dagster._serdes import
+from dagster._grpc.types import GetCurrentImageResult, ListRepositoriesResponse
+from dagster._serdes import (
+    deserialize_value,
+    pack_value,
+    serialize_value,
+    unpack_value,
+    whitelist_for_serdes,
+)
 from dagster._time import get_current_timestamp
-from dagster._utils.error import
+from dagster._utils.error import (
+    SerializableErrorInfo,
+    serializable_error_info_from_exc_info,
+    truncate_serialized_error,
+)
 from dagster._utils.merger import merge_dicts
 from dagster._utils.typed_dict import init_optional_typeddict
 from dagster_cloud_cli.core.errors import raise_http_error
 from dagster_cloud_cli.core.workspace import CodeLocationDeployData
 from typing_extensions import Self, TypeAlias
 
+from dagster_cloud.agent.instrumentation.constants import DAGSTER_CLOUD_AGENT_METRIC_PREFIX
 from dagster_cloud.agent.queries import GET_AGENTS_QUERY
 from dagster_cloud.api.dagster_cloud_api import (
+    CheckSnapshotResult,
+    ConfirmUploadResult,
+    DagsterCloudCodeLocationManifest,
+    DagsterCloudCodeLocationUpdateResponse,
+    DagsterCloudCodeLocationUpdateResult,
+    DagsterCloudRepositoryManifest,
     DagsterCloudUploadLocationData,
     DagsterCloudUploadRepositoryData,
     DagsterCloudUploadWorkspaceEntry,
     DagsterCloudUploadWorkspaceResponse,
+    FileFormat,
+    SnapshotType,
+    StoredSnapshot,
     UserCodeDeploymentType,
 )
 from dagster_cloud.execution.monitoring import (
@@ -75,6 +85,8 @@ from dagster_cloud.execution.monitoring import (
     start_run_worker_monitoring_thread,
 )
 from dagster_cloud.instance import DagsterCloudAgentInstance
+from dagster_cloud.opentelemetry.controller import OpenTelemetryController
+from dagster_cloud.opentelemetry.observers.execution_observer import observe_execution
 from dagster_cloud.pex.grpc.client import MultiPexGrpcClient
 from dagster_cloud.pex.grpc.types import (
     CreatePexServerArgs,
@@ -83,7 +95,6 @@ from dagster_cloud.pex.grpc.types import (
     ShutdownPexServerArgs,
 )
 from dagster_cloud.util import diff_serializable_namedtuple_map
-from dagster_cloud.util.errors import truncate_serialized_error
 
 DEFAULT_SERVER_PROCESS_STARTUP_TIMEOUT = 180
 DEFAULT_MAX_TTL_SERVERS = 25
@@ -155,7 +166,7 @@ class UserCodeLauncherEntry(
         code_location_deploy_data,
         update_timestamp,
     ):
-        return super(
+        return super().__new__(
             cls,
             check.inst_param(
                 code_location_deploy_data, "code_location_deploy_data", CodeLocationDeployData
@@ -234,10 +245,13 @@ SHARED_USER_CODE_LAUNCHER_CONFIG = {
         BoolSource,
         is_required=False,
         default_value=True,
-        description=(
-
-
-
+        description=("Deprecated - no longer used"),
+    ),
+    "direct_snapshot_uploads": Field(
+        BoolSource,
+        is_required=False,
+        default_value=True,
+        description=("Opt-out for uploading definition snapshots directly to blob storage."),
     ),
     "upload_snapshots_on_startup": Field(
         BoolSource,
@@ -260,8 +274,8 @@ SHARED_USER_CODE_LAUNCHER_CONFIG = {
     ),
 }
 
-DeploymentAndLocation: TypeAlias =
-UserCodeLauncherEntryMap: TypeAlias =
+DeploymentAndLocation: TypeAlias = tuple[str, str]
+UserCodeLauncherEntryMap: TypeAlias = dict[DeploymentAndLocation, UserCodeLauncherEntry]
 
 
 class ServerEndpoint(
@@ -271,12 +285,12 @@ class ServerEndpoint(
             ("host", str),
             ("port", Optional[int]),
             ("socket", Optional[str]),
-            ("metadata", Optional[
+            ("metadata", Optional[list[tuple[str, str]]]),
         ],
     )
 ):
     def __new__(cls, host, port, socket, metadata=None):
-        return super(
+        return super().__new__(
             cls,
             check.str_param(host, "host"),
             check.opt_int_param(port, "port"),
@@ -292,7 +306,7 @@ class ServerEndpoint(
     def create_multipex_client(self) -> MultiPexGrpcClient:
         return MultiPexGrpcClient(port=self.port, socket=self.socket, host=self.host)
 
-    def with_metadata(self, metadata: Optional[
+    def with_metadata(self, metadata: Optional[list[tuple[str, str]]]):
         return self._replace(metadata=metadata)
 
 
@@ -312,7 +326,7 @@ class DagsterCloudGrpcServer(
         server_endpoint: ServerEndpoint,
         code_location_deploy_data: CodeLocationDeployData,
     ):
-        return super(
+        return super().__new__(
            cls,
            server_handle,
            check.inst_param(server_endpoint, "server_endpoint", ServerEndpoint),
@@ -322,57 +336,72 @@ class DagsterCloudGrpcServer(
         )
 
 
+_SUPPORTED_FILE_FORMATS = [FileFormat.GZIPPED_JSON, FileFormat.JSON]
+
+
+def _file_for_format(obj_bytes: bytes, fmt: str):
+    if fmt == FileFormat.JSON:
+        return BytesIO(obj_bytes)
+    elif fmt == FileFormat.GZIPPED_JSON:
+        return BytesIO(zlib.compress(obj_bytes))
+    else:
+        check.failed(f"Unexpected file format {fmt}")
+
+
 class DagsterCloudUserCodeLauncher(
     AbstractContextManager, MayHaveInstanceWeakref[DagsterCloudAgentInstance], Generic[ServerHandle]
 ):
     def __init__(
         self,
         server_ttl: Optional[dict] = None,
-        defer_job_snapshots: bool = True,
         server_process_startup_timeout=None,
         upload_snapshots_on_startup: bool = True,
         requires_healthcheck: bool = False,
         code_server_metrics: Optional[Mapping[str, Any]] = None,
         agent_metrics: Optional[Mapping[str, Any]] = None,
+        direct_snapshot_uploads: bool = False,
+        # ignored old setting, allowed to flow through to avoid breakage
+        defer_job_snapshots: bool = True,
     ):
-        self._grpc_servers:
+        self._grpc_servers: dict[
            DeploymentAndLocation, Union[DagsterCloudGrpcServer, SerializableErrorInfo]
        ] = {}
-        self._first_unavailable_times:
+        self._first_unavailable_times: dict[DeploymentAndLocation, float] = {}
 
-        self._pending_delete_grpc_server_handles:
+        self._pending_delete_grpc_server_handles: set[ServerHandle] = set()
         self._grpc_servers_lock = threading.Lock()
-        self._per_location_metrics:
+        self._per_location_metrics: dict[
            DeploymentAndLocation, CloudCodeServerUtilizationMetrics
-        ] =
+        ] = defaultdict(lambda: init_optional_typeddict(CloudCodeServerUtilizationMetrics))
 
-        self._multipex_servers:
+        self._multipex_servers: dict[DeploymentAndLocation, DagsterCloudGrpcServer] = {}
 
         self._server_ttl_config = check.opt_dict_param(server_ttl, "server_ttl")
-        self.
+        self._direct_snapshot_uploads = direct_snapshot_uploads
         self.upload_snapshots_on_startup = check.bool_param(
             upload_snapshots_on_startup, "upload_snapshots_on_startup"
         )
         self._requires_healthcheck = check.bool_param(requires_healthcheck, "requires_healthcheck")
 
         # periodically reconciles to make desired = actual
-        self._desired_entries:
-        self._actual_entries:
+        self._desired_entries: dict[DeploymentAndLocation, UserCodeLauncherEntry] = {}
+        self._actual_entries: dict[DeploymentAndLocation, UserCodeLauncherEntry] = {}
         self._last_refreshed_actual_entries = 0
         self._last_cleaned_up_dangling_code_servers = 0
         self._metadata_lock = threading.Lock()
 
-        self._upload_locations:
+        self._upload_locations: set[DeploymentAndLocation] = set()
 
         self._logger = logging.getLogger("dagster_cloud.user_code_launcher")
         self._event_logger = logging.getLogger("cloud-events")
         self._started: bool = False
         self._run_worker_monitoring_thread = None
         self._run_worker_monitoring_thread_shutdown_event = None
-        self._run_worker_deployments_to_check:
-        self._run_worker_statuses_dict:
+        self._run_worker_deployments_to_check: set[str] = set()
+        self._run_worker_statuses_dict: dict[str, list[CloudRunWorkerStatus]] = {}
         self._run_worker_monitoring_lock = threading.Lock()
 
+        self._in_progress_reconcile_start_time = time.time()
         self._reconcile_count = 0
         self._reconcile_grpc_metadata_shutdown_event = threading.Event()
         self._reconcile_grpc_metadata_thread = None
```
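The `_file_for_format` helper added above wraps the serialized snapshot bytes in a `BytesIO`, zlib-compressing them first when the `GZIPPED_JSON` format is used. A minimal round-trip sketch of that compression path (the payload and variable names are illustrative, not taken from the package):

```python
import zlib
from io import BytesIO

# Illustrative payload standing in for a serialized definition snapshot.
payload = b'{"repository": "example"}'

# GZIPPED_JSON branch: compress before handing a file-like object to the HTTP PUT.
compressed_file = BytesIO(zlib.compress(payload))

# The consumer of the uploaded blob can recover the original bytes.
assert zlib.decompress(compressed_file.read()) == payload
```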
```diff
@@ -390,7 +419,7 @@ class DagsterCloudUserCodeLauncher(
         self._agent_metrics_config = agent_metrics
         super().__init__()
 
-    def get_active_grpc_server_handles(self) ->
+    def get_active_grpc_server_handles(self) -> list[ServerHandle]:
         with self._grpc_servers_lock:
             return [
                 s.server_handle
@@ -398,7 +427,7 @@ class DagsterCloudUserCodeLauncher(
                 if not isinstance(s, SerializableErrorInfo)
             ] + list(self._pending_delete_grpc_server_handles)
 
-    def get_active_agent_ids(self) -> Optional[
+    def get_active_agent_ids(self) -> Optional[set[str]]:
         try:
             result = self._instance.organization_scoped_graphql_client().execute(
                 GET_AGENTS_QUERY,
@@ -569,6 +598,193 @@ class DagsterCloudUserCodeLauncher(
         workspace_entry: DagsterCloudUploadWorkspaceEntry,
         server_or_error: Union[DagsterCloudGrpcServer, SerializableErrorInfo],
     ) -> None:
+        if self._direct_snapshot_uploads:
+            self._update_workspace_entry_direct_upload(
+                deployment_name,
+                workspace_entry,
+                server_or_error,
+            )
+        else:
+            self._update_workspace_entry_server_upload(
+                deployment_name,
+                workspace_entry,
+                server_or_error,
+            )
+
+    def _ensure_snapshot_uploaded(
+        self,
+        deployment_name: str,
+        snapshot_type: str,
+        serialized_object: str,
+    ) -> StoredSnapshot:
+        object_bytes = serialized_object.encode("utf-8")
+        sha1 = hashlib.sha1(object_bytes).hexdigest()
+        byte_count = len(object_bytes)
+        response = self._instance.requests_managed_retries_session.get(
+            self._instance.dagster_cloud_check_snapshot_url,
+            headers=self._instance.headers_for_deployment(deployment_name),
+            params={
+                "type": snapshot_type,
+                "sha1": sha1,
+                "size": byte_count,
+                "formats": _SUPPORTED_FILE_FORMATS,
+            },
+            timeout=self._instance.dagster_cloud_api_timeout,
+            proxies=self._instance.dagster_cloud_api_proxies,
+        )
+        raise_http_error(response)
+
+        result = unpack_value(response.json(), CheckSnapshotResult)
+
+        if not result.stored_snapshot:
+            upload_data = check.not_none(
+                result.upload_data,
+                "upload_data expected when stored_snapshot is None",
+            )
+            file = _file_for_format(object_bytes, upload_data.format)
+            response = self._instance.requests_managed_retries_session.put(
+                url=upload_data.presigned_put_url,
+                data=file,
+                timeout=self._instance.dagster_cloud_api_timeout,
+            )
+            raise_http_error(response)
+
+            response = self._instance.requests_managed_retries_session.put(
+                self._instance.dagster_cloud_confirm_upload_url,
+                headers=self._instance.headers_for_deployment(deployment_name),
+                json=pack_value(upload_data),
+                timeout=self._instance.dagster_cloud_api_timeout,
+                proxies=self._instance.dagster_cloud_api_proxies,
+            )
+            raise_http_error(response)
+            result = unpack_value(response.json(), ConfirmUploadResult)
+            return result.stored_snapshot
+
+        return result.stored_snapshot
+
+    def _update_workspace_entry_direct_upload(
+        self,
+        deployment_name: str,
+        workspace_entry: DagsterCloudUploadWorkspaceEntry,
+        server_or_error: Union[DagsterCloudGrpcServer, SerializableErrorInfo],
+    ) -> None:
+        # updated scheme, uploading definitions to blob storage via signed urls
+        error_snap = None
+        manifest = None
+        if workspace_entry.serialized_error_info:
+            error_snap = self._ensure_snapshot_uploaded(
+                deployment_name,
+                SnapshotType.ERROR,
+                serialize_value(workspace_entry.serialized_error_info),
+            )
+        elif isinstance(server_or_error, SerializableErrorInfo):
+            error_snap = self._ensure_snapshot_uploaded(
+                deployment_name,
+                SnapshotType.ERROR,
+                serialize_value(server_or_error),
+            )
+        elif workspace_entry.upload_location_data:
+            repos = []
+            for repo_data in workspace_entry.upload_location_data.upload_repository_datas:
+                stored_snapshot = self._ensure_snapshot_uploaded(
+                    deployment_name,
+                    SnapshotType.REPOSITORY,
+                    repo_data.serialized_repository_data,
+                )
+                repos.append(
+                    DagsterCloudRepositoryManifest(
+                        name=repo_data.repository_name,
+                        code_pointer=repo_data.code_pointer,
+                        stored_snapshot=stored_snapshot,
+                    )
+                )
+
+            manifest = DagsterCloudCodeLocationManifest(
+                repositories=repos,
+                executable_path=workspace_entry.upload_location_data.executable_path,
+                container_image=workspace_entry.upload_location_data.container_image,
+                dagster_library_versions=workspace_entry.upload_location_data.dagster_library_versions,
+                code_location_deploy_data=workspace_entry.code_location_deploy_data,
+            )
+        else:
+            check.failed(
+                "Expected DagsterCloudUploadWorkspaceEntry to have either location data or error, had neither."
+            )
+
+        result = DagsterCloudCodeLocationUpdateResult(
+            location_name=workspace_entry.location_name,
+            error_snapshot=error_snap,
+            manifest=manifest,
+        )
+
+        res = self._instance.requests_managed_retries_session.put(
+            self._instance.dagster_cloud_code_location_update_result_url,
+            headers=self._instance.headers_for_deployment(deployment_name),
+            json=pack_value(result),
+            timeout=self._instance.dagster_cloud_api_timeout,
+            proxies=self._instance.dagster_cloud_api_proxies,
+        )
+        raise_http_error(res)
+        first_response = unpack_value(res.json(), DagsterCloudCodeLocationUpdateResponse)
+        if first_response.updated:
+            self._logger.info(
+                "Code location update result for"
+                f" {deployment_name}:{workspace_entry.location_name} - {first_response.message}"
+            )
+            return
+
+        missing = check.not_none(
+            first_response.missing_job_snapshots,
+            "Expected missing_job_snapshots when updated is false.",
+        )
+        server = check.inst(
+            server_or_error,
+            DagsterCloudGrpcServer,
+            "Server should not be in error state if there are missing snapshots.",
+        )
+        self._logger.info(f"Uploading {len(missing)} job snapshots.")
+        with ThreadPoolExecutor() as executor:
+            _ = list(
+                executor.map(
+                    lambda job_selector: self.upload_job_snap_direct(
+                        deployment_name,
+                        job_selector,
+                        server,
+                    ),
+                    missing,
+                )
+            )
+        res = self._instance.requests_managed_retries_session.put(
+            self._instance.dagster_cloud_code_location_update_result_url,
+            headers=self._instance.headers_for_deployment(deployment_name),
+            json=pack_value(result),
+            timeout=self._instance.dagster_cloud_api_timeout,
+            proxies=self._instance.dagster_cloud_api_proxies,
+        )
+        raise_http_error(res)
+        second_response = unpack_value(res.json(), DagsterCloudCodeLocationUpdateResponse)
+        if not second_response.updated:
+            if second_response.missing_job_snapshots:
+                # this condition is expected to be extremely unlikely
+                raise Exception(
+                    "Code location update failed, job definitions changed while uploading:"
+                    f" {second_response.missing_job_snapshots}"
+                )
+            else:
+                raise Exception(f"Code location update failed: {second_response.message}")
+
+        self._logger.info(
+            "Code location update result for"
+            f" {deployment_name}:{workspace_entry.location_name} - {second_response.message}"
+        )
+
+    def _update_workspace_entry_server_upload(
+        self,
+        deployment_name: str,
+        workspace_entry: DagsterCloudUploadWorkspaceEntry,
+        server_or_error: Union[DagsterCloudGrpcServer, SerializableErrorInfo],
+    ) -> None:
+        # legacy scheme, uploading definitions blobs to web server
         with tempfile.TemporaryDirectory() as temp_dir:
             dst = os.path.join(temp_dir, "workspace_entry.tmp")
             with open(dst, "wb") as f:
```
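The `_ensure_snapshot_uploaded` method added above implements the direct-upload path in three steps: a GET to a check-snapshot endpoint keyed by the content's SHA-1 digest and size, a PUT of the (possibly compressed) bytes to a presigned blob-storage URL when the snapshot is not already stored, and a confirming PUT back to the API. The sketch below mirrors that shape using plain `requests` with hypothetical URLs and response fields; the real code uses the instance's retry-managed session and serdes-packed payloads.

```python
import hashlib

import requests  # stand-in for the agent's retry-managed session

API = "https://agent.example.dagster.cloud"  # hypothetical endpoints
body = b'{"serialized": "snapshot"}'

# 1. Ask whether a snapshot with this digest is already stored.
check = requests.get(
    f"{API}/check_snapshot",
    params={"sha1": hashlib.sha1(body).hexdigest(), "size": len(body)},
).json()

if not check.get("stored_snapshot"):
    # 2. Upload the bytes straight to blob storage via the presigned URL.
    requests.put(check["presigned_put_url"], data=body).raise_for_status()
    # 3. Confirm the upload so the server records the stored snapshot.
    requests.put(f"{API}/confirm_upload", json=check).raise_for_status()
```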
```diff
@@ -647,7 +863,16 @@ class DagsterCloudUserCodeLauncher(
                 f" {deployment_name}:{workspace_entry.location_name} {response.message}"
             )
 
-    def
+    async def gen_list_repositories_response(
+        self,
+        client: DagsterGrpcClient,
+    ) -> "ListRepositoriesResponse":
+        return await gen_list_repositories_grpc(
+            client,
+            timeout=int(os.getenv("DAGSTER_CLOUD_LIST_REPOSITORIES_GRPC_TIMEOUT", "180")),
+        )
+
+    async def _get_upload_location_data(
         self,
         deployment_name: str,
         location_name: str,
@@ -656,30 +881,40 @@ class DagsterCloudUserCodeLauncher(
         location_origin = self._get_code_location_origin(location_name)
         client = server.server_endpoint.create_client()
 
-        list_repositories_response =
+        list_repositories_response = await self.gen_list_repositories_response(client)
 
-        upload_repo_datas:
+        upload_repo_datas: list[DagsterCloudUploadRepositoryData] = []
 
         for (
             repository_name,
             code_pointer,
         ) in list_repositories_response.repository_code_pointer_dict.items():
-
-
-
+            if os.getenv("DAGSTER_CLOUD_USE_STREAMING_EXTERNAL_REPOSITORY"):
+                external_repository_chunks = [
+                    chunk
+                    async for chunk in client.gen_streaming_external_repository(
+                        remote_repository_origin=RemoteRepositoryOrigin(
+                            location_origin,
+                            repository_name,
+                        ),
+                        defer_snapshots=True,
+                    )
+                ]
+
+                serialized_repository_data = "".join(
+                    [
+                        chunk["serialized_external_repository_chunk"]
+                        for chunk in external_repository_chunks
+                    ]
+                )
+            else:
+                serialized_repository_data = await client.gen_external_repository(
+                    remote_repository_origin=RemoteRepositoryOrigin(
                         location_origin,
                         repository_name,
                     ),
-                    defer_snapshots=
+                    defer_snapshots=True,
                 )
-            )
-
-            serialized_repository_data = "".join(
-                [
-                    chunk["serialized_external_repository_chunk"]
-                    for chunk in external_repository_chunks
-                ]
-            )
 
             # Don't deserialize in case there are breaking changes - let the server do it
             upload_repo_datas.append(
@@ -729,7 +964,27 @@ class DagsterCloudUserCodeLauncher(
                 deployment_name, errored_workspace_entry, server_or_error=error_info
             )
 
-    def
+    async def _try_update_location_data(
+        self,
+        deployment_name: str,
+        location_name: str,
+        server_or_error: Union[DagsterCloudGrpcServer, SerializableErrorInfo],
+        metadata: CodeLocationDeployData,
+    ):
+        try:
+            await self._update_location_data(
+                deployment_name,
+                location_name,
+                server_or_error,
+                metadata,
+            )
+        except Exception:
+            self._logger.error(
+                f"Error while writing location data for {deployment_name}:{location_name}:"
+                f" {serializable_error_info_from_exc_info(sys.exc_info())}"
+            )
+
+    async def _update_location_data(
         self,
         deployment_name: str,
         location_name: str,
@@ -756,7 +1011,7 @@ class DagsterCloudUserCodeLauncher(
         loaded_workspace_entry = DagsterCloudUploadWorkspaceEntry(
             location_name=location_name,
             code_location_deploy_data=metadata,
-            upload_location_data=self._get_upload_location_data(
+            upload_location_data=await self._get_upload_location_data(
                 deployment_name,
                 location_name,
                 server_or_error,
@@ -789,7 +1044,7 @@ class DagsterCloudUserCodeLauncher(
 
     def _get_existing_pex_servers(
         self, deployment_name: str, location_name: str
-    ) ->
+    ) -> list[PexServerHandle]:
         server = self._multipex_servers.get((deployment_name, location_name))
 
         if not server:
@@ -870,6 +1125,11 @@ class DagsterCloudUserCodeLauncher(
     ):
         deployment_name, location_name = to_update_key
 
+        attributes = {
+            "deployment": deployment_name,
+            "location": location_name,
+        }
+
         code_location_deploy_data = desired_entry.code_location_deploy_data
         pex_metadata = code_location_deploy_data.pex_metadata
         deployment_info = (
@@ -882,13 +1142,19 @@ class DagsterCloudUserCodeLauncher(
             self._logger.info(
                 f"Waiting for new grpc server for {deployment_name}:{location_name} for {deployment_info} to be ready..."
             )
-
-
-
-
-
-
-
+            with observe_execution(
+                opentelemetry=self.opentelemetry,
+                event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.user_code.server.process_wait",
+                short_description="waiting for new server process to be ready",
+                attributes=attributes,
+            ):
+                await self._wait_for_new_server_ready(
+                    deployment_name,
+                    location_name,
+                    desired_entry,
+                    server_or_error.server_handle,
+                    server_or_error.server_endpoint,
+                )
         except Exception:
             error_info = serializable_error_info_from_exc_info(sys.exc_info())
             self._logger.error(
```
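Both steps above (waiting for the new server process and uploading the location data, in the hunk that follows) are now wrapped in `observe_execution`, the OpenTelemetry context manager introduced elsewhere in this release, with a metric key and deployment/location attributes. A rough stand-in showing the general shape of such a timing wrapper (the function and its print-based sink are hypothetical, not the package's implementation):

```python
import time
from contextlib import contextmanager

@contextmanager
def observe(event_key: str, **attributes):
    # Time the wrapped block and report the duration alongside its attributes;
    # a real observer would also record success/failure and emit OTel metrics.
    start = time.monotonic()
    try:
        yield
    finally:
        print(f"{event_key}: {time.monotonic() - start:.2f}s", attributes)

with observe(
    "dagster_cloud_agent.user_code.server.process_wait",
    deployment="prod",
    location="data-pipelines",
):
    time.sleep(0.1)  # stands in for waiting on the new code server
```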
```diff
@@ -898,21 +1164,18 @@ class DagsterCloudUserCodeLauncher(
             server_or_error = error_info
 
         if should_upload:
-
-            self.
+            with observe_execution(
+                opentelemetry=self.opentelemetry,
+                event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.user_code.upload",
+                short_description="uploading user code data",
+                attributes=attributes,
+            ):
+                await self._try_update_location_data(
                     deployment_name,
                     location_name,
                     server_or_error,
                     desired_entry.code_location_deploy_data,
                 )
-            except Exception:
-                # If there was a failure uploading snapshots, log it but don't block other code locations
-                # from updating (and still use the new server to serve new requests)
-                error_info = serializable_error_info_from_exc_info(sys.exc_info())
-                self._logger.error(
-                    f"Error while writing location data for {deployment_name}:{location_name}:"
-                    f" {error_info}"
-                )
 
         # Once we've verified that the new server has uploaded its data successfully, swap in
         # the server to start serving new requests
@@ -986,7 +1249,7 @@ class DagsterCloudUserCodeLauncher(
             self._pending_delete_grpc_server_handles.discard(server_handle)
 
     def _cleanup_servers(
-        self, active_agent_ids: Optional[
+        self, active_agent_ids: Optional[set[str]], include_own_servers: bool
     ) -> None:
         """Remove all servers, across all deployments and locations."""
         with ThreadPoolExecutor() as executor:
@@ -1009,7 +1272,7 @@ class DagsterCloudUserCodeLauncher(
                     self._logger.exception("Error cleaning up server")
 
     @abstractmethod
-    def _list_server_handles(self) ->
+    def _list_server_handles(self) -> list[ServerHandle]:
         """Return a list of all server handles across all deployments and locations."""
 
     @abstractmethod
@@ -1021,7 +1284,7 @@ class DagsterCloudUserCodeLauncher(
         """Returns the update_timestamp value from the given code server."""
 
     def _can_cleanup_server(
-        self, handle: ServerHandle, active_agent_ids: Optional[
+        self, handle: ServerHandle, active_agent_ids: Optional[set[str]], include_own_servers: bool
     ) -> bool:
         """Returns true if we can clean up the server identified by the handle without issues (server was started by this agent, or agent is no longer active)."""
         agent_id_for_server = self.get_agent_id_for_server(handle)
@@ -1057,7 +1320,7 @@ class DagsterCloudUserCodeLauncher(
             return False
 
         return (active_agent_ids is not None) and (
-            agent_id_for_server not in cast(
+            agent_id_for_server not in cast("set[str]", active_agent_ids)
         )
 
     def _graceful_cleanup_servers(self, include_own_servers: bool):  # ServerHandles
@@ -1066,7 +1329,7 @@ class DagsterCloudUserCodeLauncher(
             return self._cleanup_servers(active_agent_ids, include_own_servers=include_own_servers)
 
         handles = self._list_server_handles()
-        servers_to_remove:
+        servers_to_remove: list[ServerHandle] = []
         with self._grpc_servers_lock:
             for handle in handles:
                 if self._can_cleanup_server(
@@ -1090,7 +1353,7 @@ class DagsterCloudUserCodeLauncher(
             self._reconcile_grpc_metadata_thread.join()
 
         if self._run_worker_monitoring_thread:
-            self._run_worker_monitoring_thread_shutdown_event.set()
+            self._run_worker_monitoring_thread_shutdown_event.set()  # pyright: ignore[reportOptionalMemberAccess]
             self._run_worker_monitoring_thread.join()
 
         if self._reconcile_location_utilization_metrics_thread:
@@ -1100,10 +1363,10 @@ class DagsterCloudUserCodeLauncher(
         if self._started:
             self._graceful_cleanup_servers(include_own_servers=True)
 
-        super().__exit__(exception_value, exception_value, traceback)
+        super().__exit__(exception_value, exception_value, traceback)  # pyright: ignore[reportAbstractUsage]
 
     def add_upload_metadata(
-        self, upload_metadata:
+        self, upload_metadata: dict[DeploymentAndLocation, UserCodeLauncherEntry]
     ):
         """Add a set of locations to be uploaded in the next reconcilation loop."""
         with self._metadata_lock:
@@ -1112,7 +1375,7 @@ class DagsterCloudUserCodeLauncher(
 
     def update_grpc_metadata(
         self,
-        desired_metadata:
+        desired_metadata: dict[DeploymentAndLocation, UserCodeLauncherEntry],
     ):
         check.dict_param(
             desired_metadata,
@@ -1152,7 +1415,7 @@ class DagsterCloudUserCodeLauncher(
         endpoints_or_errors = self.get_grpc_endpoints()
         for (deployment_name, location_name), endpoint_or_error in endpoints_or_errors.items():
             if isinstance(endpoint_or_error, ServerEndpoint):
-                endpoint = cast(ServerEndpoint, endpoint_or_error)
+                endpoint = cast("ServerEndpoint", endpoint_or_error)
                 raw_metrics_str = (
                     endpoint.create_client().ping("").get("serialized_server_utilization_metrics")
                 )
@@ -1217,7 +1480,7 @@ class DagsterCloudUserCodeLauncher(
             try:
                 self._graceful_cleanup_servers(include_own_servers=False)
             except:
-                self._logger.exception("Failed to clean up dangling code
+                self._logger.exception("Failed to clean up dangling code servers.")
             self._last_cleaned_up_dangling_code_servers = now
 
         if now - self._last_refreshed_actual_entries > ACTUAL_ENTRIES_REFRESH_INTERVAL:
@@ -1227,6 +1490,8 @@ class DagsterCloudUserCodeLauncher(
                 self._logger.exception("Failed to refresh actual entries.")
             self._last_refreshed_actual_entries = now
 
+        self._in_progress_reconcile_start_time = time.time()
+
         self._reconcile(
             desired_entries,
             upload_locations,
@@ -1243,6 +1508,7 @@ class DagsterCloudUserCodeLauncher(
                 f"Started polling for requests from {self._instance.dagster_cloud_url}"
             )
 
+        self._in_progress_reconcile_start_time = None
         self._reconcile_count += 1
 
     def _update_metrics_thread(self, shutdown_event):
@@ -1267,19 +1533,39 @@ class DagsterCloudUserCodeLauncher(
         # thread-safe since reconcile_count is an integer
         return self._reconcile_count > 0
 
-
-
-        return
+    @property
+    def in_progress_reconcile_start_time(self) -> Optional[float]:
+        return self._in_progress_reconcile_start_time
+
+    def _make_check_on_running_server_endpoint(
+        self, server_endpoint: ServerEndpoint
+    ) -> Callable[[], Union[ListRepositoriesResponse, SerializableErrorInfo]]:
+        return lambda: deserialize_value(
+            server_endpoint.create_client().list_repositories(),
+            (ListRepositoriesResponse, SerializableErrorInfo),
+        )
+
+    def _trigger_recovery_server_restart(self, deployment_location: DeploymentAndLocation):
+        del self._actual_entries[deployment_location]
+
+        if deployment_location in self._first_unavailable_times:
+            del self._first_unavailable_times[deployment_location]
+
+        # redeploy the multipex server in this case as well to ensure a fresh start
+        # if it resource contrained (and ensure that we don't try to create the same
+        # PexServerHandle again and delete the code location in a loop)
+        if deployment_location in self._multipex_servers:
+            del self._multipex_servers[deployment_location]
 
     def _refresh_actual_entries(self) -> None:
-        for deployment_location,
+        for deployment_location, multipex_server in self._multipex_servers.items():
             if deployment_location in self._actual_entries:
                 # If a multipex server exists, we query it over gRPC
                 # to make sure the pex server is still available.
 
                 # First verify that the multipex server is running
                 try:
-
+                    multipex_server.server_endpoint.create_multipex_client().ping("")
                 except:
                     # If it isn't, this is expected if ECS is currently spinning up this service
                     # after it crashed. In this case, we want to wait for it to fully come up
@@ -1291,14 +1577,20 @@ class DagsterCloudUserCodeLauncher(
                     )
                     return
                 deployment_name, location_name = deployment_location
+
+                # If we expect there to be a running code location here but there is none,
                 if not self._get_existing_pex_servers(deployment_name, location_name):
-                    self.
-
-
-
-
-
-
+                    with self._grpc_servers_lock:
+                        grpc_server_or_error = self._grpc_servers.get(deployment_location)
+
+                    if isinstance(grpc_server_or_error, DagsterCloudGrpcServer):
+                        self._logger.warning(
+                            "Pex servers disappeared for running code location %s:%s. Removing actual entries to"
+                            " activate reconciliation logic and deploy a new code server and multipex server.",
+                            deployment_name,
+                            location_name,
+                        )
+                        self._trigger_recovery_server_restart(deployment_location)
 
         # Check to see if any servers have become unresponsive
         unavailable_server_timeout = int(
@@ -1332,8 +1624,6 @@ class DagsterCloudUserCodeLauncher(
         ) as executor:
             futures = {}
             for deployment_location, endpoint_or_error in running_locations.items():
-                deployment_name, location_name = deployment_location
-
                 futures[
                     executor.submit(self._make_check_on_running_server_endpoint(endpoint_or_error))
                 ] = deployment_location
@@ -1343,15 +1633,22 @@ class DagsterCloudUserCodeLauncher(
 
                 deployment_name, location_name = deployment_location
                 try:
-                    future.result()
-
+                    response_or_error = future.result()
                     # Successful ping resets the tracked last unavailable time for this code server, if set
                     self._first_unavailable_times.pop(deployment_location, None)
+                    if isinstance(response_or_error, SerializableErrorInfo):
+                        # This can happen if the server was previously healthy but restarted
+                        # and moved into an error state - attempt to recover
+                        self._logger.exception(
+                            f"Code server for {deployment_name}:{location_name} unexpectedly moved into an error state. Deploying a new code server. Observed error: \n{response_or_error.to_string()}"
+                        )
+                        self._trigger_recovery_server_restart(deployment_location)
                 except Exception as e:
                     if (
                         isinstance(e, DagsterUserCodeUnreachableError)
                         and isinstance(e.__cause__, grpc.RpcError)
-                        and cast(grpc.RpcError, e.__cause__).code()
+                        and cast("grpc.RpcError", e.__cause__).code()
+                        in {grpc.StatusCode.UNAVAILABLE, grpc.StatusCode.UNKNOWN}
                     ):
                         first_unavailable_time = self._first_unavailable_times.get(
                             deployment_location
```
|
|
|
1369
1666
|
self._logger.warning(
|
|
1370
1667
|
f"Code server for {deployment_name}:{location_name} has been unresponsive for more than {unavailable_server_timeout} seconds. Deploying a new code server."
|
|
1371
1668
|
)
|
|
1372
|
-
|
|
1373
|
-
|
|
1669
|
+
self._trigger_recovery_server_restart(deployment_location)
|
|
1670
|
+
|
|
1374
1671
|
else:
|
|
1375
1672
|
self._logger.exception(
|
|
1376
1673
|
f"Code server for {deployment_name}:{location_name} health check failed, but the error did not indicate that the server was unavailable."
|
|
@@ -1406,15 +1703,15 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1406
1703
|
|
|
1407
1704
|
def _deployments_and_locations_to_string(
|
|
1408
1705
|
self,
|
|
1409
|
-
deployments_and_locations:
|
|
1410
|
-
entries:
|
|
1706
|
+
deployments_and_locations: set[DeploymentAndLocation],
|
|
1707
|
+
entries: dict[DeploymentAndLocation, UserCodeLauncherEntry],
|
|
1411
1708
|
):
|
|
1412
1709
|
return (
|
|
1413
1710
|
"{"
|
|
1414
1711
|
+ ", ".join(
|
|
1415
1712
|
sorted(
|
|
1416
1713
|
[
|
|
1417
|
-
f"({dep}, {loc}, {entries[(dep,loc)].update_timestamp})"
|
|
1714
|
+
f"({dep}, {loc}, {entries[(dep, loc)].update_timestamp})"
|
|
1418
1715
|
for dep, loc in deployments_and_locations
|
|
1419
1716
|
]
|
|
1420
1717
|
)
|
|
@@ -1431,8 +1728,8 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1431
1728
|
|
|
1432
1729
|
def _reconcile(
|
|
1433
1730
|
self,
|
|
1434
|
-
desired_entries:
|
|
1435
|
-
upload_locations:
|
|
1731
|
+
desired_entries: dict[DeploymentAndLocation, UserCodeLauncherEntry],
|
|
1732
|
+
upload_locations: set[DeploymentAndLocation],
|
|
1436
1733
|
check_on_pending_delete_servers: bool,
|
|
1437
1734
|
):
|
|
1438
1735
|
if check_on_pending_delete_servers:
|
|
@@ -1443,7 +1740,11 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1443
1740
|
for handle in handles:
|
|
1444
1741
|
self._graceful_remove_server_handle(handle)
|
|
1445
1742
|
|
|
1446
|
-
diff = diff_serializable_namedtuple_map(
|
|
1743
|
+
diff = diff_serializable_namedtuple_map(
|
|
1744
|
+
desired_entries,
|
|
1745
|
+
self._actual_entries,
|
|
1746
|
+
update_key_fn=lambda entry: entry.update_timestamp,
|
|
1747
|
+
)
|
|
1447
1748
|
has_changes = diff.to_add or diff.to_update or diff.to_remove or upload_locations
|
|
1448
1749
|
|
|
1449
1750
|
if not has_changes:
|
|
@@ -1469,26 +1770,26 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1469
1770
|
to_update_keys = diff.to_add.union(diff.to_update)
|
|
1470
1771
|
|
|
1471
1772
|
# Handles for all running standalone Dagster GRPC servers
|
|
1472
|
-
existing_standalone_dagster_server_handles:
|
|
1773
|
+
existing_standalone_dagster_server_handles: dict[
|
|
1473
1774
|
DeploymentAndLocation, Collection[ServerHandle]
|
|
1474
1775
|
] = {}
|
|
1475
1776
|
|
|
1476
1777
|
# Handles for all running Dagster multipex servers (which can each host multiple grpc subprocesses)
|
|
1477
|
-
existing_multipex_server_handles:
|
|
1778
|
+
existing_multipex_server_handles: dict[DeploymentAndLocation, Collection[ServerHandle]] = {}
|
|
1478
1779
|
|
|
1479
1780
|
# For each location, all currently running pex servers on the current multipex server
|
|
1480
|
-
existing_pex_server_handles:
|
|
1781
|
+
existing_pex_server_handles: dict[DeploymentAndLocation, list[PexServerHandle]] = {}
|
|
1481
1782
|
|
|
1482
1783
|
# Dagster grpc servers created in this loop (including both standalone grpc servers
|
|
1483
1784
|
# and pex servers on a multipex server) - or an error that explains why it couldn't load
|
|
1484
|
-
new_dagster_servers:
|
|
1785
|
+
new_dagster_servers: dict[
|
|
1485
1786
|
DeploymentAndLocation, Union[DagsterCloudGrpcServer, SerializableErrorInfo]
|
|
1486
1787
|
] = {}
|
|
1487
1788
|
|
|
1488
1789
|
# Multipex servers created in this loop (a new multipex server might not always
|
|
1489
1790
|
# be created on each loop even if the code has changed, as long as the base image
|
|
1490
1791
|
# is the same)
|
|
1491
|
-
new_multipex_servers:
|
|
1792
|
+
new_multipex_servers: dict[DeploymentAndLocation, DagsterCloudGrpcServer] = {}
|
|
1492
1793
|
|
|
1493
1794
|
for to_update_key in to_update_keys:
|
|
1494
1795
|
deployment_name, location_name = to_update_key
|
|
@@ -1534,16 +1835,29 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1534
1835
|
)
|
|
1535
1836
|
# confirm it's a valid image since _start_new_server_spinup will launch a container
|
|
1536
1837
|
self._check_for_image(desired_entry.code_location_deploy_data)
|
|
1537
|
-
|
|
1538
|
-
|
|
1539
|
-
|
|
1540
|
-
|
|
1541
|
-
|
|
1542
|
-
|
|
1543
|
-
|
|
1544
|
-
|
|
1545
|
-
|
|
1546
|
-
|
|
1838
|
+
|
|
1839
|
+
attributes = {
|
|
1840
|
+
"deployment": deployment_name,
|
|
1841
|
+
"location": location_name,
|
|
1842
|
+
"image": desired_entry.code_location_deploy_data.image,
|
|
1843
|
+
"python_version": desired_python_version,
|
|
1844
|
+
}
|
|
1845
|
+
with observe_execution(
|
|
1846
|
+
opentelemetry=self.opentelemetry,
|
|
1847
|
+
event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.user_code.multipex_server.start",
|
|
1848
|
+
short_description="starting new multipex server",
|
|
1849
|
+
attributes=attributes,
|
|
1850
|
+
):
|
|
1851
|
+
multipex_server = self._start_new_server_spinup(
|
|
1852
|
+
deployment_name, location_name, desired_entry
|
|
1853
|
+
)
|
|
1854
|
+
self._multipex_servers[to_update_key] = multipex_server
|
|
1855
|
+
assert self._get_multipex_server(
|
|
1856
|
+
deployment_name,
|
|
1857
|
+
location_name,
|
|
1858
|
+
desired_entry.code_location_deploy_data,
|
|
1859
|
+
)
|
|
1860
|
+
new_multipex_servers[to_update_key] = multipex_server
|
|
1547
1861
|
else:
|
|
1548
1862
|
self._logger.info(
|
|
1549
1863
|
f"Found running multipex server for {multipex_server_repr}"
|
|
@@ -1569,8 +1883,7 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1569
1883
|
deployment_name, location_name = to_update_key
|
|
1570
1884
|
|
|
1571
1885
|
self._logger.info(
|
|
1572
|
-
f"Waiting for new multipex server for {deployment_name}:{location_name} to be"
|
|
1573
|
-
" ready"
|
|
1886
|
+
f"Waiting for new multipex server for {deployment_name}:{location_name} to be ready"
|
|
1574
1887
|
)
|
|
1575
1888
|
tasks[to_update_key] = self._wait_for_new_multipex_server(
|
|
1576
1889
|
deployment_name,
|
|
@@ -1681,6 +1994,7 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1681
1994
|
|
|
1682
1995
|
for server_handle in server_handles:
|
|
1683
1996
|
try:
|
|
1997
|
+
# TODO - telemetry of removing standalone servers
|
|
1684
1998
|
self._graceful_remove_server_handle(server_handle)
|
|
1685
1999
|
except Exception:
|
|
1686
2000
|
self._logger.error(
|
|
@@ -1711,6 +2025,7 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1711
2025
|
)
|
|
1712
2026
|
|
|
1713
2027
|
try:
|
|
2028
|
+
# TODO - telemetry of removing multipex server
|
|
1714
2029
|
self._graceful_remove_server_handle(multipex_server_handle)
|
|
1715
2030
|
except Exception:
|
|
1716
2031
|
self._logger.error(
|
|
@@ -1728,6 +2043,7 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1728
2043
|
)
|
|
1729
2044
|
for pex_server_handle in pex_server_handles:
|
|
1730
2045
|
try:
|
|
2046
|
+
# TODO - telemetry of removing pex server
|
|
1731
2047
|
self._remove_pex_server_handle(
|
|
1732
2048
|
deployment_name,
|
|
1733
2049
|
location_name,
|
|
@@ -1752,6 +2068,7 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1752
2068
|
for to_remove_key in diff.to_remove:
|
|
1753
2069
|
deployment_name, location_name = to_remove_key
|
|
1754
2070
|
try:
|
|
2071
|
+
# TODO - telemetry of removing location's server
|
|
1755
2072
|
self._remove_server(deployment_name, location_name)
|
|
1756
2073
|
except Exception:
|
|
1757
2074
|
self._logger.error(
|
|
@@ -1763,25 +2080,27 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1763
2080
|
del self._grpc_servers[to_remove_key]
|
|
1764
2081
|
del self._actual_entries[to_remove_key]
|
|
1765
2082
|
|
|
2083
|
+
if to_remove_key in self._multipex_servers:
|
|
2084
|
+
del self._multipex_servers[to_remove_key]
|
|
2085
|
+
|
|
1766
2086
|
# Upload any locations that were requested to be uploaded, but weren't updated
|
|
1767
2087
|
# as part of this reconciliation loop
|
|
2088
|
+
|
|
2089
|
+
tasks = {}
|
|
1768
2090
|
for location in upload_locations:
|
|
1769
2091
|
with self._grpc_servers_lock:
|
|
1770
2092
|
server_or_error = self._grpc_servers[location]
|
|
1771
2093
|
|
|
1772
2094
|
deployment_name, location_name = location
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
|
|
1776
|
-
|
|
1777
|
-
|
|
1778
|
-
|
|
1779
|
-
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
f"Error while writing location data for {deployment_name}:{location_name}:"
|
|
1783
|
-
f" {serializable_error_info_from_exc_info(sys.exc_info())}"
|
|
1784
|
-
)
|
|
2095
|
+
tasks[location] = self._try_update_location_data(
|
|
2096
|
+
deployment_name,
|
|
2097
|
+
location_name,
|
|
2098
|
+
server_or_error,
|
|
2099
|
+
self._actual_entries[location].code_location_deploy_data,
|
|
2100
|
+
)
|
|
2101
|
+
|
|
2102
|
+
if tasks:
|
|
2103
|
+
results = asyncio.run(self._gather_tasks(tasks.values()))
|
|
1785
2104
|
|
|
1786
2105
|
seconds = time.time() - start_time
|
|
1787
2106
|
self._logger.info(f"Finished reconciling in {seconds} seconds.")
|
|
@@ -1854,6 +2173,11 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1854
2173
|
def _start_new_dagster_server(
|
|
1855
2174
|
self, deployment_name: str, location_name: str, desired_entry: UserCodeLauncherEntry
|
|
1856
2175
|
) -> DagsterCloudGrpcServer:
|
|
2176
|
+
attributes = {
|
|
2177
|
+
"deployment": deployment_name,
|
|
2178
|
+
"location": location_name,
|
|
2179
|
+
}
|
|
2180
|
+
|
|
1857
2181
|
if desired_entry.code_location_deploy_data.pex_metadata:
|
|
1858
2182
|
multipex_server = self._get_multipex_server(
|
|
1859
2183
|
deployment_name, location_name, desired_entry.code_location_deploy_data
|
|
@@ -1861,26 +2185,45 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1861
2185
|
|
|
1862
2186
|
assert multipex_server # should have been started earlier or we should never reach here
|
|
1863
2187
|
|
|
1864
|
-
|
|
2188
|
+
if desired_entry.code_location_deploy_data.pex_metadata.python_version:
|
|
2189
|
+
attributes["python_version"] = (
|
|
2190
|
+
desired_entry.code_location_deploy_data.pex_metadata.python_version
|
|
2191
|
+
)
|
|
1865
2192
|
|
|
1866
|
-
|
|
1867
|
-
|
|
2193
|
+
with observe_execution(
|
|
2194
|
+
opentelemetry=self.opentelemetry,
|
|
2195
|
+
event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.user_code.pex_server.start",
|
|
2196
|
+
short_description="starting new pex server",
|
|
2197
|
+
attributes=attributes,
|
|
2198
|
+
):
|
|
2199
|
+
self._create_pex_server(
|
|
2200
|
+
deployment_name, location_name, desired_entry, multipex_server
|
|
2201
|
+
)
|
|
1868
2202
|
|
|
1869
|
-
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
|
|
1873
|
-
|
|
1874
|
-
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
|
|
1878
|
-
|
|
1879
|
-
|
|
1880
|
-
|
|
1881
|
-
|
|
2203
|
+
server_handle = multipex_server.server_handle
|
|
2204
|
+
multipex_endpoint = multipex_server.server_endpoint
|
|
2205
|
+
|
|
2206
|
+
# start a new pex server on the multipexer, which we can count on already existing
|
|
2207
|
+
return DagsterCloudGrpcServer(
|
|
2208
|
+
server_handle,
|
|
2209
|
+
multipex_endpoint.with_metadata(
|
|
2210
|
+
[
|
|
2211
|
+
("has_pex", "1"),
|
|
2212
|
+
("deployment", deployment_name),
|
|
2213
|
+
("location", location_name),
|
|
2214
|
+
("timestamp", str(int(desired_entry.update_timestamp))),
|
|
2215
|
+
],
|
|
2216
|
+
),
|
|
2217
|
+
desired_entry.code_location_deploy_data,
|
|
2218
|
+
)
|
|
1882
2219
|
else:
|
|
1883
|
-
|
|
2220
|
+
with observe_execution(
|
|
2221
|
+
opentelemetry=self.opentelemetry,
|
|
2222
|
+
event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.user_code.code_server.start",
|
|
2223
|
+
short_description="new code server spin up",
|
|
2224
|
+
attributes=attributes,
|
|
2225
|
+
):
|
|
2226
|
+
return self._start_new_server_spinup(deployment_name, location_name, desired_entry)
|
|
1884
2227
|
|
|
1885
2228
|
def get_grpc_endpoint(
|
|
1886
2229
|
self,
|
|
@@ -1924,12 +2267,12 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1924
2267
|
|
|
1925
2268
|
return server
|
|
1926
2269
|
|
|
1927
|
-
def get_grpc_server_heartbeats(self) ->
|
|
2270
|
+
def get_grpc_server_heartbeats(self) -> dict[str, list[CloudCodeServerHeartbeat]]:
|
|
1928
2271
|
endpoint_or_errors = self.get_grpc_endpoints()
|
|
1929
2272
|
with self._metadata_lock:
|
|
1930
2273
|
desired_entries = set(self._desired_entries.keys())
|
|
1931
2274
|
|
|
1932
|
-
heartbeats:
|
|
2275
|
+
heartbeats: dict[str, list[CloudCodeServerHeartbeat]] = {}
|
|
1933
2276
|
for entry_key in desired_entries:
|
|
1934
2277
|
deployment_name, location_name = entry_key
|
|
1935
2278
|
endpoint_or_error = endpoint_or_errors.get(entry_key)
|
|
@@ -1979,7 +2322,7 @@ class DagsterCloudUserCodeLauncher(
|
|
|
1979
2322
|
|
|
1980
2323
|
def get_grpc_endpoints(
|
|
1981
2324
|
self,
|
|
1982
|
-
) ->
|
|
2325
|
+
) -> dict[DeploymentAndLocation, Union[ServerEndpoint, SerializableErrorInfo]]:
|
|
1983
2326
|
with self._grpc_servers_lock:
|
|
1984
2327
|
return {
|
|
1985
2328
|
key: val if isinstance(val, SerializableErrorInfo) else val.server_endpoint
|
|
@@ -2011,7 +2354,7 @@ class DagsterCloudUserCodeLauncher(
|
|
|
2011
2354
|
client, timeout, additional_check, get_timeout_debug_info=get_timeout_debug_info
|
|
2012
2355
|
)
|
|
2013
2356
|
# Call a method that raises an exception if there was an error importing the code
|
|
2014
|
-
|
|
2357
|
+
await self.gen_list_repositories_response(client)
|
|
2015
2358
|
|
|
2016
2359
|
async def _wait_for_server_process(
|
|
2017
2360
|
self,
|
|
@@ -2070,6 +2413,7 @@ class DagsterCloudUserCodeLauncher(
|
|
|
2070
2413
|
response = client.external_job(
|
|
2071
2414
|
RemoteRepositoryOrigin(location_origin, job_selector.repository_name),
|
|
2072
2415
|
job_selector.job_name,
|
|
2416
|
+
timeout=int(os.getenv("DAGSTER_CLOUD_EXTERNAL_JOB_GRPC_TIMEOUT", "180")),
|
|
2073
2417
|
)
|
|
2074
2418
|
if not response.serialized_job_data:
|
|
2075
2419
|
error = (
|
|
@@ -2098,3 +2442,39 @@ class DagsterCloudUserCodeLauncher(
|
|
|
2098
2442
|
f" {job_selector.job_name}@{job_selector.repository_name} ({os.path.getsize(dst)} bytes)"
|
|
2099
2443
|
)
|
|
2100
2444
|
return response
|
|
2445
|
+
|
|
2446
|
+
@property
|
|
2447
|
+
def opentelemetry(self) -> Optional[OpenTelemetryController]:
|
|
2448
|
+
if not self.has_instance:
|
|
2449
|
+
return None
|
|
2450
|
+
else:
|
|
2451
|
+
return self._instance.opentelemetry
|
|
2452
|
+
|
|
2453
|
+
def upload_job_snap_direct(
|
|
2454
|
+
self,
|
|
2455
|
+
deployment_name: str,
|
|
2456
|
+
job_selector: JobSelector,
|
|
2457
|
+
server: DagsterCloudGrpcServer,
|
|
2458
|
+
):
|
|
2459
|
+
client = server.server_endpoint.create_client()
|
|
2460
|
+
location_origin = self._get_code_location_origin(job_selector.location_name)
|
|
2461
|
+
response = client.external_job(
|
|
2462
|
+
RemoteRepositoryOrigin(location_origin, job_selector.repository_name),
|
|
2463
|
+
job_selector.job_name,
|
|
2464
|
+
)
|
|
2465
|
+
if not response.serialized_job_data:
|
|
2466
|
+
error = (
|
|
2467
|
+
deserialize_value(response.serialized_error, SerializableErrorInfo)
|
|
2468
|
+
if response.serialized_error
|
|
2469
|
+
else "no captured error"
|
|
2470
|
+
)
|
|
2471
|
+
raise Exception(f"Error fetching job data in code server:\n{error}")
|
|
2472
|
+
|
|
2473
|
+
job_snapshot = extract_serialized_job_snap_from_serialized_job_data_snap(
|
|
2474
|
+
response.serialized_job_data
|
|
2475
|
+
)
|
|
2476
|
+
return self._ensure_snapshot_uploaded(
|
|
2477
|
+
deployment_name,
|
|
2478
|
+
SnapshotType.JOB,
|
|
2479
|
+
job_snapshot,
|
|
2480
|
+
)
|