dagster-cloud 1.8.2__py3-none-any.whl → 1.12.6__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those package versions as they appear in the registry.
- dagster_cloud/__init__.py +3 -3
- dagster_cloud/agent/__init__.py +4 -4
- dagster_cloud/agent/cli/__init__.py +56 -17
- dagster_cloud/agent/dagster_cloud_agent.py +360 -172
- dagster_cloud/agent/instrumentation/__init__.py +0 -0
- dagster_cloud/agent/instrumentation/constants.py +2 -0
- dagster_cloud/agent/instrumentation/run_launch.py +23 -0
- dagster_cloud/agent/instrumentation/schedule.py +34 -0
- dagster_cloud/agent/instrumentation/sensor.py +34 -0
- dagster_cloud/anomaly_detection/__init__.py +2 -2
- dagster_cloud/anomaly_detection/defs.py +17 -12
- dagster_cloud/anomaly_detection/types.py +3 -3
- dagster_cloud/api/dagster_cloud_api.py +209 -293
- dagster_cloud/auth/constants.py +21 -5
- dagster_cloud/batching/__init__.py +1 -0
- dagster_cloud/batching/batcher.py +210 -0
- dagster_cloud/dagster_insights/__init__.py +12 -6
- dagster_cloud/dagster_insights/bigquery/bigquery_utils.py +3 -2
- dagster_cloud/dagster_insights/bigquery/dbt_wrapper.py +39 -12
- dagster_cloud/dagster_insights/bigquery/insights_bigquery_resource.py +8 -6
- dagster_cloud/dagster_insights/insights_utils.py +18 -8
- dagster_cloud/dagster_insights/metrics_utils.py +12 -12
- dagster_cloud/dagster_insights/snowflake/dagster_snowflake_insights.py +5 -12
- dagster_cloud/dagster_insights/snowflake/dbt_wrapper.py +34 -8
- dagster_cloud/dagster_insights/snowflake/definitions.py +38 -12
- dagster_cloud/dagster_insights/snowflake/insights_snowflake_resource.py +11 -23
- dagster_cloud/definitions/__init__.py +0 -0
- dagster_cloud/definitions/job_selection.py +36 -0
- dagster_cloud/execution/cloud_run_launcher/k8s.py +1 -1
- dagster_cloud/execution/cloud_run_launcher/process.py +3 -3
- dagster_cloud/execution/monitoring/__init__.py +27 -33
- dagster_cloud/execution/utils/process.py +3 -3
- dagster_cloud/instance/__init__.py +125 -38
- dagster_cloud/instrumentation/__init__.py +32 -0
- dagster_cloud/metadata/source_code.py +13 -8
- dagster_cloud/metrics/__init__.py +0 -0
- dagster_cloud/metrics/tracer.py +59 -0
- dagster_cloud/opentelemetry/__init__.py +0 -0
- dagster_cloud/opentelemetry/config/__init__.py +73 -0
- dagster_cloud/opentelemetry/config/exporter.py +81 -0
- dagster_cloud/opentelemetry/config/log_record_processor.py +40 -0
- dagster_cloud/opentelemetry/config/logging_handler.py +14 -0
- dagster_cloud/opentelemetry/config/meter_provider.py +9 -0
- dagster_cloud/opentelemetry/config/metric_reader.py +39 -0
- dagster_cloud/opentelemetry/controller.py +319 -0
- dagster_cloud/opentelemetry/enum.py +58 -0
- dagster_cloud/opentelemetry/factories/__init__.py +1 -0
- dagster_cloud/opentelemetry/factories/logs.py +113 -0
- dagster_cloud/opentelemetry/factories/metrics.py +121 -0
- dagster_cloud/opentelemetry/metrics/__init__.py +0 -0
- dagster_cloud/opentelemetry/metrics/meter.py +140 -0
- dagster_cloud/opentelemetry/observers/__init__.py +0 -0
- dagster_cloud/opentelemetry/observers/dagster_exception_handler.py +40 -0
- dagster_cloud/opentelemetry/observers/execution_observer.py +178 -0
- dagster_cloud/pex/grpc/__generated__/multi_pex_api_pb2.pyi +175 -0
- dagster_cloud/pex/grpc/__init__.py +2 -2
- dagster_cloud/pex/grpc/client.py +4 -4
- dagster_cloud/pex/grpc/compile.py +2 -2
- dagster_cloud/pex/grpc/server/__init__.py +2 -2
- dagster_cloud/pex/grpc/server/cli/__init__.py +31 -19
- dagster_cloud/pex/grpc/server/manager.py +60 -42
- dagster_cloud/pex/grpc/server/registry.py +28 -21
- dagster_cloud/pex/grpc/server/server.py +23 -14
- dagster_cloud/pex/grpc/types.py +5 -5
- dagster_cloud/py.typed +0 -0
- dagster_cloud/secrets/__init__.py +1 -1
- dagster_cloud/secrets/loader.py +3 -3
- dagster_cloud/serverless/__init__.py +1 -1
- dagster_cloud/serverless/io_manager.py +36 -53
- dagster_cloud/storage/client.py +54 -17
- dagster_cloud/storage/compute_logs/__init__.py +3 -1
- dagster_cloud/storage/compute_logs/compute_log_manager.py +22 -17
- dagster_cloud/storage/defs_state/__init__.py +3 -0
- dagster_cloud/storage/defs_state/queries.py +15 -0
- dagster_cloud/storage/defs_state/storage.py +113 -0
- dagster_cloud/storage/event_logs/__init__.py +3 -1
- dagster_cloud/storage/event_logs/queries.py +102 -4
- dagster_cloud/storage/event_logs/storage.py +266 -73
- dagster_cloud/storage/event_logs/utils.py +88 -7
- dagster_cloud/storage/runs/__init__.py +1 -1
- dagster_cloud/storage/runs/queries.py +17 -2
- dagster_cloud/storage/runs/storage.py +88 -42
- dagster_cloud/storage/schedules/__init__.py +1 -1
- dagster_cloud/storage/schedules/storage.py +6 -8
- dagster_cloud/storage/tags.py +66 -1
- dagster_cloud/util/__init__.py +10 -12
- dagster_cloud/util/errors.py +49 -64
- dagster_cloud/version.py +1 -1
- dagster_cloud/workspace/config_schema/__init__.py +55 -13
- dagster_cloud/workspace/docker/__init__.py +76 -25
- dagster_cloud/workspace/docker/utils.py +1 -1
- dagster_cloud/workspace/ecs/__init__.py +1 -1
- dagster_cloud/workspace/ecs/client.py +51 -33
- dagster_cloud/workspace/ecs/launcher.py +76 -22
- dagster_cloud/workspace/ecs/run_launcher.py +3 -3
- dagster_cloud/workspace/ecs/utils.py +14 -5
- dagster_cloud/workspace/kubernetes/__init__.py +1 -1
- dagster_cloud/workspace/kubernetes/launcher.py +61 -29
- dagster_cloud/workspace/kubernetes/utils.py +34 -22
- dagster_cloud/workspace/user_code_launcher/__init__.py +5 -3
- dagster_cloud/workspace/user_code_launcher/process.py +16 -14
- dagster_cloud/workspace/user_code_launcher/user_code_launcher.py +552 -172
- dagster_cloud/workspace/user_code_launcher/utils.py +105 -1
- {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/METADATA +48 -42
- dagster_cloud-1.12.6.dist-info/RECORD +134 -0
- {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/WHEEL +1 -1
- dagster_cloud-1.8.2.dist-info/RECORD +0 -100
- {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/top_level.txt +0 -0
dagster_cloud/agent/dagster_cloud_agent.py

@@ -2,33 +2,49 @@ import logging
 import os
 import sys
 import time
-from collections import deque
+from collections import defaultdict, deque
+from collections.abc import Iterator
 from concurrent.futures import Future, ThreadPoolExecutor
 from contextlib import ExitStack
 from pathlib import Path
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, Optional, Union, cast

 import dagster._check as check
 from dagster import DagsterInstance
 from dagster._core.launcher.base import LaunchRunContext
-from dagster._core.
-from dagster._core.remote_representation.origin import RegisteredCodeLocationOrigin
+from dagster._core.remote_origin import CodeLocationOrigin, RegisteredCodeLocationOrigin
 from dagster._core.utils import FuturesAwareThreadPoolExecutor
 from dagster._grpc.client import DagsterGrpcClient
 from dagster._grpc.types import CancelExecutionRequest
 from dagster._serdes import deserialize_value, serialize_value
 from dagster._time import get_current_datetime, get_current_timestamp
+from dagster._utils.cached_method import cached_method
 from dagster._utils.container import retrieve_containerized_utilization_metrics
-from dagster._utils.error import
+from dagster._utils.error import (
+    SerializableErrorInfo,
+    serializable_error_info_from_exc_info,
+    truncate_serialized_error,
+)
 from dagster._utils.interrupts import raise_interrupts_as
 from dagster._utils.merger import merge_dicts
 from dagster._utils.typed_dict import init_optional_typeddict
 from dagster_cloud_cli.core.errors import DagsterCloudHTTPError, raise_http_error
-from dagster_cloud_cli.core.workspace import CodeLocationDeployData
+from dagster_cloud_cli.core.workspace import CodeLocationDeployData

+from dagster_cloud.agent.instrumentation.constants import DAGSTER_CLOUD_AGENT_METRIC_PREFIX
+from dagster_cloud.agent.instrumentation.run_launch import extract_run_attributes
+from dagster_cloud.agent.instrumentation.schedule import inspect_schedule_result
+from dagster_cloud.agent.instrumentation.sensor import inspect_sensor_result
+from dagster_cloud.agent.queries import (
+    ADD_AGENT_HEARTBEATS_MUTATION,
+    DEPLOYMENTS_QUERY,
+    GET_USER_CLOUD_REQUESTS_QUERY,
+    WORKSPACE_ENTRIES_QUERY,
+)
 from dagster_cloud.api.dagster_cloud_api import (
     AgentHeartbeat,
     AgentUtilizationMetrics,
+    BatchDagsterCloudUploadApiResponse,
     DagsterCloudApi,
     DagsterCloudApiErrorResponse,
     DagsterCloudApiGrpcResponse,
@@ -40,21 +56,16 @@ from dagster_cloud.api.dagster_cloud_api import (
     DagsterCloudUploadApiResponse,
     TimestampedError,
 )
+from dagster_cloud.batching import Batcher
 from dagster_cloud.instance import DagsterCloudAgentInstance
-from dagster_cloud.
+from dagster_cloud.opentelemetry.observers.execution_observer import observe_execution
+from dagster_cloud.util import SERVER_HANDLE_TAG, compressed_namedtuple_upload_file, is_isolated_run
+from dagster_cloud.version import __version__
 from dagster_cloud.workspace.user_code_launcher import (
     DagsterCloudUserCodeLauncher,
     UserCodeLauncherEntry,
 )
-
-from ..util import SERVER_HANDLE_TAG, compressed_namedtuple_upload_file, is_isolated_run
-from ..version import __version__
-from .queries import (
-    ADD_AGENT_HEARTBEATS_MUTATION,
-    DEPLOYMENTS_QUERY,
-    GET_USER_CLOUD_REQUESTS_QUERY,
-    WORKSPACE_ENTRIES_QUERY,
-)
+from dagster_cloud.workspace.user_code_launcher.utils import get_instance_ref_for_user_code

 if TYPE_CHECKING:
     import datetime
@@ -73,6 +84,11 @@ DEFAULT_PENDING_REQUESTS_LIMIT = 100

 SLEEP_INTERVAL_SECONDS = float(os.getenv("DAGSTER_CLOUD_AGENT_SLEEP_INTERVAL_SECONDS", "0.5"))

+
+def UPLOAD_API_RESPONSE_BATCHING_ENABLED():
+    return os.getenv("DAGSTER_CLOUD_AGENT_UPLOAD_API_RESPONSE_BATCHING_ENABLED") == "true"
+
+
 DEPLOYMENT_INFO_QUERY = """
 query DeploymentInfo {
     deploymentInfo {
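The new module-level toggle gates the batched upload path on an environment variable that is re-read on every call, so it can be flipped through the agent's container configuration without a code change. A minimal sketch of the same check (illustrative only; anything other than the literal string "true" leaves batching disabled):

```python
import os

# Illustrative: opt the agent into batched API-response uploads via its environment.
os.environ["DAGSTER_CLOUD_AGENT_UPLOAD_API_RESPONSE_BATCHING_ENABLED"] = "true"


def upload_api_response_batching_enabled() -> bool:
    # Mirrors UPLOAD_API_RESPONSE_BATCHING_ENABLED() from the diff: read at call time,
    # and only the exact string "true" enables the batched path.
    return os.getenv("DAGSTER_CLOUD_AGENT_UPLOAD_API_RESPONSE_BATCHING_ENABLED") == "true"


assert upload_api_response_batching_enabled()
```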
@@ -96,10 +112,22 @@ LIVENESS_CHECK_INTERVAL_SECONDS = float(


 class DagsterCloudAgent:
-    def __init__(
+    def __init__(
+        self,
+        instance: DagsterCloudAgentInstance,
+        pending_requests_limit: int = DEFAULT_PENDING_REQUESTS_LIMIT,
+    ):
         self._logger = logging.getLogger("dagster_cloud.agent")
+        self._instance: DagsterCloudAgentInstance = instance

-        self.
+        self._batcher: defaultdict[
+            str, Batcher[tuple[str, DagsterCloudUploadApiResponse], None]
+        ] = defaultdict(self._batcher_factory)
+
+        if self._logger.isEnabledFor(logging.DEBUG):
+            self._logger.info("Starting Dagster Cloud agent with debug logging...")
+        else:
+            self._logger.info("Starting Dagster Cloud agent...")

         self._exit_stack = ExitStack()
         self._iteration = 0
@@ -110,7 +138,7 @@ class DagsterCloudAgent:
                 thread_name_prefix="dagster_cloud_agent_worker",
             )
         )
-        self._request_ids_to_futures:
+        self._request_ids_to_futures: dict[str, Future] = {}
         self._utilization_metrics = init_optional_typeddict(AgentUtilizationMetrics)

         self._last_heartbeat_time: Optional[datetime.datetime] = None
@@ -121,26 +149,38 @@ class DagsterCloudAgent:
             maxlen=AGENT_HEARTBEAT_ERROR_LIMIT
         )  # (SerializableErrorInfo, timestamp) tuples

-        self._pending_requests:
-        self._locations_with_pending_requests:
-        self._ready_requests:
+        self._pending_requests: list[dict[str, Any]] = []
+        self._locations_with_pending_requests: set[tuple[str, str, bool]] = set()
+        self._ready_requests: list[dict[str, Any]] = []

-        self._location_query_times:
+        self._location_query_times: dict[tuple[str, str, bool], float] = {}
         self._pending_requests_limit = check.int_param(
             pending_requests_limit, "pending_requests_limit"
         )
-        self._active_deployments:
+        self._active_deployments: set[tuple[str, bool]] = (  # deployment_name, is_branch_deployment
             set()
         )

         self._last_liveness_check_time = None

+        self._warned_about_long_in_progress_reconcile = False
+
     def __enter__(self):
         return self

     def __exit__(self, _exception_type, _exception_value, _traceback):
         self._exit_stack.close()

+    def _batcher_factory(
+        self,
+    ) -> Batcher[tuple[str, DagsterCloudUploadApiResponse], None]:
+        return Batcher(
+            "upload_api_response",
+            self._batch_upload_api_response,
+            max_wait_ms=50,
+            max_batch_size=32,
+        )
+
     @property
     def _active_deployment_names(self):
         return [deployment[0] for deployment in self._active_deployments]
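The constructor now keeps one `Batcher` per deployment in a `defaultdict`, each created by `_batcher_factory` with a 50 ms maximum wait and a maximum batch size of 32. The real `Batcher` lives in the new `dagster_cloud/batching/batcher.py` and its exact API is not shown in this diff, so the sketch below uses a simplified stand-in only to illustrate the per-deployment batching pattern:

```python
from collections import defaultdict
from typing import Callable, Generic, TypeVar

T = TypeVar("T")


class ToyBatcher(Generic[T]):
    """Illustrative stand-in: collects items and flushes them through a callback once
    max_batch_size is reached (the real Batcher also flushes on a timer)."""

    def __init__(self, name: str, flush_fn: Callable[[list[T]], list[None]], max_batch_size: int = 32):
        self._name = name
        self._flush_fn = flush_fn
        self._max_batch_size = max_batch_size
        self._pending: list[T] = []

    def submit(self, item: T) -> None:
        self._pending.append(item)
        if len(self._pending) >= self._max_batch_size:
            self.flush()

    def flush(self) -> None:
        if self._pending:
            batch, self._pending = self._pending, []
            self._flush_fn(batch)


def flush_for_deployment(batch: list[tuple[str, str]]) -> list[None]:
    # In the agent, this is where a single request uploads the whole batch.
    print(f"uploading {len(batch)} responses for {batch[0][0]}")
    return [None] * len(batch)


# One batcher per deployment, mirroring defaultdict(self._batcher_factory).
batchers: defaultdict[str, ToyBatcher[tuple[str, str]]] = defaultdict(
    lambda: ToyBatcher("upload_api_response", flush_for_deployment, max_batch_size=3)
)

for i in range(4):
    batchers["prod"].submit(("prod", f"response-{i}"))
batchers["prod"].flush()
```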
@@ -149,14 +189,15 @@ class DagsterCloudAgent:
     def _active_full_deployment_names(self):
         return [deployment[0] for deployment in self._active_deployments if not deployment[1]]

-    def _check_initial_deployment_names(self
-        if
-            result =
-                DEPLOYMENTS_QUERY,
+    def _check_initial_deployment_names(self):
+        if self._instance.deployment_names:
+            result = self._instance.organization_scoped_graphql_client().execute(
+                DEPLOYMENTS_QUERY,
+                variable_values={"deploymentNames": self._instance.deployment_names},
             )
             deployments = result["data"]["deployments"]
             existing_deployment_names = {deployment["deploymentName"] for deployment in deployments}
-            requested_deployment_names = set(
+            requested_deployment_names = set(self._instance.deployment_names)
             missing_deployment_names = requested_deployment_names.difference(
                 existing_deployment_names
             )
@@ -171,13 +212,40 @@ class DagsterCloudAgent:
     def _update_agent_resource_limits(
         self, user_code_launcher: DagsterCloudUserCodeLauncher
     ) -> None:
-        #
-
-        #
-
+        # The agent should have environment variables defining its resource requests and limits.
+        # However, the agent may be running in a container with resource limits that are different
+        # For example, on k8s there are ways to effect change on the cpu limit, like mutating admission webhooks.
+        # Since the effective cgroup limits precede actual hosts resources when it comes to actual behavior for
+        # throttling and oom kills, we attempt to obtain these and fallback on the environment variables.
+        container_utilization_metrics = self._utilization_metrics.get("container_utilization", {})
+        memory_limit = container_utilization_metrics.get("memory_limit")
+        if not memory_limit:
+            memory_limit = os.getenv("DAGSTER_CLOUD_AGENT_MEMORY_LIMIT")
+            self._logger.info(
+                "Cannot obtain cgroup memory limit, using environment value: "
+                f"DAGSTER_CLOUD_AGENT_MEMORY_LIMIT={memory_limit}"
+            )
+
+        cpu_cfs_period_us = container_utilization_metrics.get("cpu_cfs_period_us")
+        cpu_cfs_quota_us = container_utilization_metrics.get("cpu_cfs_quota_us")
+
+        cpu_limit = None
+        if cpu_cfs_quota_us and cpu_cfs_period_us:
+            cpu_limit = (
+                1000.0 * cpu_cfs_quota_us
+            ) / cpu_cfs_period_us  # cpu_limit expressed in milliseconds of cpu
+
+        if not cpu_limit:
+            cpu_limit = os.getenv("DAGSTER_CLOUD_AGENT_CPU_LIMIT")
+            self._logger.info(
+                "Cannot obtain CPU CFS values, using environment value: "
+                f"DAGSTER_CLOUD_AGENT_CPU_LIMIT={cpu_limit}"
+            )
+
         if not user_code_launcher.user_code_deployment_type.supports_utilization_metrics:
             self._logger.info(
-                f"Cannot interpret resource limits for agent type {user_code_launcher.user_code_deployment_type.value}.
+                f"Cannot interpret resource limits for agent type {user_code_launcher.user_code_deployment_type.value}."
+                "Skipping utilization metrics retrieval."
             )
             return

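`_update_agent_resource_limits` now prefers the container's cgroup values (memory limit and CPU CFS quota/period) over the `DAGSTER_CLOUD_AGENT_*` environment variables, since the cgroup is what actually governs throttling and OOM kills. A small sketch of the same fallback logic with hypothetical cgroup numbers:

```python
import os

# Hypothetical values as they might come back from the container's cgroup.
container_utilization = {"cpu_cfs_quota_us": 200_000, "cpu_cfs_period_us": 100_000, "memory_limit": None}

memory_limit = container_utilization.get("memory_limit")
if not memory_limit:
    # Fall back to the env var when the cgroup value is unavailable.
    memory_limit = os.getenv("DAGSTER_CLOUD_AGENT_MEMORY_LIMIT")

cpu_limit = None
quota = container_utilization.get("cpu_cfs_quota_us")
period = container_utilization.get("cpu_cfs_period_us")
if quota and period:
    # Same formula as the diff: milliseconds of CPU per second, so 2 full CPUs -> 2000.0.
    cpu_limit = (1000.0 * quota) / period

if not cpu_limit:
    cpu_limit = os.getenv("DAGSTER_CLOUD_AGENT_CPU_LIMIT")

print(cpu_limit, memory_limit)  # 2000.0 plus whatever the environment provides
```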
@@ -185,6 +253,7 @@ class DagsterCloudAgent:
             "cpu_limit": cpu_limit,
             "memory_limit": memory_limit,
         }
+
         cpu_request = os.getenv("DAGSTER_CLOUD_AGENT_CPU_REQUEST")
         memory_request = os.getenv("DAGSTER_CLOUD_AGENT_MEMORY_REQUEST")
         if cpu_request:
@@ -192,7 +261,6 @@ class DagsterCloudAgent:
         if memory_request:
             limits["memory_request"] = memory_request

-        # At this point, the only agent types possible are serverless, ecs, and k8s, all of which are supported. The linter isn't smart enough to realize this, so we disable it.
         self._utilization_metrics["resource_limits"][
             user_code_launcher.user_code_deployment_type.value
         ] = limits  # type: ignore
@@ -216,16 +284,15 @@ class DagsterCloudAgent:

     def run_loop(
         self,
-        instance: DagsterCloudAgentInstance,
         user_code_launcher,
         agent_uuid,
     ):
         heartbeat_interval_seconds = AGENT_HEARTBEAT_INTERVAL_SECONDS

         if (
-            not
-            and not
-            and not
+            not self._instance.includes_branch_deployments
+            and not self._instance.deployment_names
+            and not self._instance.include_all_serverless_deployments
         ):
             self._logger.info(
                 "Deployment name was not set - checking to see if it can be fetched from the"
@@ -233,20 +300,34 @@ class DagsterCloudAgent:
             )
             # Fetch the deployment name from the server if it isn't set (only true
             # for old agents, and only will work if there's a single deployment in the org)
-            result =
+            result = self._instance.graphql_client.execute(DEPLOYMENT_INFO_QUERY)
             deployment_name = result["data"]["deploymentInfo"]["deploymentName"]
-
-                DagsterInstance.from_ref(
+            self._instance = self._exit_stack.enter_context(
+                DagsterInstance.from_ref(self._instance.ref_for_deployment(deployment_name))  # type: ignore # (instance subclass)
             )

-        self._check_initial_deployment_names(
+        self._check_initial_deployment_names()
+
+        serving = []
+        queues = list(filter(None, self._instance.agent_queues_config.queues))
+        if queues:
+            serving.append(f"queues{queues}")
+        if self._instance.deployment_names:
+            serving.append(f"deployments{self._instance.deployment_names}")
+        if self._instance.include_all_serverless_deployments:
+            serving.append("all serverless deployments")
+        if self._instance.includes_branch_deployments:
+            serving.append("branch deployments")
+
+        self._logger.info(f"Agent is serving: {', '.join(serving)}")

         self._check_update_workspace(
-
+            user_code_launcher,
+            upload_all=user_code_launcher.upload_snapshots_on_startup,
         )

         self._logger.info(
-            f"Will start polling for requests from {
+            f"Will start polling for requests from {self._instance.dagster_cloud_url} once user code has"
             " been loaded."
         )

@@ -254,7 +335,7 @@ class DagsterCloudAgent:

         while True:
             try:
-                for error in self.run_iteration(
+                for error in self.run_iteration(user_code_launcher):
                     if error:
                         self._logger.error(str(error))
                         self._errors.appendleft(
@@ -283,16 +364,18 @@ class DagsterCloudAgent:

         if user_code_launcher.ready_to_serve_requests:
             try:
-                self._check_add_heartbeat(
+                self._check_add_heartbeat(agent_uuid, heartbeat_interval_seconds)
             except Exception:
                 self._logger.exception("Failed to add heartbeat")

+        self._check_for_long_running_reconcile(user_code_launcher)
+
         # Check for any received interrupts
         with raise_interrupts_as(KeyboardInterrupt):
             pass

         try:
-            self._check_update_workspace(
+            self._check_update_workspace(user_code_launcher, upload_all=False)

         except Exception:
             self._logger.error(
@@ -329,7 +412,27 @@ class DagsterCloudAgent:
                 self._logger.error(f"Failed to write liveness sentinel and disabling it: {e}")
                 self._last_liveness_check_time = False

-    def
+    def _check_for_long_running_reconcile(self, user_code_launcher):
+        """Detect from the main thread if the background reconcile thread is running behind or has gotten stuck."""
+        in_progress_reconcile_start_time = user_code_launcher.in_progress_reconcile_start_time
+
+        reconcile_start_time_warning = int(
+            os.getenv("DAGSTER_CLOUD_AGENT_RECONCILE_START_TIME_WARNING", "3600")
+        )
+
+        if (
+            in_progress_reconcile_start_time is not None
+            and (time.time() - in_progress_reconcile_start_time) >= reconcile_start_time_warning
+        ):
+            if not self._warned_about_long_in_progress_reconcile:
+                self._logger.warning(
+                    f"Agent has been redeploying code servers for more than {reconcile_start_time_warning} seconds. This may indicate the background thread that performs the redeploys is stuck."
+                )
+                self._warned_about_long_in_progress_reconcile = True
+        else:
+            self._warned_about_long_in_progress_reconcile = False
+
+    def _check_update_workspace(self, user_code_launcher, upload_all):
         curr_time = get_current_datetime()

         if (
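The new `_check_for_long_running_reconcile` runs on the main loop and warns once when the background reconcile thread has been redeploying code servers for longer than `DAGSTER_CLOUD_AGENT_RECONCILE_START_TIME_WARNING` seconds (default 3600), resetting the flag when the reconcile completes. An illustrative standalone version of that warn-once pattern:

```python
import os
import time

WARN_AFTER_SECONDS = int(os.getenv("DAGSTER_CLOUD_AGENT_RECONCILE_START_TIME_WARNING", "3600"))

warned = False


def check_for_long_running_reconcile(in_progress_reconcile_start_time, warn=print):
    """Sketch of the main-thread watchdog: warn once if the background reconcile has
    been in progress longer than the threshold, and reset the flag once it finishes."""
    global warned
    if (
        in_progress_reconcile_start_time is not None
        and (time.time() - in_progress_reconcile_start_time) >= WARN_AFTER_SECONDS
    ):
        if not warned:
            warn(f"Agent has been redeploying code servers for more than {WARN_AFTER_SECONDS} seconds.")
            warned = True
    else:
        warned = False


started = time.time() - 7200      # a reconcile that began two hours ago
check_for_long_running_reconcile(started)   # warns
check_for_long_running_reconcile(started)   # silent: already warned
check_for_long_running_reconcile(None)      # finished: flag resets
```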
@@ -340,11 +443,10 @@ class DagsterCloudAgent:
             return

         self._last_workspace_check_time = curr_time
-        self._query_for_workspace_updates(
+        self._query_for_workspace_updates(user_code_launcher, upload_all=upload_all)

     def _check_add_heartbeat(
         self,
-        instance: DagsterCloudAgentInstance,
         agent_uuid,
         heartbeat_interval_seconds,
     ):
@@ -357,19 +459,22 @@ class DagsterCloudAgent:
             return

         errors = [
-            TimestampedError(
+            TimestampedError(
+                timestamp=timestamp.timestamp(),
+                error=error,
+            )
             for (error, timestamp) in self._errors
             if timestamp.timestamp() > curr_time.timestamp() - 60 * 60 * 24
         ]

-        run_worker_statuses_dict =
+        run_worker_statuses_dict = self._instance.user_code_launcher.get_cloud_run_worker_statuses(
             self._active_deployment_names
         )

-        code_server_heartbeats_dict =
+        code_server_heartbeats_dict = self._instance.user_code_launcher.get_grpc_server_heartbeats()

         agent_image_tag = os.getenv("DAGSTER_CLOUD_AGENT_IMAGE_TAG")
-        if
+        if self._instance.user_code_launcher.agent_metrics_enabled:
             num_running_requests = self._utilization_metrics["request_utilization"][
                 "num_running_requests"
             ]
@@ -386,10 +491,10 @@ class DagsterCloudAgent:
             deployment_name: AgentHeartbeat(
                 timestamp=curr_time.timestamp(),
                 agent_id=agent_uuid,
-                agent_label=
+                agent_label=self._instance.dagster_cloud_api_agent_label,
                 agent_type=(
-                    type(
-                    if
+                    type(self._instance.user_code_launcher).__name__
+                    if self._instance.user_code_launcher
                     else None
                 ),
                 metadata=merge_dicts(
@@ -397,13 +502,13 @@ class DagsterCloudAgent:
                     {"image_tag": agent_image_tag} if agent_image_tag else {},
                     {
                         "utilization_metrics": self._utilization_metrics
-                        if
+                        if self._instance.user_code_launcher.agent_metrics_enabled
                         else {}
                     },
                 ),
                 run_worker_statuses=run_worker_statuses_dict[deployment_name],
                 code_server_heartbeats=code_server_heartbeats_dict.get(deployment_name, []),
-                agent_queues_config=
+                agent_queues_config=self._instance.agent_queues_config,
             )
             for deployment_name in self._active_deployment_names
         }
@@ -418,7 +523,7 @@ class DagsterCloudAgent:

         serialized_errors = [serialize_value(error) for error in errors]
         try:
-
+            self._instance.organization_scoped_graphql_client().execute(
                 ADD_AGENT_HEARTBEATS_MUTATION,
                 variable_values={
                     "serializedAgentHeartbeats": serialized_agent_heartbeats,
@@ -444,15 +549,15 @@ class DagsterCloudAgent:
                 for deployment_name, heartbeat in heartbeats.items()
             ]

-
+            self._instance.organization_scoped_graphql_client().execute(
                 ADD_AGENT_HEARTBEATS_MUTATION,
                 variable_values={
                     "serializedAgentHeartbeats": serialized_agent_heartbeats,
                     "serializedErrors": [
                         serialize_value(
                             TimestampedError(
-                                curr_time.timestamp(),
-                                SerializableErrorInfo(
+                                timestamp=curr_time.timestamp(),
+                                error=SerializableErrorInfo(
                                     error_message,
                                     stack=[],
                                     cls_name=None,
@@ -471,26 +576,24 @@ class DagsterCloudAgent:
         return self._executor

     @property
-    def request_ids_to_futures(self) ->
+    def request_ids_to_futures(self) -> dict[str, Future]:
         return self._request_ids_to_futures

     def _upload_outdated_workspace_entries(
         self,
-        instance: DagsterCloudAgentInstance,
         deployment_name: str,
         is_branch_deployment: bool,
         user_code_launcher: DagsterCloudUserCodeLauncher,
     ):
-        result =
+        result = self._instance.graphql_client_for_deployment(deployment_name).execute(
             WORKSPACE_ENTRIES_QUERY,
             variable_values={
                 "deploymentNames": [deployment_name],
                 "includeAllServerlessDeployments": False,
-                "agentQueues":
+                "agentQueues": self._instance.agent_queues_config.queues,
             },
         )
         entries = result["data"]["deployments"][0]["workspaceEntries"]
-        now = time.time()

         upload_metadata = {}

@@ -501,10 +604,11 @@ class DagsterCloudAgent:
             )
             if entry["hasOutdatedData"]:
                 # Spin up a server for this location and upload its metadata to Cloud
-                # (Bump the TTL counter as well to leave the server up
+                # (Bump the TTL counter as well to leave the server up - ensure that a slighty
+                # different timestamp is chosen for each location to break ties)
                 self._location_query_times[
                     (deployment_name, location_name, is_branch_deployment)
-                ] =
+                ] = time.time()
                 upload_metadata[(deployment_name, location_name)] = UserCodeLauncherEntry(
                     code_location_deploy_data=code_location_deploy_data,
                     update_timestamp=float(entry["metadataTimestamp"]),
@@ -516,16 +620,14 @@ class DagsterCloudAgent:
         # branch deployments always have TTLs, other deployments only if you asked for it specifically
         return is_branch_deployment or user_code_launcher.server_ttl_enabled_for_full_deployments

-    def _get_ttl_seconds(self,
+    def _get_ttl_seconds(self, is_branch_deployment):
         return (
-
+            self._instance.user_code_launcher.branch_deployment_ttl_seconds
             if is_branch_deployment
-            else
+            else self._instance.user_code_launcher.full_deployment_ttl_seconds
         )

-    def _get_locations_with_ttl_to_query(
-        self, instance, user_code_launcher
-    ) -> List[Tuple[str, str]]:
+    def _get_locations_with_ttl_to_query(self, user_code_launcher) -> list[tuple[str, str]]:
         now = time.time()

         # For the deployments with TTLs, decide which locations to consider
@@ -533,13 +635,13 @@ class DagsterCloudAgent:
         # - a) There's a pending request in the queue for it
         # - b) It's TTL hasn't expired since the last time somebody asked for it
         # Always include locations in a), and add locations from b) until you hit a limit
-        location_candidates:
+        location_candidates: set[tuple[str, str, float]] = {
             (deployment, location, -1.0)  # Score below 0 so that they're at the front of the list
             for deployment, location, is_branch_deployment in self._locations_with_pending_requests
             if self._has_ttl(user_code_launcher, is_branch_deployment)
         }

-        num_locations_to_query =
+        num_locations_to_query = self._instance.user_code_launcher.server_ttl_max_servers

         if len(location_candidates) > num_locations_to_query:
             self._logger.warning(
@@ -561,7 +663,7 @@ class DagsterCloudAgent:

             time_since_last_query = now - query_time

-            if time_since_last_query >= self._get_ttl_seconds(
+            if time_since_last_query >= self._get_ttl_seconds(is_branch_deployment):
                 continue

             location_candidates.add((deployment_name, location, time_since_last_query))
@@ -583,13 +685,10 @@ class DagsterCloudAgent:

     def _query_for_workspace_updates(
         self,
-        instance: DagsterCloudAgentInstance,
         user_code_launcher: DagsterCloudUserCodeLauncher,
         upload_all: bool,
     ):
-        locations_with_ttl_to_query = self._get_locations_with_ttl_to_query(
-            instance, user_code_launcher
-        )
+        locations_with_ttl_to_query = self._get_locations_with_ttl_to_query(user_code_launcher)

         deployments_to_query = {key[0] for key in locations_with_ttl_to_query}

@@ -600,23 +699,23 @@ class DagsterCloudAgent:
             self._logger.debug(f"Querying for the following locations with TTL: {locations_str}")

         # If you have specified a non-branch deployment and no TTL, always consider it
-        if
-            deployments_to_query = deployments_to_query.union(set(
+        if self._instance.deployment_names:
+            deployments_to_query = deployments_to_query.union(set(self._instance.deployment_names))

         # Create mapping of
         # - location name => deployment metadata
-        deployment_map:
-        all_locations:
+        deployment_map: dict[tuple[str, str], UserCodeLauncherEntry] = {}
+        all_locations: set[tuple[str, str]] = set()

         self._active_deployments = set()

-        if deployments_to_query or
-            result =
+        if deployments_to_query or self._instance.include_all_serverless_deployments:
+            result = self._instance.organization_scoped_graphql_client().execute(
                 WORKSPACE_ENTRIES_QUERY,
                 variable_values={
                     "deploymentNames": list(deployments_to_query),
-                    "includeAllServerlessDeployments":
-                    "agentQueues":
+                    "includeAllServerlessDeployments": self._instance.include_all_serverless_deployments,
+                    "agentQueues": self._instance.agent_queues_config.queues,
                 },
             )

@@ -645,7 +744,7 @@ class DagsterCloudAgent:
                     # only include the locations within locations_with_ttl_to_query.
                     if not self._has_ttl(
                         user_code_launcher, is_branch_deployment
-                    ) or location_key in cast(
+                    ) or location_key in cast("set[tuple[str, str]]", locations_with_ttl_to_query):
                         deployment_map[location_key] = UserCodeLauncherEntry(
                             code_location_deploy_data=code_location_deploy_data,
                             update_timestamp=float(entry["metadataTimestamp"]),
@@ -718,17 +817,15 @@ class DagsterCloudAgent:
         else:
             return None

-
-
-
-        return get_instance_ref_for_user_code(instance.ref_for_deployment(deployment_name))
+    @cached_method
+    def _get_user_code_instance_ref(self, deployment_name: str):
+        return get_instance_ref_for_user_code(self._instance.ref_for_deployment(deployment_name))

     def _handle_api_request(
         self,
         request: DagsterCloudApiRequest,
         deployment_name: str,
         is_branch_deployment: bool,
-        instance: DagsterCloudAgentInstance,
         user_code_launcher: DagsterCloudUserCodeLauncher,
     ) -> Union[DagsterCloudApiSuccess, DagsterCloudApiGrpcResponse]:
         api_name = request.request_api
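`_get_user_code_instance_ref` is now decorated with dagster's `@cached_method`, so the user-code instance ref for a given deployment is built once and reused across API requests instead of being recomputed each time. A rough sketch of the same memoization with an explicit per-instance cache (stand-in types; not the real decorator):

```python
class Agent:
    """Sketch of what @cached_method provides on _get_user_code_instance_ref:
    one instance ref per deployment name, computed lazily and then reused."""

    def __init__(self, instance):
        self._instance = instance
        self._instance_ref_cache: dict[str, object] = {}

    def _get_user_code_instance_ref(self, deployment_name: str):
        if deployment_name not in self._instance_ref_cache:
            # In the agent this wraps get_instance_ref_for_user_code(
            #     self._instance.ref_for_deployment(deployment_name)).
            self._instance_ref_cache[deployment_name] = self._instance.ref_for_deployment(deployment_name)
        return self._instance_ref_cache[deployment_name]


class FakeInstance:
    def ref_for_deployment(self, name: str) -> str:
        print(f"building ref for {name}")
        return f"ref::{name}"


agent = Agent(FakeInstance())
agent._get_user_code_instance_ref("prod")
agent._get_user_code_instance_ref("prod")  # cached: the ref is built only once
```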
@@ -743,107 +840,153 @@ class DagsterCloudAgent:
             # Dagster Cloud has requested that we upload new metadata for any out of date locations in
             # the workspace
             self._upload_outdated_workspace_entries(
-
+                deployment_name, is_branch_deployment, user_code_launcher
             )
             return DagsterCloudApiSuccess()
         elif api_name == DagsterCloudApi.GET_EXTERNAL_EXECUTION_PLAN:
             client = self._get_grpc_client(
-                user_code_launcher, deployment_name, cast(str, location_name)
+                user_code_launcher, deployment_name, cast("str", location_name)
             )
             serialized_snapshot_or_error = client.execution_plan_snapshot(
                 execution_plan_snapshot_args=request.request_args._replace(
-                    instance_ref=self._get_user_code_instance_ref(
+                    instance_ref=self._get_user_code_instance_ref(deployment_name)
                 )
             )
-            return DagsterCloudApiGrpcResponse(
+            return DagsterCloudApiGrpcResponse(
+                serialized_response_or_error=serialized_snapshot_or_error
+            )

         elif api_name == DagsterCloudApi.GET_SUBSET_EXTERNAL_PIPELINE_RESULT:
             client = self._get_grpc_client(
-                user_code_launcher, deployment_name, cast(str, location_name)
+                user_code_launcher, deployment_name, cast("str", location_name)
             )

             serialized_subset_result_or_error = client.external_pipeline_subset(
                 pipeline_subset_snapshot_args=request.request_args
             )

-            return DagsterCloudApiGrpcResponse(
+            return DagsterCloudApiGrpcResponse(
+                serialized_response_or_error=serialized_subset_result_or_error
+            )
         elif api_name == DagsterCloudApi.GET_EXTERNAL_PARTITION_CONFIG:
             client = self._get_grpc_client(
-                user_code_launcher, deployment_name, cast(str, location_name)
+                user_code_launcher, deployment_name, cast("str", location_name)
             )
             serialized_partition_config_or_error = client.external_partition_config(
                 partition_args=request.request_args,
             )
-            return DagsterCloudApiGrpcResponse(
+            return DagsterCloudApiGrpcResponse(
+                serialized_response_or_error=serialized_partition_config_or_error
+            )
         elif api_name == DagsterCloudApi.GET_EXTERNAL_PARTITION_TAGS:
             client = self._get_grpc_client(
-                user_code_launcher, deployment_name, cast(str, location_name)
+                user_code_launcher, deployment_name, cast("str", location_name)
             )
             serialized_partition_tags_or_error = client.external_partition_tags(
                 partition_args=request.request_args,
             )
-            return DagsterCloudApiGrpcResponse(
+            return DagsterCloudApiGrpcResponse(
+                serialized_response_or_error=serialized_partition_tags_or_error
+            )
         elif api_name == DagsterCloudApi.GET_EXTERNAL_PARTITION_NAMES:
             client = self._get_grpc_client(
-                user_code_launcher, deployment_name, cast(str, location_name)
+                user_code_launcher, deployment_name, cast("str", location_name)
            )
             serialized_partition_names_or_error = client.external_partition_names(
                 partition_names_args=request.request_args,
             )
-            return DagsterCloudApiGrpcResponse(
+            return DagsterCloudApiGrpcResponse(
+                serialized_response_or_error=serialized_partition_names_or_error
+            )
         elif api_name == DagsterCloudApi.GET_EXTERNAL_PARTITION_SET_EXECUTION_PARAM_DATA:
             client = self._get_grpc_client(
-                user_code_launcher, deployment_name, cast(str, location_name)
+                user_code_launcher, deployment_name, cast("str", location_name)
             )
             serialized_partition_execution_params_or_error = (
                 client.external_partition_set_execution_params(
                     partition_set_execution_param_args=request.request_args
                 )
             )
-            return DagsterCloudApiGrpcResponse(
+            return DagsterCloudApiGrpcResponse(
+                serialized_response_or_error=serialized_partition_execution_params_or_error
+            )
         elif api_name == DagsterCloudApi.GET_EXTERNAL_SCHEDULE_EXECUTION_DATA:
             client = self._get_grpc_client(
-                user_code_launcher, deployment_name, cast(str, location_name)
+                user_code_launcher, deployment_name, cast("str", location_name)
             )

             args = request.request_args._replace(
-                instance_ref=self._get_user_code_instance_ref(
+                instance_ref=self._get_user_code_instance_ref(deployment_name)
             )

-
-
-
+            schedule_attributes = {
+                "schedule": args.schedule_name,
+                "repository": args.repository_origin.repository_name,
+                "location": args.repository_origin.code_location_origin.location_name,
+                "deployment": deployment_name,
+            }

-
+            with observe_execution(
+                opentelemetry=self._instance.opentelemetry,
+                event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.schedule.evaluation",
+                short_description="schedule evaluation requests",
+                attributes=schedule_attributes,
+                result_evaluator_callback=inspect_schedule_result,
+            ) as observer:
+                serialized_schedule_data_or_error = client.external_schedule_execution(
+                    external_schedule_execution_args=args,
+                )
+                observer.evaluate_result(
+                    serialized_data_or_error=serialized_schedule_data_or_error,
+                )
+                return DagsterCloudApiGrpcResponse(
+                    serialized_response_or_error=serialized_schedule_data_or_error
+                )

         elif api_name == DagsterCloudApi.GET_EXTERNAL_SENSOR_EXECUTION_DATA:
             client = self._get_grpc_client(
-                user_code_launcher, deployment_name, cast(str, location_name)
+                user_code_launcher, deployment_name, cast("str", location_name)
             )

             args = request.request_args._replace(
-                instance_ref=self._get_user_code_instance_ref(
+                instance_ref=self._get_user_code_instance_ref(deployment_name)
             )

-
-
-
+            sensor_attributes = {
+                "sensor": args.sensor_name,
+                "repository": args.repository_origin.repository_name,
+                "location": args.repository_origin.code_location_origin.location_name,
+                "deployment": deployment_name,
+            }

-
+            with observe_execution(
+                opentelemetry=self._instance.opentelemetry,
+                event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.sensor.evaluation",
+                short_description="sensor evaluation requests",
+                attributes=sensor_attributes,
+                result_evaluator_callback=inspect_sensor_result,
+            ) as observer:
+                serialized_sensor_data_or_error = client.external_sensor_execution(
+                    sensor_execution_args=args,
+                )
+                observer.evaluate_result(serialized_sensor_data_or_error)
+                return DagsterCloudApiGrpcResponse(
+                    serialized_response_or_error=serialized_sensor_data_or_error
+                )
         elif api_name == DagsterCloudApi.GET_EXTERNAL_NOTEBOOK_DATA:
             client = self._get_grpc_client(
-                user_code_launcher, deployment_name, cast(str, location_name)
+                user_code_launcher, deployment_name, cast("str", location_name)
             )
             response = client.external_notebook_data(request.request_args.notebook_path)
-            return DagsterCloudApiGrpcResponse(response.decode())
+            return DagsterCloudApiGrpcResponse(serialized_response_or_error=response.decode())
         elif api_name == DagsterCloudApi.LAUNCH_RUN:
             run = request.request_args.dagster_run

             with DagsterInstance.from_ref(
-                self._get_user_code_instance_ref(
+                self._get_user_code_instance_ref(deployment_name)
             ) as scoped_instance:
                 scoped_instance.report_engine_event(
-                    f"{
+                    f"{self._instance.agent_display_name} is launching run {run.run_id}",
                     run,
                     cls=self.__class__,
                 )
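Schedule and sensor evaluations (and, in the next hunk, run launches) are now wrapped in `observe_execution`, a context manager from the new `dagster_cloud/opentelemetry/observers/execution_observer.py` that times the call, attaches deployment/location/repository attributes, and lets the caller hand the result back through `observer.evaluate_result(...)`. The real observer emits OpenTelemetry metrics; the sketch below is a hypothetical minimal version of the same pattern:

```python
import time
from contextlib import contextmanager


@contextmanager
def observe_execution(event_key: str, attributes: dict, result_evaluator_callback=None):
    """Hypothetical minimal observer: times the wrapped call, records success or
    failure, and lets the caller feed the result back for inspection."""

    class _Observer:
        def __init__(self):
            self.result = None

        def evaluate_result(self, serialized_data_or_error=None, **kwargs):
            self.result = serialized_data_or_error if serialized_data_or_error is not None else kwargs
            if result_evaluator_callback:
                result_evaluator_callback(self.result)

    observer = _Observer()
    start = time.monotonic()
    try:
        yield observer
        outcome = "success"
    except Exception:
        outcome = "error"
        raise
    finally:
        elapsed_ms = 1000 * (time.monotonic() - start)
        # The real observer records OpenTelemetry metrics here instead of printing.
        print(f"{event_key} {outcome} in {elapsed_ms:.1f}ms attrs={attributes}")


# Usage mirrors the schedule-evaluation branch of _handle_api_request.
with observe_execution(
    event_key="dagster_cloud_agent.schedule.evaluation",
    attributes={"schedule": "daily", "deployment": "prod"},
) as observer:
    result = "serialized-schedule-data"  # stand-in for client.external_schedule_execution(...)
    observer.evaluate_result(serialized_data_or_error=result)
```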
@@ -852,40 +995,50 @@ class DagsterCloudAgent:
                     run.run_id,
                     merge_dicts(
                         (
-                            {"dagster/agent_label":
-                            if
+                            {"dagster/agent_label": self._instance.dagster_cloud_api_agent_label}
+                            if self._instance.dagster_cloud_api_agent_label
                             else {}
                         ),
-                        {"dagster/agent_id":
+                        {"dagster/agent_id": self._instance.instance_uuid},
                     ),
                 )

-
+                run_attributes = extract_run_attributes(deployment_name, run)
+                with observe_execution(
+                    opentelemetry=self._instance.opentelemetry,
+                    event_key=f"{DAGSTER_CLOUD_AGENT_METRIC_PREFIX}.run.launches",
+                    short_description="run execution requests",
+                    attributes=run_attributes,
+                ) as observer:
+                    launcher = scoped_instance.get_run_launcher_for_run(run)  # type: ignore # (instance subclass)

-
-
-
-
-
-
-
-
+                    if is_isolated_run(run):
+                        launcher.launch_run(LaunchRunContext(dagster_run=run, workspace=None))
+                    else:
+                        scoped_instance.report_engine_event(
+                            f"Launching {run.run_id} without an isolated run environment.",
+                            run,
+                            cls=self.__class__,
+                        )

-
-
-
-
+                        run_location_name = cast(
+                            "str",
+                            run.remote_job_origin.repository_origin.code_location_origin.location_name,
+                        )

-
+                        server = user_code_launcher.get_grpc_server(
+                            deployment_name, run_location_name
+                        )

-
-
-
-
+                        # Record the server handle that we launched it on to for run monitoring
+                        scoped_instance.add_run_tags(
+                            run.run_id, new_tags={SERVER_HANDLE_TAG: str(server.server_handle)}
+                        )

-
-
-
+                        launcher.launch_run_from_grpc_client(
+                            scoped_instance, run, server.server_endpoint.create_client()
+                        )
+                    observer.evaluate_result(run=run)

             return DagsterCloudApiSuccess()
         elif api_name == DagsterCloudApi.TERMINATE_RUN:
@@ -896,18 +1049,18 @@ class DagsterCloudAgent:
             run = request.request_args.dagster_run

             with DagsterInstance.from_ref(
-
+                self._instance.ref_for_deployment(deployment_name)
             ) as scoped_instance:
-                if
+                if self._instance.is_using_isolated_agents:
                     scoped_instance.report_engine_event(
-                        f"{
+                        f"{self._instance.agent_display_name} received request to mark run as canceling",
                         run,
                         cls=self.__class__,
                     )
                     scoped_instance.report_run_canceling(run)
                 else:
                     scoped_instance.report_engine_event(
-                        f"{
+                        f"{self._instance.agent_display_name} received request to terminate run",
                         run,
                         cls=self.__class__,
                     )
@@ -916,8 +1069,8 @@ class DagsterCloudAgent:
                         launcher.terminate(run.run_id)
                     else:
                         run_location_name = cast(
-                            str,
-                            run.
+                            "str",
+                            run.remote_job_origin.repository_origin.code_location_origin.location_name,
                         )

                         server = user_code_launcher.get_grpc_server(
@@ -929,15 +1082,20 @@ class DagsterCloudAgent:
                         client.cancel_execution(CancelExecutionRequest(run_id=run.run_id))

             return DagsterCloudApiSuccess()
-
+        elif api_name in (
+            DagsterCloudApi.CHECK_STEP_HEALTH,
+            DagsterCloudApi.TERMINATE_STEP,
+            DagsterCloudApi.LAUNCH_STEP,
+            DagsterCloudApi.CHECK_RUN_HEALTH,
+            DagsterCloudApi.LOAD_REPOSITORIES,
+        ):
+            check.failed(f"Unexpected deprecated request type {api_name}")
         else:
             check.assert_never(api_name)
-            raise Exception(f"Unexpected dagster cloud api call {api_name}")

     def _process_api_request(
         self,
-        json_request:
-        instance: DagsterCloudAgentInstance,
+        json_request: dict,
         user_code_launcher: DagsterCloudUserCodeLauncher,
         submitted_to_executor_timestamp: float,
     ) -> Optional[SerializableErrorInfo]:
@@ -956,7 +1114,7 @@ class DagsterCloudAgent:
         )

         if request_api not in DagsterCloudApi.__members__:
-            api_result = DagsterCloudApiUnknownCommandResponse(request_api)
+            api_result = DagsterCloudApiUnknownCommandResponse(request_api=request_api)
             self._logger.warning(
                 f"Ignoring request {json_request}: Unknown command. This is likely due to running an "
                 "older version of the agent."
@@ -966,7 +1124,7 @@ class DagsterCloudAgent:
                 request = deserialize_value(request_body, DagsterCloudApiRequest)
                 self._logger.info(f"Received request {request}.")
                 api_result = self._handle_api_request(
-                    request, deployment_name, is_branch_deployment,
+                    request, deployment_name, is_branch_deployment, user_code_launcher
                 )
         except Exception:
             error_info = serializable_error_info_from_exc_info(sys.exc_info())
@@ -995,13 +1153,28 @@ class DagsterCloudAgent:

             self._logger.info(f"Uploading response for request {request}.")

-
+            if UPLOAD_API_RESPONSE_BATCHING_ENABLED():
+                self._batcher[deployment_name].submit((deployment_name, upload_response))
+            else:
+                upload_api_response(self._instance, deployment_name, upload_response)

             self._logger.info(f"Finished uploading response for request {request}.")

         return error_info

-    def
+    def _batch_upload_api_response(
+        self, upload_response_batch: list[tuple[str, DagsterCloudUploadApiResponse]]
+    ) -> list[None]:
+        deployment_names = set(deployment_name for deployment_name, _ in upload_response_batch)
+        assert len(deployment_names) == 1
+        batch_upload_api_response(
+            self._instance,
+            next(iter(deployment_names)),
+            [resp for _, resp in upload_response_batch],
+        )
+        return [None for _ in upload_response_batch]
+
+    def _get_location_from_request(self, json_request: dict[str, Any]) -> Optional[str]:
         request_api = json_request["requestApi"]
         request_body = json_request["requestBody"]
         if request_api not in DagsterCloudApi.__members__:
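When batching is enabled, `_process_api_request` submits each `(deployment_name, response)` pair to that deployment's batcher, and the flush callback `_batch_upload_api_response` asserts that a batch never mixes deployments before forwarding the whole list in one upload. An illustrative standalone version of that callback contract (one `None` result slot per submitted item):

```python
def batch_upload_callback(upload_response_batch: list[tuple[str, str]]) -> list[None]:
    """Sketch of the flush-callback contract: batchers are keyed per deployment, so
    every item in a batch belongs to the same deployment, and one result slot is
    returned per submitted item."""
    deployment_names = {deployment_name for deployment_name, _ in upload_response_batch}
    assert len(deployment_names) == 1, "a batch never mixes deployments"
    deployment_name = next(iter(deployment_names))
    responses = [resp for _, resp in upload_response_batch]
    # In the agent, this forwards to batch_upload_api_response(), which does one HTTP PUT.
    print(f"uploading {len(responses)} responses for {deployment_name} in one request")
    return [None for _ in upload_response_batch]


batch_upload_callback([("prod", "resp-a"), ("prod", "resp-b")])
```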
@@ -1015,7 +1188,7 @@ class DagsterCloudAgent:
         return location_origin.location_name

     def run_iteration(
-        self,
+        self, user_code_launcher: DagsterCloudUserCodeLauncher
     ) -> Iterator[Optional[SerializableErrorInfo]]:
         if not user_code_launcher.ready_to_serve_requests:
             return
@@ -1024,12 +1197,12 @@ class DagsterCloudAgent:

         if num_pending_requests < self._pending_requests_limit:
             # limit (implicit default 10) applied separately for requests to branch deployments, and for each full deployment
-            result =
+            result = self._instance.organization_scoped_graphql_client().execute(
                 GET_USER_CLOUD_REQUESTS_QUERY,
                 {
-                    "forBranchDeployments":
+                    "forBranchDeployments": self._instance.includes_branch_deployments,
                     "forFullDeployments": self._active_full_deployment_names,
-                    "agentQueues":
+                    "agentQueues": self._instance.agent_queues_config.queues,
                 },
             )
             json_requests = result["data"]["userCloudAgent"]["popUserCloudAgentRequests"]
@@ -1082,7 +1255,6 @@ class DagsterCloudAgent:
             future = self._executor.submit(
                 self._process_api_request,
                 json_request,
-                instance,
                 user_code_launcher,
                 submitted_to_executor_timestamp,
             )
@@ -1109,7 +1281,7 @@ class DagsterCloudAgent:
             if response:
                 yield response

-        if
+        if self._instance.user_code_launcher.agent_metrics_enabled and (
             self._utilization_metrics["container_utilization"]["measurement_timestamp"] is None
             or (
                 get_current_timestamp()
@@ -1123,6 +1295,22 @@ class DagsterCloudAgent:
         yield None


+def batch_upload_api_response(
+    instance: DagsterCloudAgentInstance,
+    deployment_name: str,
+    batch: list[DagsterCloudUploadApiResponse],
+):
+    with compressed_namedtuple_upload_file(BatchDagsterCloudUploadApiResponse(batch=batch)) as f:
+        resp = instance.requests_managed_retries_session.put(
+            instance.dagster_cloud_upload_api_response_url,
+            headers=instance.headers_for_deployment(deployment_name),
+            files={"api_response_batch.tmp": f},
+            timeout=instance.dagster_cloud_api_timeout,
+            proxies=instance.dagster_cloud_api_proxies,
+        )
+        raise_http_error(resp)
+
+
 def upload_api_response(
     instance: DagsterCloudAgentInstance,
     deployment_name: str,