dagster-cloud 1.8.2__py3-none-any.whl → 1.12.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dagster_cloud/__init__.py +3 -3
- dagster_cloud/agent/__init__.py +4 -4
- dagster_cloud/agent/cli/__init__.py +56 -17
- dagster_cloud/agent/dagster_cloud_agent.py +360 -172
- dagster_cloud/agent/instrumentation/__init__.py +0 -0
- dagster_cloud/agent/instrumentation/constants.py +2 -0
- dagster_cloud/agent/instrumentation/run_launch.py +23 -0
- dagster_cloud/agent/instrumentation/schedule.py +34 -0
- dagster_cloud/agent/instrumentation/sensor.py +34 -0
- dagster_cloud/anomaly_detection/__init__.py +2 -2
- dagster_cloud/anomaly_detection/defs.py +17 -12
- dagster_cloud/anomaly_detection/types.py +3 -3
- dagster_cloud/api/dagster_cloud_api.py +209 -293
- dagster_cloud/auth/constants.py +21 -5
- dagster_cloud/batching/__init__.py +1 -0
- dagster_cloud/batching/batcher.py +210 -0
- dagster_cloud/dagster_insights/__init__.py +12 -6
- dagster_cloud/dagster_insights/bigquery/bigquery_utils.py +3 -2
- dagster_cloud/dagster_insights/bigquery/dbt_wrapper.py +39 -12
- dagster_cloud/dagster_insights/bigquery/insights_bigquery_resource.py +8 -6
- dagster_cloud/dagster_insights/insights_utils.py +18 -8
- dagster_cloud/dagster_insights/metrics_utils.py +12 -12
- dagster_cloud/dagster_insights/snowflake/dagster_snowflake_insights.py +5 -12
- dagster_cloud/dagster_insights/snowflake/dbt_wrapper.py +34 -8
- dagster_cloud/dagster_insights/snowflake/definitions.py +38 -12
- dagster_cloud/dagster_insights/snowflake/insights_snowflake_resource.py +11 -23
- dagster_cloud/definitions/__init__.py +0 -0
- dagster_cloud/definitions/job_selection.py +36 -0
- dagster_cloud/execution/cloud_run_launcher/k8s.py +1 -1
- dagster_cloud/execution/cloud_run_launcher/process.py +3 -3
- dagster_cloud/execution/monitoring/__init__.py +27 -33
- dagster_cloud/execution/utils/process.py +3 -3
- dagster_cloud/instance/__init__.py +125 -38
- dagster_cloud/instrumentation/__init__.py +32 -0
- dagster_cloud/metadata/source_code.py +13 -8
- dagster_cloud/metrics/__init__.py +0 -0
- dagster_cloud/metrics/tracer.py +59 -0
- dagster_cloud/opentelemetry/__init__.py +0 -0
- dagster_cloud/opentelemetry/config/__init__.py +73 -0
- dagster_cloud/opentelemetry/config/exporter.py +81 -0
- dagster_cloud/opentelemetry/config/log_record_processor.py +40 -0
- dagster_cloud/opentelemetry/config/logging_handler.py +14 -0
- dagster_cloud/opentelemetry/config/meter_provider.py +9 -0
- dagster_cloud/opentelemetry/config/metric_reader.py +39 -0
- dagster_cloud/opentelemetry/controller.py +319 -0
- dagster_cloud/opentelemetry/enum.py +58 -0
- dagster_cloud/opentelemetry/factories/__init__.py +1 -0
- dagster_cloud/opentelemetry/factories/logs.py +113 -0
- dagster_cloud/opentelemetry/factories/metrics.py +121 -0
- dagster_cloud/opentelemetry/metrics/__init__.py +0 -0
- dagster_cloud/opentelemetry/metrics/meter.py +140 -0
- dagster_cloud/opentelemetry/observers/__init__.py +0 -0
- dagster_cloud/opentelemetry/observers/dagster_exception_handler.py +40 -0
- dagster_cloud/opentelemetry/observers/execution_observer.py +178 -0
- dagster_cloud/pex/grpc/__generated__/multi_pex_api_pb2.pyi +175 -0
- dagster_cloud/pex/grpc/__init__.py +2 -2
- dagster_cloud/pex/grpc/client.py +4 -4
- dagster_cloud/pex/grpc/compile.py +2 -2
- dagster_cloud/pex/grpc/server/__init__.py +2 -2
- dagster_cloud/pex/grpc/server/cli/__init__.py +31 -19
- dagster_cloud/pex/grpc/server/manager.py +60 -42
- dagster_cloud/pex/grpc/server/registry.py +28 -21
- dagster_cloud/pex/grpc/server/server.py +23 -14
- dagster_cloud/pex/grpc/types.py +5 -5
- dagster_cloud/py.typed +0 -0
- dagster_cloud/secrets/__init__.py +1 -1
- dagster_cloud/secrets/loader.py +3 -3
- dagster_cloud/serverless/__init__.py +1 -1
- dagster_cloud/serverless/io_manager.py +36 -53
- dagster_cloud/storage/client.py +54 -17
- dagster_cloud/storage/compute_logs/__init__.py +3 -1
- dagster_cloud/storage/compute_logs/compute_log_manager.py +22 -17
- dagster_cloud/storage/defs_state/__init__.py +3 -0
- dagster_cloud/storage/defs_state/queries.py +15 -0
- dagster_cloud/storage/defs_state/storage.py +113 -0
- dagster_cloud/storage/event_logs/__init__.py +3 -1
- dagster_cloud/storage/event_logs/queries.py +102 -4
- dagster_cloud/storage/event_logs/storage.py +266 -73
- dagster_cloud/storage/event_logs/utils.py +88 -7
- dagster_cloud/storage/runs/__init__.py +1 -1
- dagster_cloud/storage/runs/queries.py +17 -2
- dagster_cloud/storage/runs/storage.py +88 -42
- dagster_cloud/storage/schedules/__init__.py +1 -1
- dagster_cloud/storage/schedules/storage.py +6 -8
- dagster_cloud/storage/tags.py +66 -1
- dagster_cloud/util/__init__.py +10 -12
- dagster_cloud/util/errors.py +49 -64
- dagster_cloud/version.py +1 -1
- dagster_cloud/workspace/config_schema/__init__.py +55 -13
- dagster_cloud/workspace/docker/__init__.py +76 -25
- dagster_cloud/workspace/docker/utils.py +1 -1
- dagster_cloud/workspace/ecs/__init__.py +1 -1
- dagster_cloud/workspace/ecs/client.py +51 -33
- dagster_cloud/workspace/ecs/launcher.py +76 -22
- dagster_cloud/workspace/ecs/run_launcher.py +3 -3
- dagster_cloud/workspace/ecs/utils.py +14 -5
- dagster_cloud/workspace/kubernetes/__init__.py +1 -1
- dagster_cloud/workspace/kubernetes/launcher.py +61 -29
- dagster_cloud/workspace/kubernetes/utils.py +34 -22
- dagster_cloud/workspace/user_code_launcher/__init__.py +5 -3
- dagster_cloud/workspace/user_code_launcher/process.py +16 -14
- dagster_cloud/workspace/user_code_launcher/user_code_launcher.py +552 -172
- dagster_cloud/workspace/user_code_launcher/utils.py +105 -1
- {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/METADATA +48 -42
- dagster_cloud-1.12.6.dist-info/RECORD +134 -0
- {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/WHEEL +1 -1
- dagster_cloud-1.8.2.dist-info/RECORD +0 -100
- {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/top_level.txt +0 -0
dagster_cloud/workspace/ecs/client.py:

@@ -3,7 +3,7 @@ import json
 import logging
 import os
 import time
-from typing import
+from typing import Optional

 import boto3
 import botocore
@@ -13,7 +13,7 @@ from botocore.exceptions import ClientError
 from dagster._utils.backoff import backoff
 from dagster._utils.cached_method import cached_method
 from dagster_aws.ecs.tasks import DagsterEcsTaskDefinitionConfig
-from dagster_aws.ecs.utils import task_definitions_match
+from dagster_aws.ecs.utils import is_transient_task_stopped_reason, task_definitions_match

 from dagster_cloud.workspace.ecs.service import Service

@@ -57,13 +57,14 @@ class Client:
         cluster_name: str,
         service_discovery_namespace_id: str,
         log_group: str,
-        subnet_ids: Optional[
-        security_group_ids: Optional[
+        subnet_ids: Optional[list[str]] = None,
+        security_group_ids: Optional[list[str]] = None,
         ecs_client=None,
         timeout: int = DEFAULT_ECS_TIMEOUT,
         grace_period: int = DEFAULT_ECS_GRACE_PERIOD,
         launch_type: str = "FARGATE",
         show_debug_cluster_info: bool = True,
+        assign_public_ip: Optional[bool] = None,
     ):
         self.ecs = ecs_client if ecs_client else boto3.client("ecs", config=config)
         self.logs = boto3.client("logs", config=config)
@@ -85,6 +86,7 @@ class Client:
         self.grace_period = check.int_param(grace_period, "grace_period")
         self.launch_type = check.str_param(launch_type, "launch_type")
         self._namespace: Optional[str] = None
+        self._assign_public_ip_override = assign_public_ip

     @property
     def ec2(self):
@@ -115,14 +117,17 @@ class Client:
     @property
     @cached_method
     def network_configuration(self):
+        if self.launch_type != "FARGATE":
+            assign_public_ip = None
+        elif self._assign_public_ip_override is not None:
+            assign_public_ip = "ENABLED" if self._assign_public_ip_override else "DISABLED"
+        else:
+            assign_public_ip = self._infer_assign_public_ip()
+
         network_configuration = {
             "awsvpcConfiguration": {
                 "subnets": self.subnet_ids,
-                **(
-                    {"assignPublicIp": self._assign_public_ip()}
-                    if self.launch_type == "FARGATE"
-                    else {}
-                ),
+                **({"assignPublicIp": assign_public_ip} if assign_public_ip else {}),
             },
         }

@@ -162,7 +167,7 @@ class Client:
             )
             logger.info(f"Created new task definition {task_definition_arn}")
         else:
-            task_definition_arn = existing_task_definition
+            task_definition_arn = check.not_none(existing_task_definition.get("taskDefinitionArn"))
             logger.info(f"Re-using existing task definition {task_definition_arn}")

         return task_definition_arn
@@ -333,7 +338,7 @@ class Client:
                 service=service.name,
                 desiredCount=0,
             )
-        except botocore.exceptions.ClientError as error:
+        except botocore.exceptions.ClientError as error: # pyright: ignore[reportAttributeAccessIssue]
             if error.response["Error"]["Code"] in [
                 "ServiceNotFoundException",
                 "ServiceNotActiveException",
@@ -416,7 +421,7 @@ class Client:
                 if resource_arn in actual_services:
                     services.append(Service(client=self, arn=resource_arn))

-        except botocore.exceptions.ClientError as error:
+        except botocore.exceptions.ClientError as error: # pyright: ignore[reportAttributeAccessIssue]
             if error.response["Error"]["Code"] == "AccessDeniedException":
                 self._use_legacy_tag_filtering = True
                 logger.warning(
@@ -487,7 +492,7 @@ class Client:
         )

         if exit_code:
-            raise Exception(self.get_task_logs(task_arn))
+            raise Exception(self.get_task_logs(task_arn)) # pyright: ignore[reportCallIssue]

         return True

@@ -608,7 +613,7 @@ class Client:

     async def check_service_has_running_tasks(
         self, service_name, container_name, logger=None
-    ) ->
+    ) -> list[str]:
         # return the ARN of the task if it starts
         logger = logger or logging.getLogger("dagster_cloud.EcsClient")
         start_time = time.time()
@@ -622,33 +627,38 @@ class Client:
                 cluster=self.cluster_name,
                 services=[service_name],
             )
-            if not services:
+            if not services or not services.get("services"):
                 raise Exception(
                     f"Service description not found for {self.cluster_name}/{service_name}"
                 )

-            service = services
+            service = services["services"][0]
             desired_count = service.get("desiredCount")
             running_count = service.get("runningCount")

             # If the service has reached the desired count, we can start tracking the tasks
-            if desired_count == running_count:
+            if desired_count and (desired_count > 0) and (desired_count == running_count):
                 running_tasks = self.ecs.list_tasks(
                     cluster=self.cluster_name,
                     serviceName=service_name,
                     desiredStatus="RUNNING",
                 ).get("taskArns")

-                if
-
-
-
+                if running_tasks:
+                    tasks_to_track = running_tasks
+
+            if not tasks_to_track and time.time() > start_time + STOPPED_TASK_GRACE_PERIOD:
+                # If there are still no running_tasks tasks after a certain grace period, check for stopped tasks
+                stopped_tasks = self._check_for_stopped_tasks(service_name)
+                if stopped_tasks:
+                    latest_stopped_task = stopped_tasks[0]
+                    stopped_reason = latest_stopped_task.get("stoppedReason", "")

-
-
-
-
-
+                    if is_transient_task_stopped_reason(stopped_reason):
+                        logger.warning(
+                            f"Task stopped with a transient stoppedReason: {stopped_reason} - waiting for the service to launch a new task"
+                        )
+                    else:
                         self._raise_failed_task(stopped_tasks[0], container_name, logger)

             if tasks_to_track:
@@ -663,7 +673,15 @@ class Client:
                     if not self._check_all_essential_containers_are_running(task):
                         all_tasks_running = False
                     elif task.get("lastStatus") == "STOPPED":
-
+                        stopped_reason = task.get("stoppedReason", "")
+                        if is_transient_task_stopped_reason(stopped_reason):
+                            logger.warning(
+                                f"Running task stopped with a transient stoppedReason: {stopped_reason} - waiting for the service to launch a new task"
+                            )
+                            tasks_to_track = []
+                            all_tasks_running = False
+                        else:
+                            self._raise_failed_task(task, container_name, logger)

                 if all_tasks_running:
                     return tasks_to_track
@@ -680,7 +698,7 @@ class Client:
             )
             if response.get("services"):
                 service = response["services"][0]
-                service_events = [event.get("message") for event in service.get("events")]
+                service_events = [str(event.get("message")) for event in service.get("events", [])]
                 service_events_str = "Service events:\n" + "\n".join(service_events)
         except:
             logger.exception(f"Error trying to get service event logs from service {service_name}")
@@ -715,9 +733,9 @@ class Client:
         ).get("taskDefinition")

         essential_containers = {
-            container
-            for container in task_definition
-            if container
+            check.not_none(container.get("name"))
+            for container in task_definition.get("containerDefinitions", [])
+            if container.get("essential") and container.get("name")
         }

         # Just because the task is RUNNING doesn't mean everything has started up correctly -
@@ -729,7 +747,7 @@ class Client:
         )

     def _get_service_discovery_id(self, hostname):
-        service_name = hostname.split("." + self.namespace)[0]
+        service_name = hostname.split("." + self.namespace)[0] # pyright: ignore[reportOperatorIssue]

         paginator = self.service_discovery.get_paginator("list_services")
         for page in paginator.paginate(
@@ -747,7 +765,7 @@ class Client:
             if service["Name"] == service_name:
                 return service["Id"]

-    def
+    def _infer_assign_public_ip(self):
         # https://docs.aws.amazon.com/AmazonECS/latest/userguide/fargate-task-networking.html
         # Assign a public IP if any of the subnets are public
         route_tables = self.ec2.route_tables.filter(
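The `network_configuration` change above resolves `assignPublicIp` in three steps: non-FARGATE launch types never set it, an explicit `assign_public_ip` override wins when one is configured, and otherwise the client falls back to inferring the value from the subnets' route tables (`_infer_assign_public_ip`). A minimal standalone sketch of that resolution order, with illustrative helper names rather than the package's actual API:

```python
from typing import Callable, Optional


def resolve_assign_public_ip(
    launch_type: str,
    override: Optional[bool],
    infer_from_subnets: Callable[[], Optional[str]],
) -> Optional[str]:
    """Mirror the branch order added to Client.network_configuration."""
    if launch_type != "FARGATE":
        # assignPublicIp is only meaningful for FARGATE tasks.
        return None
    if override is not None:
        # An explicit assign_public_ip setting takes precedence over inference.
        return "ENABLED" if override else "DISABLED"
    # Fall back to route-table inference when nothing was configured.
    return infer_from_subnets()


def awsvpc_configuration(subnet_ids: list, assign_public_ip: Optional[str]) -> dict:
    # The key is only included when a value was actually resolved.
    return {
        "awsvpcConfiguration": {
            "subnets": subnet_ids,
            **({"assignPublicIp": assign_public_ip} if assign_public_ip else {}),
        }
    }


if __name__ == "__main__":
    ip = resolve_assign_public_ip("FARGATE", override=True, infer_from_subnets=lambda: "DISABLED")
    print(awsvpc_configuration(["subnet-0123456789abcdef0"], ip))
```

The override exists because the route-table heuristic can guess wrong in more complex VPC topologies, which is exactly what the new `assign_public_ip` config description in the launcher diff below warns about.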
dagster_cloud/workspace/ecs/launcher.py:

@@ -1,7 +1,8 @@
 import asyncio
 import os
+from collections.abc import Collection, Mapping, Sequence
 from pathlib import Path
-from typing import Any,
+from typing import Any, Optional, cast

 import boto3
 import grpc
@@ -30,9 +31,15 @@ from dagster_cloud.workspace.ecs.client import (
     DEFAULT_ECS_TIMEOUT,
     ECS_EXEC_LINUX_PARAMETERS,
     Client,
+    get_debug_ecs_prompt,
 )
+from dagster_cloud.workspace.ecs.run_launcher import CloudEcsRunLauncher
 from dagster_cloud.workspace.ecs.service import Service
-from dagster_cloud.workspace.ecs.utils import
+from dagster_cloud.workspace.ecs.utils import (
+    get_ecs_human_readable_label,
+    get_server_task_definition_family,
+    unique_ecs_resource_name,
+)
 from dagster_cloud.workspace.user_code_launcher import (
     DEFAULT_SERVER_PROCESS_STARTUP_TIMEOUT,
     SHARED_USER_CODE_LAUNCHER_CONFIG,
@@ -44,11 +51,10 @@ from dagster_cloud.workspace.user_code_launcher.user_code_launcher import (
     UserCodeLauncherEntry,
     async_serialize_exceptions,
 )
-from dagster_cloud.workspace.user_code_launcher.utils import
-
-
-
-from .utils import get_server_task_definition_family
+from dagster_cloud.workspace.user_code_launcher.utils import (
+    deterministic_label_for_location,
+    get_grpc_server_env,
+)

 EcsServerHandleType = Service

@@ -60,12 +66,12 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
     def __init__(
         self,
         cluster: str,
-        subnets:
+        subnets: list[str],
         execution_role_arn: str,
         log_group: str,
         service_discovery_namespace_id: str,
         task_role_arn: Optional[str] = None,
-        security_group_ids: Optional[
+        security_group_ids: Optional[list[str]] = None,
         inst_data: Optional[ConfigurableClassData] = None,
         secrets=None,
         secrets_tag=None,
@@ -83,6 +89,10 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
         server_ecs_tags: Optional[Sequence[Mapping[str, Optional[str]]]] = None,
         run_ecs_tags: Optional[Sequence[Mapping[str, Optional[str]]]] = None,
         server_health_check: Optional[Mapping[str, Any]] = None,
+        enable_ecs_exec=False,
+        server_task_definition_prefix: str = "server",
+        run_task_definition_prefix: str = "run",
+        assign_public_ip: Optional[bool] = None,
         **kwargs,
     ):
         self.ecs = boto3.client("ecs")
@@ -140,6 +150,22 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
             run_sidecar_containers, "run_sidecar_containers"
         )

+        self.server_task_definition_prefix = check.str_param(
+            server_task_definition_prefix, "server_task_definition_prefix"
+        )
+        check.invariant(
+            len(self.server_task_definition_prefix) <= 16,
+            "server_task_definition_prefix must be at most 16 characters",
+        )
+        self.run_task_definition_prefix = check.str_param(
+            run_task_definition_prefix, "run_task_definition_prefix"
+        )
+
+        check.invariant(
+            len(self.run_task_definition_prefix) <= 16,
+            "run_task_definition_prefix must be at most 16 characters",
+        )
+
         self.server_ecs_tags = check.opt_sequence_param(server_ecs_tags, "server_ecs_tags")
         self.run_ecs_tags = check.opt_sequence_param(run_ecs_tags, "run_ecs_tags")

@@ -147,6 +173,8 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
             server_health_check, "server_health_check"
         )

+        self._enable_ecs_exec = enable_ecs_exec
+
         self.client = Client(
             cluster_name=self.cluster,
             subnet_ids=self.subnets,
@@ -157,8 +185,9 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
             timeout=self._ecs_timeout,
             grace_period=self._ecs_grace_period,
             launch_type=self.launch_type,
+            assign_public_ip=assign_public_ip,
         )
-        super(
+        super().__init__(**kwargs)

     @property
     def show_debug_cluster_info(self) -> bool:
@@ -261,13 +290,34 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
                 {"enabled": Field(bool, is_required=False, default_value=False)},
                 is_required=False,
             ),
+            "enable_ecs_exec": Field(
+                bool,
+                is_required=False,
+                default_value=False,
+            ),
+            "server_task_definition_prefix": Field(
+                str, is_required=False, default_value="server"
+            ),
+            "run_task_definition_prefix": Field(
+                str, is_required=False, default_value="dagsterrun"
+            ),
+            "assign_public_ip": Field(
+                Noneable(bool),
+                is_required=False,
+                default_value=None,
+                description=(
+                    "When using the FARGATE launch type, the launcher will attempt to automatically determine if it is "
+                    "necessary to assign a public IP to the ECS task. In complex network topologies, this automatic "
+                    "determination may not be accurate. In this case, you can explicitly set this value to True or False."
+                ),
+            ),
         },
         SHARED_ECS_CONFIG,
         SHARED_USER_CODE_LAUNCHER_CONFIG,
     )

     @classmethod
-    def from_config_value(cls, inst_data: ConfigurableClassData, config_value:
+    def from_config_value(cls, inst_data: ConfigurableClassData, config_value: dict[str, Any]): # pyright: ignore[reportIncompatibleMethodOverride], fix me!
         return EcsUserCodeLauncher(inst_data=inst_data, **config_value)

     @property
@@ -314,12 +364,12 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
         return container_context.repository_credentials

     def _get_enable_ecs_exec(self) -> bool:
-        return
+        return self._enable_ecs_exec

-    def _get_additional_grpc_server_env(self) ->
+    def _get_additional_grpc_server_env(self) -> dict[str, str]:
         return {}

-    def _get_dagster_tags(self, deployment_name: str, location_name: str) ->
+    def _get_dagster_tags(self, deployment_name: str, location_name: str) -> dict[str, str]:
         return {
             "dagster/deployment_name": get_ecs_human_readable_label(deployment_name),
             "dagster/location_name": get_ecs_human_readable_label(
@@ -337,7 +387,7 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
             f"Getting resource limits for {deployment_name}:{location_name}. resources: {self.server_resources}"
         )
         metadata = self._actual_entries[(deployment_name, location_name)].code_location_deploy_data
-        resources = metadata.container_context.get("ecs", {}).get("server_resources")
+        resources = metadata.container_context.get("ecs", {}).get("server_resources", {})
         return {
             "ecs": {
                 "cpu_limit": resources.get("cpu"),
@@ -367,8 +417,8 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
         command = metadata.get_grpc_server_command(
             metrics_enabled=self._instance.user_code_launcher.code_server_metrics_enabled
         )
-        additional_env =
-            PORT, location_name, self._instance.ref_for_deployment(deployment_name)
+        additional_env = get_grpc_server_env(
+            metadata, PORT, location_name, self._instance.ref_for_deployment(deployment_name)
         )
         tags = {
             "dagster/grpc_server": "1",
@@ -420,7 +470,10 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
         self._logger.info(f"Creating a new service for {deployment_name}:{location_name}...")

         family = get_server_task_definition_family(
-            self.
+            self.server_task_definition_prefix,
+            self._instance.organization_name,
+            deployment_name,
+            location_name,
         )

         system_tags = {**self._get_dagster_tags(deployment_name, location_name), **tags}
@@ -524,7 +577,7 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
             task_logs = "Task logs:\n" + "\n".join(logs) if logs else "No logs in task."
             sections.append(task_logs)
         except:
-            self._logger.exception("Error trying to get logs for failed task", task_arn=task_arn)
+            self._logger.exception("Error trying to get logs for failed task", task_arn=task_arn) # pyright: ignore[reportCallIssue]

         if self.show_debug_cluster_info:
             sections.append(get_debug_ecs_prompt(self.cluster, task_arn))
@@ -575,7 +628,7 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
         except Exception as e:
             if (
                 isinstance(e.__cause__, grpc.RpcError)
-                and cast(grpc.RpcError, e.__cause__).code() == grpc.StatusCode.UNIMPLEMENTED
+                and cast("grpc.RpcError", e.__cause__).code() == grpc.StatusCode.UNIMPLEMENTED
             ):
                 # New gRPC method not implemented on old multipex server versions
                 pass
@@ -640,7 +693,7 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
         }
         return self.client.list_services(tags)

-    def _list_server_handles(self) ->
+    def _list_server_handles(self) -> list[EcsServerHandleType]:
         return [
             service
             for service in self.client.list_services()
@@ -654,7 +707,7 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
     def get_server_create_timestamp(self, handle: EcsServerHandleType) -> Optional[float]:
         return handle.create_timestamp

-    def _run_launcher_kwargs(self) ->
+    def _run_launcher_kwargs(self) -> dict[str, Any]:
         return dict(
             task_definition={
                 "log_group": self.log_group,
@@ -688,6 +741,7 @@ class EcsUserCodeLauncher(DagsterCloudUserCodeLauncher[EcsServerHandleType], Con
             run_ecs_tags=self.run_ecs_tags,
             container_name=CONTAINER_NAME,
             run_resources=self.run_resources,
+            task_definition_prefix=self.run_task_definition_prefix,
         )

     def run_launcher(self) -> CloudEcsRunLauncher: # pyright: ignore[reportIncompatibleMethodOverride], fix me!
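For reference, the launcher options introduced in the schema above, with their config defaults, shown as a plain Python mapping. This dict is only an illustration of the new keys and their documented defaults; it is not an object the package consumes, and in practice these values come from the agent's ECS `user_code_launcher` configuration:

```python
# Defaults taken from the config schema in the diff above.
new_ecs_launcher_options = {
    "enable_ecs_exec": False,  # opt in to ECS Exec support on launched tasks
    "server_task_definition_prefix": "server",  # prefix for code server task definition families
    "run_task_definition_prefix": "dagsterrun",  # prefix for run task definition families
    "assign_public_ip": None,  # None = infer from subnets; True/False forces ENABLED/DISABLED
}

# __init__ enforces the same bound via check.invariant for both prefixes:
for key in ("server_task_definition_prefix", "run_task_definition_prefix"):
    assert len(new_ecs_launcher_options[key]) <= 16, f"{key} must be at most 16 characters"
```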
dagster_cloud/workspace/ecs/run_launcher.py:

@@ -2,14 +2,14 @@ import dagster._check as check
 from dagster_aws.ecs import EcsRunLauncher

 from dagster_cloud.instance import DagsterCloudAgentInstance
-
-from .utils import get_run_task_definition_family
+from dagster_cloud.workspace.ecs.utils import get_run_task_definition_family


 class CloudEcsRunLauncher(EcsRunLauncher[DagsterCloudAgentInstance]):
     def _get_run_task_definition_family(self, run) -> str:
         return get_run_task_definition_family(
+            self._task_definition_prefix,
             self._instance.organization_name,
             check.not_none(self._instance.deployment_name),
-            check.not_none(run.
+            check.not_none(run.remote_job_origin),
         )
dagster_cloud/workspace/ecs/utils.py:

@@ -2,10 +2,13 @@ import hashlib
 import re
 from typing import Optional

-from dagster._core.
+from dagster._core.remote_origin import RemoteJobOrigin
 from dagster_aws.ecs.utils import sanitize_family

-from
+from dagster_cloud.workspace.user_code_launcher.utils import (
+    get_human_readable_label,
+    unique_resource_name,
+)


 def unique_ecs_resource_name(deployment_name, location_name):
@@ -29,10 +32,11 @@ def _get_family_hash(name, max_length=32, hash_size=8):
     m = hashlib.sha1()
     m.update(name.encode("utf-8"))
     name_hash = m.hexdigest()[:hash_size]
-    return f"{name[:(max_length-hash_size-1)]}_{name_hash}"
+    return f"{name[: (max_length - hash_size - 1)]}_{name_hash}"


 def get_server_task_definition_family(
+    task_definition_prefix: str,
     organization_name: Optional[str],
     deployment_name: str,
     location_name: str,
@@ -43,9 +47,12 @@ def get_server_task_definition_family(
     m = hashlib.sha1()
     m.update(location_name.encode("utf-8"))

+    # '{16}_{64}_{64}_{64}': max 211 characters
     truncated_location_name = _get_family_hash(location_name, max_length=64)

-    final_family =
+    final_family: str = (
+        f"{task_definition_prefix}_{organization_name}_{deployment_name}_{truncated_location_name}"
+    )

     assert len(final_family) <= 255

@@ -53,6 +60,7 @@


 def get_run_task_definition_family(
+    task_definition_prefix: str,
     organization_name: Optional[str],
     deployment_name: str,
     job_origin: RemoteJobOrigin,
@@ -64,12 +72,13 @@
     repo_name = job_origin.repository_origin.repository_name
     location_name = job_origin.repository_origin.code_location_origin.location_name

+    assert len(task_definition_prefix) <= 16
     assert len(str(organization_name)) <= 64
     assert len(deployment_name) <= 64

     # '{16}_{64}_{64}_{32}_{32}_{32}': max 245 characters

-    final_family = f"
+    final_family = f"{task_definition_prefix}_{organization_name}_{deployment_name}_{_get_family_hash(location_name)}_{_get_family_hash(repo_name)}_{_get_family_hash(job_name)}"

     assert len(final_family) <= 255

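Both family helpers above now take the configurable prefix as their first argument and keep the resulting ECS task definition family under the 255-character limit by hashing the variable-length parts. A rough sketch of the run-family naming scheme under those constraints; the real functions also use `sanitize_family` and pull the names from a `RemoteJobOrigin`, which is omitted here:

```python
import hashlib


def _family_hash(name: str, max_length: int = 32, hash_size: int = 8) -> str:
    # Truncate the name and append a short sha1 suffix, like _get_family_hash.
    digest = hashlib.sha1(name.encode("utf-8")).hexdigest()[:hash_size]
    return f"{name[: (max_length - hash_size - 1)]}_{digest}"


def run_family(prefix: str, organization: str, deployment: str, location: str, repo: str, job: str) -> str:
    # '{16}_{64}_{64}_{32}_{32}_{32}': at most 245 characters, within the ECS 255 limit.
    assert len(prefix) <= 16 and len(organization) <= 64 and len(deployment) <= 64
    family = (
        f"{prefix}_{organization}_{deployment}_"
        f"{_family_hash(location)}_{_family_hash(repo)}_{_family_hash(job)}"
    )
    assert len(family) <= 255
    return family


if __name__ == "__main__":
    # Prints something like: dagsterrun_hooli_prod_data_eng_location_<hash>_analytics_repo_<hash>_daily_job_<hash>
    print(run_family("dagsterrun", "hooli", "prod", "data_eng_location", "analytics_repo", "daily_job"))
```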
dagster_cloud/workspace/kubernetes/__init__.py:

@@ -1 +1 @@
-from .launcher import K8sUserCodeLauncher as K8sUserCodeLauncher
+from dagster_cloud.workspace.kubernetes.launcher import K8sUserCodeLauncher as K8sUserCodeLauncher