skypilot-nightly 1.0.0.dev20240910__py3-none-any.whl → 1.0.0.dev20240911__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +32 -67
- sky/authentication.py +12 -7
- sky/backends/backend_utils.py +40 -33
- sky/backends/cloud_vm_ray_backend.py +1 -1
- sky/check.py +1 -1
- sky/clouds/aws.py +8 -6
- sky/clouds/azure.py +7 -5
- sky/clouds/cloud.py +43 -14
- sky/clouds/cudo.py +1 -1
- sky/clouds/fluidstack.py +2 -2
- sky/clouds/gcp.py +12 -7
- sky/clouds/kubernetes.py +28 -15
- sky/clouds/lambda_cloud.py +2 -2
- sky/clouds/oci.py +1 -1
- sky/clouds/paperspace.py +1 -1
- sky/clouds/runpod.py +1 -1
- sky/clouds/scp.py +2 -2
- sky/clouds/service_catalog/aws_catalog.py +1 -1
- sky/clouds/vsphere.py +1 -1
- sky/provision/kubernetes/config.py +52 -34
- sky/provision/kubernetes/instance.py +73 -61
- sky/provision/kubernetes/network.py +11 -5
- sky/provision/kubernetes/network_utils.py +10 -8
- sky/provision/kubernetes/utils.py +72 -45
- sky/skylet/log_lib.py +4 -1
- sky/skylet/subprocess_daemon.py +47 -15
- sky/templates/kubernetes-port-forward-proxy-command.sh +29 -4
- sky/templates/kubernetes-ray.yml.j2 +5 -0
- sky/templates/lambda-ray.yml.j2 +2 -2
- sky/utils/command_runner.py +12 -6
- sky/utils/command_runner.pyi +1 -1
- sky/utils/kubernetes/rsync_helper.sh +12 -3
- {skypilot_nightly-1.0.0.dev20240910.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20240910.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/RECORD +39 -39
- {skypilot_nightly-1.0.0.dev20240910.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20240910.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20240910.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20240910.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/utils.py
CHANGED
@@ -58,7 +58,11 @@ KIND_CONTEXT_NAME = 'kind-skypilot' # Context name used by sky local up
|
|
58
58
|
|
59
59
|
# Port-forward proxy command constants
|
60
60
|
PORT_FORWARD_PROXY_CMD_TEMPLATE = 'kubernetes-port-forward-proxy-command.sh'
|
61
|
-
|
61
|
+
# We add a version suffix to the port-forward proxy command to ensure backward
|
62
|
+
# compatibility and avoid overwriting the older version.
|
63
|
+
PORT_FORWARD_PROXY_CMD_VERSION = 2
|
64
|
+
PORT_FORWARD_PROXY_CMD_PATH = ('~/.sky/kubernetes-port-forward-proxy-command-'
|
65
|
+
f'v{PORT_FORWARD_PROXY_CMD_VERSION}.sh')
|
62
66
|
|
63
67
|
logger = sky_logging.init_logger(__name__)
|
64
68
|
|
@@ -355,6 +359,10 @@ def get_kubernetes_nodes() -> List[Any]:
|
|
355
359
|
|
356
360
|
|
357
361
|
def get_kubernetes_pods() -> List[Any]:
|
362
|
+
"""Gets the kubernetes pods in the current namespace and current context.
|
363
|
+
|
364
|
+
Used for computing cluster resource usage.
|
365
|
+
"""
|
358
366
|
try:
|
359
367
|
ns = get_current_kube_config_context_namespace()
|
360
368
|
pods = kubernetes.core_api().list_namespaced_pod(
|
@@ -572,37 +580,39 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]:
|
|
572
580
|
f'to set up GPUs.{suffix}')
|
573
581
|
|
574
582
|
|
575
|
-
def get_head_ssh_port(cluster_name: str, namespace: str) -> int:
|
583
|
+
def get_head_ssh_port(cluster_name: str, namespace: str,
|
584
|
+
context: Optional[str]) -> int:
|
576
585
|
svc_name = f'{cluster_name}-head-ssh'
|
577
|
-
return get_port(svc_name, namespace)
|
586
|
+
return get_port(svc_name, namespace, context)
|
578
587
|
|
579
588
|
|
580
|
-
def get_port(svc_name: str, namespace: str) -> int:
|
589
|
+
def get_port(svc_name: str, namespace: str, context: Optional[str]) -> int:
|
581
590
|
"""Gets the nodeport of the specified service.
|
582
591
|
|
583
592
|
Args:
|
584
593
|
svc_name (str): Name of the kubernetes service. Note that this may be
|
585
594
|
different from the cluster name.
|
586
595
|
namespace (str): Kubernetes namespace to look for the service in.
|
596
|
+
context (str): Kubernetes context to use.
|
587
597
|
"""
|
588
|
-
head_service = kubernetes.core_api().read_namespaced_service(
|
598
|
+
head_service = kubernetes.core_api(context).read_namespaced_service(
|
589
599
|
svc_name, namespace)
|
590
600
|
return head_service.spec.ports[0].node_port
|
591
601
|
|
592
602
|
|
593
|
-
def get_external_ip(
|
594
|
-
|
603
|
+
def get_external_ip(network_mode: Optional[
|
604
|
+
kubernetes_enums.KubernetesNetworkingMode], context: Optional[str]) -> str:
|
595
605
|
if network_mode == kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD:
|
596
606
|
return '127.0.0.1'
|
597
607
|
# Return the IP address of the first node with an external IP
|
598
|
-
nodes = kubernetes.core_api().list_node().items
|
608
|
+
nodes = kubernetes.core_api(context).list_node().items
|
599
609
|
for node in nodes:
|
600
610
|
if node.status.addresses:
|
601
611
|
for address in node.status.addresses:
|
602
612
|
if address.type == 'ExternalIP':
|
603
613
|
return address.address
|
604
614
|
# If no external IP is found, use the API server IP
|
605
|
-
api_host = kubernetes.core_api().api_client.configuration.host
|
615
|
+
api_host = kubernetes.core_api(context).api_client.configuration.host
|
606
616
|
parsed_url = urlparse(api_host)
|
607
617
|
return parsed_url.hostname
|
608
618
|
|
@@ -620,7 +630,9 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \
|
|
620
630
|
"""
|
621
631
|
try:
|
622
632
|
ns = get_current_kube_config_context_namespace()
|
623
|
-
|
633
|
+
context = get_current_kube_config_context_name()
|
634
|
+
kubernetes.core_api(context).list_namespaced_pod(
|
635
|
+
ns, _request_timeout=timeout)
|
624
636
|
except ImportError:
|
625
637
|
# TODO(romilb): Update these error strs to also include link to docs
|
626
638
|
# when docs are ready.
|
@@ -948,14 +960,14 @@ def construct_ssh_jump_command(
|
|
948
960
|
proxy_cmd_path = os.path.expanduser(proxy_cmd_path)
|
949
961
|
# adding execution permission to the proxy command script
|
950
962
|
os.chmod(proxy_cmd_path, os.stat(proxy_cmd_path).st_mode | 0o111)
|
951
|
-
kube_context_flag = f' {current_kube_context}' if (
|
952
|
-
|
953
|
-
kube_namespace_flag = f' {current_kube_namespace}' if (
|
963
|
+
kube_context_flag = f'-c {current_kube_context} ' if (
|
964
|
+
current_kube_context is not None) else ''
|
965
|
+
kube_namespace_flag = f'-n {current_kube_namespace} ' if (
|
954
966
|
current_kube_namespace is not None) else ''
|
955
967
|
ssh_jump_proxy_command += (f' -o ProxyCommand=\'{proxy_cmd_path} '
|
956
|
-
f'{proxy_cmd_target_pod}'
|
957
968
|
f'{kube_context_flag}'
|
958
|
-
f'{kube_namespace_flag}\'')
|
969
|
+
f'{kube_namespace_flag}'
|
970
|
+
f'{proxy_cmd_target_pod}\'')
|
959
971
|
return ssh_jump_proxy_command
|
960
972
|
|
961
973
|
|
@@ -963,7 +975,8 @@ def get_ssh_proxy_command(
|
|
963
975
|
k8s_ssh_target: str,
|
964
976
|
network_mode: kubernetes_enums.KubernetesNetworkingMode,
|
965
977
|
private_key_path: Optional[str] = None,
|
966
|
-
namespace: Optional[str] = None) -> str:
|
978
|
+
namespace: Optional[str] = None,
|
979
|
+
context: Optional[str] = None) -> str:
|
967
980
|
"""Generates the SSH proxy command to connect to the pod.
|
968
981
|
|
969
982
|
Uses a jump pod if the network mode is NODEPORT, and direct port-forwarding
|
@@ -1011,11 +1024,11 @@ def get_ssh_proxy_command(
|
|
1011
1024
|
Required for NODEPORT networking mode.
|
1012
1025
|
"""
|
1013
1026
|
# Fetch IP to connect to for the jump svc
|
1014
|
-
ssh_jump_ip = get_external_ip(network_mode)
|
1027
|
+
ssh_jump_ip = get_external_ip(network_mode, context)
|
1015
1028
|
assert private_key_path is not None, 'Private key path must be provided'
|
1016
1029
|
if network_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
|
1017
1030
|
assert namespace is not None, 'Namespace must be provided for NodePort'
|
1018
|
-
ssh_jump_port = get_port(k8s_ssh_target, namespace)
|
1031
|
+
ssh_jump_port = get_port(k8s_ssh_target, namespace, context)
|
1019
1032
|
ssh_jump_proxy_command = construct_ssh_jump_command(
|
1020
1033
|
private_key_path, ssh_jump_ip, ssh_jump_port=ssh_jump_port)
|
1021
1034
|
else:
|
@@ -1061,7 +1074,7 @@ def create_proxy_command_script() -> str:
|
|
1061
1074
|
return port_fwd_proxy_cmd_path
|
1062
1075
|
|
1063
1076
|
|
1064
|
-
def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
|
1077
|
+
def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str, context: str,
|
1065
1078
|
service_type: kubernetes_enums.KubernetesServiceType):
|
1066
1079
|
"""Sets up Kubernetes service resource to access for SSH jump pod.
|
1067
1080
|
|
@@ -1083,13 +1096,14 @@ def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
|
|
1083
1096
|
|
1084
1097
|
# Create service
|
1085
1098
|
try:
|
1086
|
-
kubernetes.core_api().create_namespaced_service(
|
1087
|
-
|
1099
|
+
kubernetes.core_api(context).create_namespaced_service(
|
1100
|
+
namespace, content['service_spec'])
|
1088
1101
|
except kubernetes.api_exception() as e:
|
1089
1102
|
# SSH Jump Pod service already exists.
|
1090
1103
|
if e.status == 409:
|
1091
|
-
ssh_jump_service = kubernetes.core_api(
|
1092
|
-
name=ssh_jump_name,
|
1104
|
+
ssh_jump_service = kubernetes.core_api(
|
1105
|
+
context).read_namespaced_service(name=ssh_jump_name,
|
1106
|
+
namespace=namespace)
|
1093
1107
|
curr_svc_type = ssh_jump_service.spec.type
|
1094
1108
|
if service_type.value == curr_svc_type:
|
1095
1109
|
# If the currently existing SSH Jump service's type is identical
|
@@ -1101,9 +1115,9 @@ def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
|
|
1101
1115
|
# If a different type of service type for SSH Jump pod compared
|
1102
1116
|
# to user's configuration for networking mode exists, we remove
|
1103
1117
|
# existing servie to create a new one following user's config
|
1104
|
-
kubernetes.core_api().delete_namespaced_service(
|
1118
|
+
kubernetes.core_api(context).delete_namespaced_service(
|
1105
1119
|
name=ssh_jump_name, namespace=namespace)
|
1106
|
-
kubernetes.core_api().create_namespaced_service(
|
1120
|
+
kubernetes.core_api(context).create_namespaced_service(
|
1107
1121
|
namespace, content['service_spec'])
|
1108
1122
|
port_forward_mode = (
|
1109
1123
|
kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
|
@@ -1132,7 +1146,7 @@ def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
|
|
1132
1146
|
|
1133
1147
|
|
1134
1148
|
def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
|
1135
|
-
ssh_key_secret: str, namespace: str):
|
1149
|
+
ssh_key_secret: str, namespace: str, context: str):
|
1136
1150
|
"""Sets up Kubernetes RBAC and pod for SSH jump host.
|
1137
1151
|
|
1138
1152
|
Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters
|
@@ -1161,7 +1175,7 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
|
|
1161
1175
|
|
1162
1176
|
# ServiceAccount
|
1163
1177
|
try:
|
1164
|
-
kubernetes.core_api().create_namespaced_service_account(
|
1178
|
+
kubernetes.core_api(context).create_namespaced_service_account(
|
1165
1179
|
namespace, content['service_account'])
|
1166
1180
|
except kubernetes.api_exception() as e:
|
1167
1181
|
if e.status == 409:
|
@@ -1174,7 +1188,8 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
|
|
1174
1188
|
logger.info('Created SSH Jump ServiceAccount.')
|
1175
1189
|
# Role
|
1176
1190
|
try:
|
1177
|
-
kubernetes.auth_api().create_namespaced_role(
|
1191
|
+
kubernetes.auth_api(context).create_namespaced_role(
|
1192
|
+
namespace, content['role'])
|
1178
1193
|
except kubernetes.api_exception() as e:
|
1179
1194
|
if e.status == 409:
|
1180
1195
|
logger.info(
|
@@ -1185,7 +1200,7 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
|
|
1185
1200
|
logger.info('Created SSH Jump Role.')
|
1186
1201
|
# RoleBinding
|
1187
1202
|
try:
|
1188
|
-
kubernetes.auth_api().create_namespaced_role_binding(
|
1203
|
+
kubernetes.auth_api(context).create_namespaced_role_binding(
|
1189
1204
|
namespace, content['role_binding'])
|
1190
1205
|
except kubernetes.api_exception() as e:
|
1191
1206
|
if e.status == 409:
|
@@ -1198,8 +1213,8 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
|
|
1198
1213
|
logger.info('Created SSH Jump RoleBinding.')
|
1199
1214
|
# Pod
|
1200
1215
|
try:
|
1201
|
-
kubernetes.core_api().create_namespaced_pod(
|
1202
|
-
|
1216
|
+
kubernetes.core_api(context).create_namespaced_pod(
|
1217
|
+
namespace, content['pod_spec'])
|
1203
1218
|
except kubernetes.api_exception() as e:
|
1204
1219
|
if e.status == 409:
|
1205
1220
|
logger.info(
|
@@ -1211,7 +1226,7 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
|
|
1211
1226
|
logger.info(f'Created SSH Jump Host {ssh_jump_name}.')
|
1212
1227
|
|
1213
1228
|
|
1214
|
-
def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
|
1229
|
+
def clean_zombie_ssh_jump_pod(namespace: str, context: str, node_id: str):
|
1215
1230
|
"""Analyzes SSH jump pod and removes if it is in a bad state
|
1216
1231
|
|
1217
1232
|
Prevents the existence of a dangling SSH jump pod. This could happen
|
@@ -1231,7 +1246,8 @@ def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
|
|
1231
1246
|
|
1232
1247
|
# Get the SSH jump pod name from the head pod
|
1233
1248
|
try:
|
1234
|
-
pod = kubernetes.core_api().read_namespaced_pod(
|
1249
|
+
pod = kubernetes.core_api(context).read_namespaced_pod(
|
1250
|
+
node_id, namespace)
|
1235
1251
|
except kubernetes.api_exception() as e:
|
1236
1252
|
if e.status == 404:
|
1237
1253
|
logger.warning(f'Failed to get pod {node_id},'
|
@@ -1240,7 +1256,7 @@ def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
|
|
1240
1256
|
else:
|
1241
1257
|
ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump')
|
1242
1258
|
try:
|
1243
|
-
ssh_jump_pod = kubernetes.core_api().read_namespaced_pod(
|
1259
|
+
ssh_jump_pod = kubernetes.core_api(context).read_namespaced_pod(
|
1244
1260
|
ssh_jump_name, namespace)
|
1245
1261
|
cont_ready_cond = find(ssh_jump_pod.status.conditions,
|
1246
1262
|
lambda c: c.type == 'ContainersReady')
|
@@ -1251,9 +1267,9 @@ def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
|
|
1251
1267
|
# ssh jump pod, lets remove it and the service. Otherwise, main
|
1252
1268
|
# container is ready and its lifecycle management script takes
|
1253
1269
|
# care of the cleaning.
|
1254
|
-
kubernetes.core_api().delete_namespaced_pod(
|
1255
|
-
|
1256
|
-
kubernetes.core_api().delete_namespaced_service(
|
1270
|
+
kubernetes.core_api(context).delete_namespaced_pod(
|
1271
|
+
ssh_jump_name, namespace)
|
1272
|
+
kubernetes.core_api(context).delete_namespaced_service(
|
1257
1273
|
ssh_jump_name, namespace)
|
1258
1274
|
except kubernetes.api_exception() as e:
|
1259
1275
|
# We keep the warning in debug to avoid polluting the `sky launch`
|
@@ -1265,7 +1281,7 @@ def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
|
|
1265
1281
|
# We encountered an issue while checking ssh jump pod. To be on
|
1266
1282
|
# the safe side, lets remove its service so the port is freed
|
1267
1283
|
try:
|
1268
|
-
kubernetes.core_api().delete_namespaced_service(
|
1284
|
+
kubernetes.core_api(context).delete_namespaced_service(
|
1269
1285
|
ssh_jump_name, namespace)
|
1270
1286
|
except kubernetes.api_exception():
|
1271
1287
|
pass
|
@@ -1521,10 +1537,10 @@ def merge_custom_metadata(original_metadata: Dict[str, Any]) -> None:
|
|
1521
1537
|
merge_dicts(custom_metadata, original_metadata)
|
1522
1538
|
|
1523
1539
|
|
1524
|
-
def check_nvidia_runtime_class() -> bool:
|
1540
|
+
def check_nvidia_runtime_class(context: Optional[str] = None) -> bool:
|
1525
1541
|
"""Checks if the 'nvidia' RuntimeClass exists in the cluster"""
|
1526
1542
|
# Fetch the list of available RuntimeClasses
|
1527
|
-
runtime_classes = kubernetes.node_api().list_runtime_class()
|
1543
|
+
runtime_classes = kubernetes.node_api(context).list_runtime_class()
|
1528
1544
|
|
1529
1545
|
# Check if 'nvidia' RuntimeClass exists
|
1530
1546
|
nvidia_exists = any(
|
@@ -1532,7 +1548,7 @@ def check_nvidia_runtime_class() -> bool:
|
|
1532
1548
|
return nvidia_exists
|
1533
1549
|
|
1534
1550
|
|
1535
|
-
def check_secret_exists(secret_name: str, namespace: str) -> bool:
|
1551
|
+
def check_secret_exists(secret_name: str, namespace: str, context: str) -> bool:
|
1536
1552
|
"""Checks if a secret exists in a namespace
|
1537
1553
|
|
1538
1554
|
Args:
|
@@ -1541,7 +1557,7 @@ def check_secret_exists(secret_name: str, namespace: str) -> bool:
|
|
1541
1557
|
"""
|
1542
1558
|
|
1543
1559
|
try:
|
1544
|
-
kubernetes.core_api().read_namespaced_secret(
|
1560
|
+
kubernetes.core_api(context).read_namespaced_secret(
|
1545
1561
|
secret_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
|
1546
1562
|
except kubernetes.api_exception() as e:
|
1547
1563
|
if e.status == 404:
|
@@ -1551,17 +1567,18 @@ def check_secret_exists(secret_name: str, namespace: str) -> bool:
|
|
1551
1567
|
return True
|
1552
1568
|
|
1553
1569
|
|
1554
|
-
def create_namespace(namespace: str) -> None:
|
1570
|
+
def create_namespace(namespace: str, context: Optional[str]) -> None:
|
1555
1571
|
"""Creates a namespace in the cluster.
|
1556
1572
|
|
1557
1573
|
If the namespace already exists, logs a message and does nothing.
|
1558
1574
|
|
1559
1575
|
Args:
|
1560
1576
|
namespace: Name of the namespace to create
|
1577
|
+
context: Name of the context to use. Can be none to use default context.
|
1561
1578
|
"""
|
1562
1579
|
kubernetes_client = kubernetes.kubernetes.client
|
1563
1580
|
try:
|
1564
|
-
kubernetes.core_api().read_namespace(namespace)
|
1581
|
+
kubernetes.core_api(context).read_namespace(namespace)
|
1565
1582
|
except kubernetes.api_exception() as e:
|
1566
1583
|
if e.status != 404:
|
1567
1584
|
raise
|
@@ -1572,7 +1589,7 @@ def create_namespace(namespace: str) -> None:
|
|
1572
1589
|
merge_custom_metadata(ns_metadata)
|
1573
1590
|
namespace_obj = kubernetes_client.V1Namespace(metadata=ns_metadata)
|
1574
1591
|
try:
|
1575
|
-
kubernetes.core_api().create_namespace(namespace_obj)
|
1592
|
+
kubernetes.core_api(context).create_namespace(namespace_obj)
|
1576
1593
|
except kubernetes.api_exception() as e:
|
1577
1594
|
if e.status == 409:
|
1578
1595
|
logger.info(f'Namespace {namespace} already exists in the cluster.')
|
@@ -1729,3 +1746,13 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]:
|
|
1729
1746
|
free={'nvidia.com/gpu': int(accelerators_available)})
|
1730
1747
|
|
1731
1748
|
return node_info_dict
|
1749
|
+
|
1750
|
+
|
1751
|
+
def get_namespace_from_config(provider_config: Dict[str, Any]) -> str:
|
1752
|
+
return provider_config.get('namespace',
|
1753
|
+
get_current_kube_config_context_namespace())
|
1754
|
+
|
1755
|
+
|
1756
|
+
def get_context_from_config(provider_config: Dict[str, Any]) -> str:
|
1757
|
+
return provider_config.get('context',
|
1758
|
+
get_current_kube_config_context_name())
|
sky/skylet/log_lib.py
CHANGED
@@ -208,9 +208,12 @@ def run_with_log(
|
|
208
208
|
str(proc.pid),
|
209
209
|
]
|
210
210
|
|
211
|
+
# We do not need to set `start_new_session=True` here, as the
|
212
|
+
# daemon script will detach itself from the parent process with
|
213
|
+
# fork to avoid being killed by ray job. See the reason we
|
214
|
+
# daemonize the process in `sky/skylet/subprocess_daemon.py`.
|
211
215
|
subprocess.Popen(
|
212
216
|
daemon_cmd,
|
213
|
-
start_new_session=True,
|
214
217
|
# Suppress output
|
215
218
|
stdout=subprocess.DEVNULL,
|
216
219
|
stderr=subprocess.DEVNULL,
|
sky/skylet/subprocess_daemon.py
CHANGED
@@ -1,17 +1,44 @@
|
|
1
1
|
"""Sky subprocess daemon.
|
2
|
-
|
3
2
|
Wait for parent_pid to exit, then SIGTERM (or SIGKILL if needed) the child
|
4
3
|
processes of proc_pid.
|
5
4
|
"""
|
6
|
-
|
7
5
|
import argparse
|
6
|
+
import os
|
8
7
|
import sys
|
9
8
|
import time
|
10
9
|
|
11
10
|
import psutil
|
12
11
|
|
13
|
-
if __name__ == '__main__':
|
14
12
|
|
13
|
+
def daemonize():
|
14
|
+
"""Detaches the process from its parent process with double-forking.
|
15
|
+
|
16
|
+
This detachment is crucial in the context of SkyPilot and Ray job. When
|
17
|
+
'sky cancel' is executed, it uses Ray's stop job API to terminate the job.
|
18
|
+
Without daemonization, this subprocess_daemon process would be terminated
|
19
|
+
along with its parent process, ray::task, which is launched with Ray job.
|
20
|
+
Daemonization ensures this process survives the 'sky cancel' command,
|
21
|
+
allowing it to prevent orphaned processes of Ray job.
|
22
|
+
"""
|
23
|
+
# First fork: Creates a child process identical to the parent
|
24
|
+
if os.fork() > 0:
|
25
|
+
# Parent process exits, allowing the child to run independently
|
26
|
+
sys.exit()
|
27
|
+
|
28
|
+
# Continues to run from first forked child process.
|
29
|
+
# Detach from parent environment.
|
30
|
+
os.setsid()
|
31
|
+
|
32
|
+
# Second fork: Creates a grandchild process
|
33
|
+
if os.fork() > 0:
|
34
|
+
# First child exits, orphaning the grandchild
|
35
|
+
sys.exit()
|
36
|
+
# Continues execution in the grandchild process
|
37
|
+
# This process is now fully detached from the original parent and terminal
|
38
|
+
|
39
|
+
|
40
|
+
if __name__ == '__main__':
|
41
|
+
daemonize()
|
15
42
|
parser = argparse.ArgumentParser()
|
16
43
|
parser.add_argument('--parent-pid', type=int, required=True)
|
17
44
|
parser.add_argument('--proc-pid', type=int, required=True)
|
@@ -28,29 +55,34 @@ if __name__ == '__main__':
|
|
28
55
|
if process is None:
|
29
56
|
sys.exit()
|
30
57
|
|
58
|
+
children = []
|
31
59
|
if parent_process is not None:
|
32
60
|
# Wait for either parent or target process to exit.
|
33
61
|
while process.is_running() and parent_process.is_running():
|
62
|
+
try:
|
63
|
+
# process.children() must be called while the target process
|
64
|
+
# is alive, as it will return an empty list if the target
|
65
|
+
# process has already terminated.
|
66
|
+
tmp_children = process.children(recursive=True)
|
67
|
+
if tmp_children:
|
68
|
+
children = tmp_children
|
69
|
+
except psutil.NoSuchProcess:
|
70
|
+
pass
|
34
71
|
time.sleep(1)
|
72
|
+
children.append(process)
|
35
73
|
|
36
|
-
|
37
|
-
children = process.children(recursive=True)
|
38
|
-
children.append(process)
|
39
|
-
except psutil.NoSuchProcess:
|
40
|
-
sys.exit()
|
41
|
-
|
42
|
-
for pid in children:
|
74
|
+
for child in children:
|
43
75
|
try:
|
44
|
-
|
76
|
+
child.terminate()
|
45
77
|
except psutil.NoSuchProcess:
|
46
|
-
|
78
|
+
continue
|
47
79
|
|
48
80
|
# Wait 30s for the processes to exit gracefully.
|
49
81
|
time.sleep(30)
|
50
82
|
|
51
83
|
# SIGKILL if they're still running.
|
52
|
-
for pid in children:
|
84
|
+
for child in children:
|
53
85
|
try:
|
54
|
-
|
86
|
+
child.kill()
|
55
87
|
except psutil.NoSuchProcess:
|
56
|
-
|
88
|
+
continue
|
sky/templates/kubernetes-port-forward-proxy-command.sh
CHANGED
@@ -1,15 +1,40 @@
|
|
1
1
|
#!/usr/bin/env bash
|
2
2
|
set -uo pipefail
|
3
3
|
|
4
|
+
KUBE_CONTEXT=""
|
5
|
+
KUBE_NAMESPACE=""
|
6
|
+
|
7
|
+
# Parse flags
|
8
|
+
while getopts ":c:n:" opt; do
|
9
|
+
case ${opt} in
|
10
|
+
c)
|
11
|
+
KUBE_CONTEXT="$OPTARG"
|
12
|
+
;;
|
13
|
+
n)
|
14
|
+
KUBE_NAMESPACE="$OPTARG"
|
15
|
+
;;
|
16
|
+
\?)
|
17
|
+
echo "Invalid option: -$OPTARG" >&2
|
18
|
+
echo "Usage: $0 <pod_name> [-c kube_context] [-n kube_namespace]" >&2
|
19
|
+
exit 1
|
20
|
+
;;
|
21
|
+
:)
|
22
|
+
echo "Option -$OPTARG requires an argument." >&2
|
23
|
+
exit 1
|
24
|
+
;;
|
25
|
+
esac
|
26
|
+
done
|
27
|
+
|
28
|
+
# Shift the processed options away so that $1 becomes the pod name
|
29
|
+
shift $((OPTIND -1))
|
30
|
+
|
4
31
|
# Check if pod name is passed as an argument
|
5
32
|
if [ $# -lt 1 ]; then
|
6
|
-
echo "Usage: $0 <pod_name> [kube_context] [kube_namespace]" >&2
|
33
|
+
echo "Usage: $0 <pod_name> [-c kube_context] [-n kube_namespace]" >&2
|
7
34
|
exit 1
|
8
35
|
fi
|
9
36
|
|
10
|
-
POD_NAME="$1" # The first argument is the name of the pod
|
11
|
-
KUBE_CONTEXT="${2:-}" # The second argument is the kube context, default is empty
|
12
|
-
KUBE_NAMESPACE="${3:-}" # The third argument is the kube namespace, default is empty
|
37
|
+
POD_NAME="$1" # The first positional argument is the name of the pod
|
13
38
|
|
14
39
|
# Checks if socat is installed
|
15
40
|
if ! command -v socat > /dev/null; then
|
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -21,6 +21,11 @@ provider:
|
|
21
21
|
# The namespace to create the Ray cluster in.
|
22
22
|
namespace: {{k8s_namespace}}
|
23
23
|
|
24
|
+
# The kubecontext used to connect to the Kubernetes cluster.
|
25
|
+
{% if k8s_context is not none %}
|
26
|
+
context: {{k8s_context}}
|
27
|
+
{% endif %}
|
28
|
+
|
24
29
|
# This should be one of KubernetesPortMode
|
25
30
|
port_mode: {{k8s_port_mode}}
|
26
31
|
|
sky/templates/lambda-ray.yml.j2
CHANGED
@@ -89,13 +89,13 @@ setup_commands:
|
|
89
89
|
# Increment the following for catching performance bugs easier:
|
90
90
|
# current num items (num SSH connections): 2
|
91
91
|
head_start_ray_commands:
|
92
|
-
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
|
92
|
+
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --min-worker-port 11002 --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
|
93
93
|
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
|
94
94
|
{{dump_port_command}}; {{ray_head_wait_initialized_command}}
|
95
95
|
|
96
96
|
{%- if num_nodes > 1 %}
|
97
97
|
worker_start_ray_commands:
|
98
|
-
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
|
98
|
+
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --min-worker-port 11002 --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
|
99
99
|
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
|
100
100
|
{%- else %}
|
101
101
|
worker_start_ray_commands: []
|
sky/utils/command_runner.py
CHANGED
@@ -649,13 +649,13 @@ class KubernetesCommandRunner(CommandRunner):
|
|
649
649
|
|
650
650
|
def __init__(
|
651
651
|
self,
|
652
|
-
node: Tuple[str, str],
|
652
|
+
node: Tuple[Tuple[str, str], str],
|
653
653
|
**kwargs,
|
654
654
|
):
|
655
655
|
"""Initialize KubernetesCommandRunner.
|
656
656
|
|
657
657
|
Example Usage:
|
658
|
-
runner = KubernetesCommandRunner((namespace, pod_name))
|
658
|
+
runner = KubernetesCommandRunner((namespace, context), pod_name))
|
659
659
|
runner.run('ls -l')
|
660
660
|
runner.rsync(source, target, up=True)
|
661
661
|
|
@@ -664,7 +664,11 @@ class KubernetesCommandRunner(CommandRunner):
|
|
664
664
|
"""
|
665
665
|
del kwargs
|
666
666
|
super().__init__(node)
|
667
|
-
self.namespace, self.pod_name = node
|
667
|
+
(self.namespace, self.context), self.pod_name = node
|
668
|
+
|
669
|
+
@property
|
670
|
+
def node_id(self) -> str:
|
671
|
+
return f'{self.context}-{self.namespace}-{self.pod_name}'
|
668
672
|
|
669
673
|
@timeline.event
|
670
674
|
def run(
|
@@ -719,9 +723,11 @@ class KubernetesCommandRunner(CommandRunner):
|
|
719
723
|
if connect_timeout is None:
|
720
724
|
connect_timeout = _DEFAULT_CONNECT_TIMEOUT
|
721
725
|
kubectl_args = [
|
722
|
-
'--pod-running-timeout', f'{connect_timeout}s', '-n',
|
723
|
-
self.namespace, self.pod_name
|
726
|
+
'--pod-running-timeout', f'{connect_timeout}s', '-n', self.namespace
|
724
727
|
]
|
728
|
+
if self.context:
|
729
|
+
kubectl_args += ['--context', self.context]
|
730
|
+
kubectl_args += [self.pod_name]
|
725
731
|
if ssh_mode == SshMode.LOGIN:
|
726
732
|
assert isinstance(cmd, list), 'cmd must be a list for login mode.'
|
727
733
|
base_cmd = ['kubectl', 'exec', '-it', *kubectl_args, '--']
|
@@ -821,7 +827,7 @@ class KubernetesCommandRunner(CommandRunner):
|
|
821
827
|
self._rsync(
|
822
828
|
source,
|
823
829
|
target,
|
824
|
-
node_destination=f'{self.pod_name}@{self.namespace}',
|
830
|
+
node_destination=f'{self.pod_name}@{self.namespace}+{self.context}',
|
825
831
|
up=up,
|
826
832
|
rsh_option=helper_path,
|
827
833
|
log_path=log_path,
|
sky/utils/command_runner.pyi
CHANGED
sky/utils/kubernetes/rsync_helper.sh
CHANGED
@@ -1,7 +1,16 @@
|
|
1
|
-
# When using pod@namespace, rsync passes args as: {us} -l pod namespace
|
1
|
+
# When using pod@namespace+context, rsync passes args as: {us} -l pod namespace+context
|
2
|
+
# We need to split the pod@namespace+context into pod, namespace and context
|
3
|
+
# For backward compatibility, we use + as the separator between namespace and context and add handling when context is not provided
|
2
4
|
shift
|
3
5
|
pod=$1
|
4
6
|
shift
|
5
|
-
|
7
|
+
namespace_context=$1
|
8
|
+
namespace=$(echo $namespace_context | cut -d+ -f1)
|
9
|
+
context=$(echo $namespace_context | grep '+' >/dev/null && echo $namespace_context | cut -d+ -f2- || echo "")
|
10
|
+
context_lower=$(echo "$context" | tr '[:upper:]' '[:lower:]')
|
6
11
|
shift
|
7
|
-
|
12
|
+
if [ -z "$context" ] || [ "$context_lower" = "none" ]; then
|
13
|
+
kubectl exec -i $pod -n $namespace -- "$@"
|
14
|
+
else
|
15
|
+
kubectl exec -i $pod -n $namespace --context=$context -- "$@"
|
16
|
+
fi
|