skypilot-nightly 1.0.0.dev20240909__py3-none-any.whl → 1.0.0.dev20240911__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +33 -67
  3. sky/authentication.py +12 -7
  4. sky/backends/backend_utils.py +40 -33
  5. sky/backends/cloud_vm_ray_backend.py +1 -1
  6. sky/check.py +1 -1
  7. sky/clouds/aws.py +8 -6
  8. sky/clouds/azure.py +7 -5
  9. sky/clouds/cloud.py +43 -14
  10. sky/clouds/cudo.py +1 -1
  11. sky/clouds/fluidstack.py +2 -2
  12. sky/clouds/gcp.py +12 -7
  13. sky/clouds/kubernetes.py +28 -15
  14. sky/clouds/lambda_cloud.py +2 -2
  15. sky/clouds/oci.py +1 -1
  16. sky/clouds/paperspace.py +1 -1
  17. sky/clouds/runpod.py +1 -1
  18. sky/clouds/scp.py +2 -2
  19. sky/clouds/service_catalog/aws_catalog.py +1 -1
  20. sky/clouds/vsphere.py +1 -1
  21. sky/provision/kubernetes/config.py +52 -34
  22. sky/provision/kubernetes/instance.py +73 -61
  23. sky/provision/kubernetes/network.py +11 -5
  24. sky/provision/kubernetes/network_utils.py +10 -8
  25. sky/provision/kubernetes/utils.py +72 -45
  26. sky/skylet/log_lib.py +4 -1
  27. sky/skylet/subprocess_daemon.py +47 -15
  28. sky/templates/kubernetes-port-forward-proxy-command.sh +29 -4
  29. sky/templates/kubernetes-ray.yml.j2 +5 -0
  30. sky/templates/lambda-ray.yml.j2 +2 -2
  31. sky/utils/command_runner.py +12 -6
  32. sky/utils/command_runner.pyi +1 -1
  33. sky/utils/kubernetes/rsync_helper.sh +12 -3
  34. {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/METADATA +1 -1
  35. {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/RECORD +39 -39
  36. {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/LICENSE +0 -0
  37. {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/WHEEL +0 -0
  38. {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/entry_points.txt +0 -0
  39. {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/utils.py CHANGED
@@ -58,7 +58,11 @@ KIND_CONTEXT_NAME = 'kind-skypilot' # Context name used by sky local up
 
  # Port-forward proxy command constants
  PORT_FORWARD_PROXY_CMD_TEMPLATE = 'kubernetes-port-forward-proxy-command.sh'
- PORT_FORWARD_PROXY_CMD_PATH = '~/.sky/kubernetes-port-forward-proxy-command.sh'
+ # We add a version suffix to the port-forward proxy command to ensure backward
+ # compatibility and avoid overwriting the older version.
+ PORT_FORWARD_PROXY_CMD_VERSION = 2
+ PORT_FORWARD_PROXY_CMD_PATH = ('~/.sky/kubernetes-port-forward-proxy-command-'
+ f'v{PORT_FORWARD_PROXY_CMD_VERSION}.sh')
 
  logger = sky_logging.init_logger(__name__)
 
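For reference, a minimal sketch of the path the new constants resolve to (the tilde is expanded with os.path.expanduser where the script is used):

import os

PORT_FORWARD_PROXY_CMD_VERSION = 2
PORT_FORWARD_PROXY_CMD_PATH = ('~/.sky/kubernetes-port-forward-proxy-command-'
                               f'v{PORT_FORWARD_PROXY_CMD_VERSION}.sh')
print(PORT_FORWARD_PROXY_CMD_PATH)
# ~/.sky/kubernetes-port-forward-proxy-command-v2.sh
print(os.path.expanduser(PORT_FORWARD_PROXY_CMD_PATH))
# e.g. /home/<user>/.sky/kubernetes-port-forward-proxy-command-v2.sh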
@@ -355,6 +359,10 @@ def get_kubernetes_nodes() -> List[Any]:
 
 
  def get_kubernetes_pods() -> List[Any]:
+ """Gets the kubernetes pods in the current namespace and current context.
+
+ Used for computing cluster resource usage.
+ """
  try:
  ns = get_current_kube_config_context_namespace()
  pods = kubernetes.core_api().list_namespaced_pod(
@@ -572,37 +580,39 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]:
  f'to set up GPUs.{suffix}')
 
 
- def get_head_ssh_port(cluster_name: str, namespace: str) -> int:
+ def get_head_ssh_port(cluster_name: str, namespace: str,
+ context: Optional[str]) -> int:
  svc_name = f'{cluster_name}-head-ssh'
- return get_port(svc_name, namespace)
+ return get_port(svc_name, namespace, context)
 
 
- def get_port(svc_name: str, namespace: str) -> int:
+ def get_port(svc_name: str, namespace: str, context: Optional[str]) -> int:
  """Gets the nodeport of the specified service.
 
  Args:
  svc_name (str): Name of the kubernetes service. Note that this may be
  different from the cluster name.
  namespace (str): Kubernetes namespace to look for the service in.
+ context (str): Kubernetes context to use.
  """
- head_service = kubernetes.core_api().read_namespaced_service(
+ head_service = kubernetes.core_api(context).read_namespaced_service(
  svc_name, namespace)
  return head_service.spec.ports[0].node_port
 
 
- def get_external_ip(
- network_mode: Optional[kubernetes_enums.KubernetesNetworkingMode]):
+ def get_external_ip(network_mode: Optional[
+ kubernetes_enums.KubernetesNetworkingMode], context: Optional[str]) -> str:
  if network_mode == kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD:
  return '127.0.0.1'
  # Return the IP address of the first node with an external IP
- nodes = kubernetes.core_api().list_node().items
+ nodes = kubernetes.core_api(context).list_node().items
  for node in nodes:
  if node.status.addresses:
  for address in node.status.addresses:
  if address.type == 'ExternalIP':
  return address.address
  # If no external IP is found, use the API server IP
- api_host = kubernetes.core_api().api_client.configuration.host
+ api_host = kubernetes.core_api(context).api_client.configuration.host
  parsed_url = urlparse(api_host)
  return parsed_url.hostname
 
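Callers now thread the kubeconfig context through explicitly. A minimal sketch of the updated call pattern, with hypothetical cluster, namespace and context names (the kubernetes_enums import path is an assumption about this wheel's layout):

from sky.provision.kubernetes.utils import get_external_ip, get_head_ssh_port
from sky.utils import kubernetes_enums  # assumed import path

namespace = 'default'                    # hypothetical
context = 'my-kube-context'              # hypothetical kubeconfig context
ssh_port = get_head_ssh_port('my-cluster', namespace, context)
external_ip = get_external_ip(
    kubernetes_enums.KubernetesNetworkingMode.NODEPORT, context)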
@@ -620,7 +630,9 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \
  """
  try:
  ns = get_current_kube_config_context_namespace()
- kubernetes.core_api().list_namespaced_pod(ns, _request_timeout=timeout)
+ context = get_current_kube_config_context_name()
+ kubernetes.core_api(context).list_namespaced_pod(
+ ns, _request_timeout=timeout)
  except ImportError:
  # TODO(romilb): Update these error strs to also include link to docs
  # when docs are ready.
@@ -948,14 +960,14 @@ def construct_ssh_jump_command(
  proxy_cmd_path = os.path.expanduser(proxy_cmd_path)
  # adding execution permission to the proxy command script
  os.chmod(proxy_cmd_path, os.stat(proxy_cmd_path).st_mode | 0o111)
- kube_context_flag = f' {current_kube_context}' if (current_kube_context
- is not None) else ''
- kube_namespace_flag = f' {current_kube_namespace}' if (
+ kube_context_flag = f'-c {current_kube_context} ' if (
+ current_kube_context is not None) else ''
+ kube_namespace_flag = f'-n {current_kube_namespace} ' if (
  current_kube_namespace is not None) else ''
  ssh_jump_proxy_command += (f' -o ProxyCommand=\'{proxy_cmd_path} '
- f'{proxy_cmd_target_pod}'
  f'{kube_context_flag}'
- f'{kube_namespace_flag}\'')
+ f'{kube_namespace_flag}'
+ f'{proxy_cmd_target_pod}\'')
  return ssh_jump_proxy_command
 
 
@@ -963,7 +975,8 @@ def get_ssh_proxy_command(
  k8s_ssh_target: str,
  network_mode: kubernetes_enums.KubernetesNetworkingMode,
  private_key_path: Optional[str] = None,
- namespace: Optional[str] = None) -> str:
+ namespace: Optional[str] = None,
+ context: Optional[str] = None) -> str:
  """Generates the SSH proxy command to connect to the pod.
 
  Uses a jump pod if the network mode is NODEPORT, and direct port-forwarding
@@ -1011,11 +1024,11 @@ def get_ssh_proxy_command(
  Required for NODEPORT networking mode.
  """
  # Fetch IP to connect to for the jump svc
- ssh_jump_ip = get_external_ip(network_mode)
+ ssh_jump_ip = get_external_ip(network_mode, context)
  assert private_key_path is not None, 'Private key path must be provided'
  if network_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
  assert namespace is not None, 'Namespace must be provided for NodePort'
- ssh_jump_port = get_port(k8s_ssh_target, namespace)
+ ssh_jump_port = get_port(k8s_ssh_target, namespace, context)
  ssh_jump_proxy_command = construct_ssh_jump_command(
  private_key_path, ssh_jump_ip, ssh_jump_port=ssh_jump_port)
  else:
@@ -1061,7 +1074,7 @@ def create_proxy_command_script() -> str:
  return port_fwd_proxy_cmd_path
 
 
- def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
+ def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str, context: str,
  service_type: kubernetes_enums.KubernetesServiceType):
  """Sets up Kubernetes service resource to access for SSH jump pod.
 
@@ -1083,13 +1096,14 @@ def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
 
  # Create service
  try:
- kubernetes.core_api().create_namespaced_service(namespace,
- content['service_spec'])
+ kubernetes.core_api(context).create_namespaced_service(
+ namespace, content['service_spec'])
  except kubernetes.api_exception() as e:
  # SSH Jump Pod service already exists.
  if e.status == 409:
- ssh_jump_service = kubernetes.core_api().read_namespaced_service(
- name=ssh_jump_name, namespace=namespace)
+ ssh_jump_service = kubernetes.core_api(
+ context).read_namespaced_service(name=ssh_jump_name,
+ namespace=namespace)
  curr_svc_type = ssh_jump_service.spec.type
  if service_type.value == curr_svc_type:
  # If the currently existing SSH Jump service's type is identical
@@ -1101,9 +1115,9 @@ def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
  # If a different type of service type for SSH Jump pod compared
  # to user's configuration for networking mode exists, we remove
  # existing servie to create a new one following user's config
- kubernetes.core_api().delete_namespaced_service(
+ kubernetes.core_api(context).delete_namespaced_service(
  name=ssh_jump_name, namespace=namespace)
- kubernetes.core_api().create_namespaced_service(
+ kubernetes.core_api(context).create_namespaced_service(
  namespace, content['service_spec'])
  port_forward_mode = (
  kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
@@ -1132,7 +1146,7 @@ def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
 
 
  def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
- ssh_key_secret: str, namespace: str):
+ ssh_key_secret: str, namespace: str, context: str):
  """Sets up Kubernetes RBAC and pod for SSH jump host.
 
  Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters
@@ -1161,7 +1175,7 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
 
  # ServiceAccount
  try:
- kubernetes.core_api().create_namespaced_service_account(
+ kubernetes.core_api(context).create_namespaced_service_account(
  namespace, content['service_account'])
  except kubernetes.api_exception() as e:
  if e.status == 409:
@@ -1174,7 +1188,8 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
  logger.info('Created SSH Jump ServiceAccount.')
  # Role
  try:
- kubernetes.auth_api().create_namespaced_role(namespace, content['role'])
+ kubernetes.auth_api(context).create_namespaced_role(
+ namespace, content['role'])
  except kubernetes.api_exception() as e:
  if e.status == 409:
  logger.info(
@@ -1185,7 +1200,7 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
  logger.info('Created SSH Jump Role.')
  # RoleBinding
  try:
- kubernetes.auth_api().create_namespaced_role_binding(
+ kubernetes.auth_api(context).create_namespaced_role_binding(
  namespace, content['role_binding'])
  except kubernetes.api_exception() as e:
  if e.status == 409:
@@ -1198,8 +1213,8 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
  logger.info('Created SSH Jump RoleBinding.')
  # Pod
  try:
- kubernetes.core_api().create_namespaced_pod(namespace,
- content['pod_spec'])
+ kubernetes.core_api(context).create_namespaced_pod(
+ namespace, content['pod_spec'])
  except kubernetes.api_exception() as e:
  if e.status == 409:
  logger.info(
@@ -1211,7 +1226,7 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
  logger.info(f'Created SSH Jump Host {ssh_jump_name}.')
 
 
- def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
+ def clean_zombie_ssh_jump_pod(namespace: str, context: str, node_id: str):
  """Analyzes SSH jump pod and removes if it is in a bad state
 
  Prevents the existence of a dangling SSH jump pod. This could happen
@@ -1231,7 +1246,8 @@ def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
 
  # Get the SSH jump pod name from the head pod
  try:
- pod = kubernetes.core_api().read_namespaced_pod(node_id, namespace)
+ pod = kubernetes.core_api(context).read_namespaced_pod(
+ node_id, namespace)
  except kubernetes.api_exception() as e:
  if e.status == 404:
  logger.warning(f'Failed to get pod {node_id},'
@@ -1240,7 +1256,7 @@ def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
  else:
  ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump')
  try:
- ssh_jump_pod = kubernetes.core_api().read_namespaced_pod(
+ ssh_jump_pod = kubernetes.core_api(context).read_namespaced_pod(
  ssh_jump_name, namespace)
  cont_ready_cond = find(ssh_jump_pod.status.conditions,
  lambda c: c.type == 'ContainersReady')
@@ -1251,9 +1267,9 @@ def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
  # ssh jump pod, lets remove it and the service. Otherwise, main
  # container is ready and its lifecycle management script takes
  # care of the cleaning.
- kubernetes.core_api().delete_namespaced_pod(ssh_jump_name,
- namespace)
- kubernetes.core_api().delete_namespaced_service(
+ kubernetes.core_api(context).delete_namespaced_pod(
+ ssh_jump_name, namespace)
+ kubernetes.core_api(context).delete_namespaced_service(
  ssh_jump_name, namespace)
  except kubernetes.api_exception() as e:
  # We keep the warning in debug to avoid polluting the `sky launch`
@@ -1265,7 +1281,7 @@ def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
  # We encountered an issue while checking ssh jump pod. To be on
  # the safe side, lets remove its service so the port is freed
  try:
- kubernetes.core_api().delete_namespaced_service(
+ kubernetes.core_api(context).delete_namespaced_service(
  ssh_jump_name, namespace)
  except kubernetes.api_exception():
  pass
@@ -1521,10 +1537,10 @@ def merge_custom_metadata(original_metadata: Dict[str, Any]) -> None:
  merge_dicts(custom_metadata, original_metadata)
 
 
- def check_nvidia_runtime_class() -> bool:
+ def check_nvidia_runtime_class(context: Optional[str] = None) -> bool:
  """Checks if the 'nvidia' RuntimeClass exists in the cluster"""
  # Fetch the list of available RuntimeClasses
- runtime_classes = kubernetes.node_api().list_runtime_class()
+ runtime_classes = kubernetes.node_api(context).list_runtime_class()
 
  # Check if 'nvidia' RuntimeClass exists
  nvidia_exists = any(
@@ -1532,7 +1548,7 @@ def check_nvidia_runtime_class() -> bool:
  return nvidia_exists
 
 
- def check_secret_exists(secret_name: str, namespace: str) -> bool:
+ def check_secret_exists(secret_name: str, namespace: str, context: str) -> bool:
  """Checks if a secret exists in a namespace
 
  Args:
@@ -1541,7 +1557,7 @@ def check_secret_exists(secret_name: str, namespace: str) -> bool:
  """
 
  try:
- kubernetes.core_api().read_namespaced_secret(
+ kubernetes.core_api(context).read_namespaced_secret(
  secret_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
  except kubernetes.api_exception() as e:
  if e.status == 404:
@@ -1551,17 +1567,18 @@ def check_secret_exists(secret_name: str, namespace: str) -> bool:
  return True
 
 
- def create_namespace(namespace: str) -> None:
+ def create_namespace(namespace: str, context: Optional[str]) -> None:
  """Creates a namespace in the cluster.
 
  If the namespace already exists, logs a message and does nothing.
 
  Args:
  namespace: Name of the namespace to create
+ context: Name of the context to use. Can be none to use default context.
  """
  kubernetes_client = kubernetes.kubernetes.client
  try:
- kubernetes.core_api().read_namespace(namespace)
+ kubernetes.core_api(context).read_namespace(namespace)
  except kubernetes.api_exception() as e:
  if e.status != 404:
  raise
@@ -1572,7 +1589,7 @@ def create_namespace(namespace: str) -> None:
  merge_custom_metadata(ns_metadata)
  namespace_obj = kubernetes_client.V1Namespace(metadata=ns_metadata)
  try:
- kubernetes.core_api().create_namespace(namespace_obj)
+ kubernetes.core_api(context).create_namespace(namespace_obj)
  except kubernetes.api_exception() as e:
  if e.status == 409:
  logger.info(f'Namespace {namespace} already exists in the cluster.')
@@ -1729,3 +1746,13 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]:
  free={'nvidia.com/gpu': int(accelerators_available)})
 
  return node_info_dict
+
+
+ def get_namespace_from_config(provider_config: Dict[str, Any]) -> str:
+ return provider_config.get('namespace',
+ get_current_kube_config_context_namespace())
+
+
+ def get_context_from_config(provider_config: Dict[str, Any]) -> str:
+ return provider_config.get('context',
+ get_current_kube_config_context_name())
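The two new helpers let the provisioner read the namespace and context from a cluster's provider config, falling back to the active kubeconfig values. A minimal usage sketch with a hypothetical provider_config dict (the 'namespace' and 'context' keys match the fields the Ray YAML template below now carries):

from sky.provision.kubernetes.utils import (get_context_from_config,
                                             get_namespace_from_config)

provider_config = {'namespace': 'skypilot', 'context': 'my-kube-context'}  # hypothetical
namespace = get_namespace_from_config(provider_config)  # 'skypilot'
context = get_context_from_config(provider_config)      # 'my-kube-context'
# With an empty config, both fall back to the current kubeconfig namespace/context:
namespace, context = get_namespace_from_config({}), get_context_from_config({})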
sky/skylet/log_lib.py CHANGED
@@ -208,9 +208,12 @@ def run_with_log(
  str(proc.pid),
  ]
 
+ # We do not need to set `start_new_session=True` here, as the
+ # daemon script will detach itself from the parent process with
+ # fork to avoid being killed by ray job. See the reason we
+ # daemonize the process in `sky/skylet/subprocess_daemon.py`.
  subprocess.Popen(
  daemon_cmd,
- start_new_session=True,
  # Suppress output
  stdout=subprocess.DEVNULL,
  stderr=subprocess.DEVNULL,
sky/skylet/subprocess_daemon.py CHANGED
@@ -1,17 +1,44 @@
  """Sky subprocess daemon.
-
  Wait for parent_pid to exit, then SIGTERM (or SIGKILL if needed) the child
  processes of proc_pid.
  """
-
  import argparse
+ import os
  import sys
  import time
 
  import psutil
 
- if __name__ == '__main__':
 
+ def daemonize():
+ """Detaches the process from its parent process with double-forking.
+
+ This detachment is crucial in the context of SkyPilot and Ray job. When
+ 'sky cancel' is executed, it uses Ray's stop job API to terminate the job.
+ Without daemonization, this subprocess_daemon process would be terminated
+ along with its parent process, ray::task, which is launched with Ray job.
+ Daemonization ensures this process survives the 'sky cancel' command,
+ allowing it to prevent orphaned processes of Ray job.
+ """
+ # First fork: Creates a child process identical to the parent
+ if os.fork() > 0:
+ # Parent process exits, allowing the child to run independently
+ sys.exit()
+
+ # Continues to run from first forked child process.
+ # Detach from parent environment.
+ os.setsid()
+
+ # Second fork: Creates a grandchild process
+ if os.fork() > 0:
+ # First child exits, orphaning the grandchild
+ sys.exit()
+ # Continues execution in the grandchild process
+ # This process is now fully detached from the original parent and terminal
+
+
+ if __name__ == '__main__':
+ daemonize()
  parser = argparse.ArgumentParser()
  parser.add_argument('--parent-pid', type=int, required=True)
  parser.add_argument('--proc-pid', type=int, required=True)
@@ -28,29 +55,34 @@ if __name__ == '__main__':
  if process is None:
  sys.exit()
 
+ children = []
  if parent_process is not None:
  # Wait for either parent or target process to exit.
  while process.is_running() and parent_process.is_running():
+ try:
+ # process.children() must be called while the target process
+ # is alive, as it will return an empty list if the target
+ # process has already terminated.
+ tmp_children = process.children(recursive=True)
+ if tmp_children:
+ children = tmp_children
+ except psutil.NoSuchProcess:
+ pass
  time.sleep(1)
+ children.append(process)
 
- try:
- children = process.children(recursive=True)
- children.append(process)
- except psutil.NoSuchProcess:
- sys.exit()
-
- for pid in children:
+ for child in children:
  try:
- pid.terminate()
+ child.terminate()
  except psutil.NoSuchProcess:
- pass
+ continue
 
  # Wait 30s for the processes to exit gracefully.
  time.sleep(30)
 
  # SIGKILL if they're still running.
- for pid in children:
+ for child in children:
  try:
- pid.kill()
+ child.kill()
  except psutil.NoSuchProcess:
- pass
+ continue
sky/templates/kubernetes-port-forward-proxy-command.sh CHANGED
@@ -1,15 +1,40 @@
  #!/usr/bin/env bash
  set -uo pipefail
 
+ KUBE_CONTEXT=""
+ KUBE_NAMESPACE=""
+
+ # Parse flags
+ while getopts ":c:n:" opt; do
+ case ${opt} in
+ c)
+ KUBE_CONTEXT="$OPTARG"
+ ;;
+ n)
+ KUBE_NAMESPACE="$OPTARG"
+ ;;
+ \?)
+ echo "Invalid option: -$OPTARG" >&2
+ echo "Usage: $0 <pod_name> [-c kube_context] [-n kube_namespace]" >&2
+ exit 1
+ ;;
+ :)
+ echo "Option -$OPTARG requires an argument." >&2
+ exit 1
+ ;;
+ esac
+ done
+
+ # Shift the processed options away so that $1 becomes the pod name
+ shift $((OPTIND -1))
+
  # Check if pod name is passed as an argument
  if [ $# -lt 1 ]; then
- echo "Usage: $0 <pod_name> [kube_context] [kube_namespace]" >&2
+ echo "Usage: $0 <pod_name> [-c kube_context] [-n kube_namespace]" >&2
  exit 1
  fi
 
- POD_NAME="$1" # The first argument is the name of the pod
- KUBE_CONTEXT="${2:-}" # The second argument is the kube context, default is empty
- KUBE_NAMESPACE="${3:-}" # The third argument is the kube namespace, default is empty
+ POD_NAME="$1" # The first positional argument is the name of the pod
 
  # Checks if socat is installed
  if ! command -v socat > /dev/null; then
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -21,6 +21,11 @@ provider:
  # The namespace to create the Ray cluster in.
  namespace: {{k8s_namespace}}
 
+ # The kubecontext used to connect to the Kubernetes cluster.
+ {% if k8s_context is not none %}
+ context: {{k8s_context}}
+ {% endif %}
+
  # This should be one of KubernetesPortMode
  port_mode: {{k8s_port_mode}}
 
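A quick sketch of how this optional block renders, using jinja2 directly; the variable names come from the template above, the values are made up:

import jinja2

template = jinja2.Template(
    'provider:\n'
    '  namespace: {{k8s_namespace}}\n'
    '{% if k8s_context is not none %}'
    '  context: {{k8s_context}}\n'
    '{% endif %}'
    '  port_mode: {{k8s_port_mode}}\n')

# With a context set, a `context:` field appears under `provider:`.
print(template.render(k8s_namespace='default', k8s_context='my-kube-context',
                      k8s_port_mode='portforward'))
# With k8s_context=None, the field is omitted entirely.
print(template.render(k8s_namespace='default', k8s_context=None,
                      k8s_port_mode='portforward'))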
sky/templates/lambda-ray.yml.j2 CHANGED
@@ -89,13 +89,13 @@ setup_commands:
  # Increment the following for catching performance bugs easier:
  # current num items (num SSH connections): 2
  head_start_ray_commands:
- - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
+ - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --min-worker-port 11002 --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
  which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
  {{dump_port_command}}; {{ray_head_wait_initialized_command}}
 
  {%- if num_nodes > 1 %}
  worker_start_ray_commands:
- - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
+ - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --min-worker-port 11002 --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
  which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
  {%- else %}
  worker_start_ray_commands: []
sky/utils/command_runner.py CHANGED
@@ -649,13 +649,13 @@ class KubernetesCommandRunner(CommandRunner):
 
  def __init__(
  self,
- node: Tuple[str, str],
+ node: Tuple[Tuple[str, str], str],
  **kwargs,
  ):
  """Initialize KubernetesCommandRunner.
 
  Example Usage:
- runner = KubernetesCommandRunner((namespace, pod_name))
+ runner = KubernetesCommandRunner((namespace, context), pod_name))
  runner.run('ls -l')
  runner.rsync(source, target, up=True)
 
@@ -664,7 +664,11 @@ class KubernetesCommandRunner(CommandRunner):
  """
  del kwargs
  super().__init__(node)
- self.namespace, self.pod_name = node
+ (self.namespace, self.context), self.pod_name = node
+
+ @property
+ def node_id(self) -> str:
+ return f'{self.context}-{self.namespace}-{self.pod_name}'
 
  @timeline.event
  def run(
@@ -719,9 +723,11 @@ class KubernetesCommandRunner(CommandRunner):
  if connect_timeout is None:
  connect_timeout = _DEFAULT_CONNECT_TIMEOUT
  kubectl_args = [
- '--pod-running-timeout', f'{connect_timeout}s', '-n',
- self.namespace, self.pod_name
+ '--pod-running-timeout', f'{connect_timeout}s', '-n', self.namespace
  ]
+ if self.context:
+ kubectl_args += ['--context', self.context]
+ kubectl_args += [self.pod_name]
  if ssh_mode == SshMode.LOGIN:
  assert isinstance(cmd, list), 'cmd must be a list for login mode.'
  base_cmd = ['kubectl', 'exec', '-it', *kubectl_args, '--']
@@ -821,7 +827,7 @@ class KubernetesCommandRunner(CommandRunner):
  self._rsync(
  source,
  target,
- node_destination=f'{self.pod_name}@{self.namespace}',
+ node_destination=f'{self.pod_name}@{self.namespace}+{self.context}',
  up=up,
  rsh_option=helper_path,
  log_path=log_path,
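A minimal sketch of the new calling convention; the namespace, context and pod names here are hypothetical:

from sky.utils.command_runner import KubernetesCommandRunner

# node is now ((namespace, context), pod_name) instead of (namespace, pod_name).
runner = KubernetesCommandRunner((('default', 'my-kube-context'), 'my-cluster-head'))
runner.run('ls -l')
# Commands are roughly of the form:
#   kubectl exec -it --pod-running-timeout <timeout>s -n default --context my-kube-context my-cluster-head -- ...
# and rsync targets 'my-cluster-head@default+my-kube-context', which the updated
# rsync_helper.sh below splits back into namespace and context.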
sky/utils/command_runner.pyi CHANGED
@@ -204,7 +204,7 @@ class KubernetesCommandRunner(CommandRunner):
 
  def __init__(
  self,
- node: Tuple[str, str],
+ node: Tuple[Tuple[str, str], str],
  ) -> None:
  ...
 
sky/utils/kubernetes/rsync_helper.sh CHANGED
@@ -1,7 +1,16 @@
- # When using pod@namespace, rsync passes args as: {us} -l pod namespace
+ # When using pod@namespace+context, rsync passes args as: {us} -l pod namespace+context
+ # We need to split the pod@namespace+context into pod, namespace and context
+ # For backward compatibility, we use + as the separator between namespace and context and add handling when context is not provided
  shift
  pod=$1
  shift
- namespace=$1
+ namespace_context=$1
+ namespace=$(echo $namespace_context | cut -d+ -f1)
+ context=$(echo $namespace_context | grep '+' >/dev/null && echo $namespace_context | cut -d+ -f2- || echo "")
+ context_lower=$(echo "$context" | tr '[:upper:]' '[:lower:]')
  shift
- kubectl exec -i $pod -n $namespace -- "$@"
+ if [ -z "$context" ] || [ "$context_lower" = "none" ]; then
+ kubectl exec -i $pod -n $namespace -- "$@"
+ else
+ kubectl exec -i $pod -n $namespace --context=$context -- "$@"
+ fi
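The shell parsing is fairly dense; a small Python sketch of the same backward-compatible split (the input strings are hypothetical):

def split_namespace_context(namespace_context: str):
    # 'ns+ctx' -> ('ns', 'ctx'); plain 'ns' or 'ns+none' -> ('ns', None),
    # mirroring the behavior of the updated rsync_helper.sh above.
    namespace, sep, context = namespace_context.partition('+')
    if not sep or context.lower() == 'none':
        return namespace, None
    return namespace, context

print(split_namespace_context('default+my-kube-context'))  # ('default', 'my-kube-context')
print(split_namespace_context('default'))                  # ('default', None)  (older clients)
print(split_namespace_context('default+none'))             # ('default', None)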
{skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: skypilot-nightly
- Version: 1.0.0.dev20240909
+ Version: 1.0.0.dev20240911
  Summary: SkyPilot: An intercloud broker for the clouds
  Author: SkyPilot Team
  License: Apache 2.0