skypilot-nightly 1.0.0.dev20250228__py3-none-any.whl → 1.0.0.dev20250302__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, exactly as they were released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '0f178b2af47ec2b185bc685cce6965c675613cc7'
8
+ _SKYPILOT_COMMIT_SHA = 'cefc23846d47d7dacf960093c2001912a83c1162'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20250228'
38
+ __version__ = '1.0.0.dev20250302'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
sky/backends/backend_utils.py CHANGED
@@ -1280,12 +1280,17 @@ def parallel_data_transfer_to_nodes(
1280
1280
  stream_logs=stream_logs,
1281
1281
  require_outputs=True,
1282
1282
  source_bashrc=source_bashrc)
1283
- err_msg = ('Failed to run command before rsync '
1283
+ err_msg = (f'{colorama.Style.RESET_ALL}{colorama.Style.DIM}'
1284
+ f'----- CMD -----\n'
1285
+ f'{cmd}\n'
1286
+ f'----- CMD END -----\n'
1287
+ f'{colorama.Style.RESET_ALL}'
1288
+ f'{colorama.Fore.RED}'
1289
+ f'Failed to run command before rsync '
1284
1290
  f'{origin_source} -> {target}. '
1285
- 'Ensure that the network is stable, then retry. '
1286
- f'{cmd}')
1291
+ f'{colorama.Style.RESET_ALL}')
1287
1292
  if log_path != os.devnull:
1288
- err_msg += f' See logs in {log_path}'
1293
+ err_msg += ux_utils.log_path_hint(log_path)
1289
1294
  subprocess_utils.handle_returncode(rc,
1290
1295
  cmd,
1291
1296
  err_msg,
sky/cli.py CHANGED
@@ -5419,11 +5419,16 @@ def local():
5419
5419
  @click.option('--cleanup',
5420
5420
  is_flag=True,
5421
5421
  help='Clean up the remote cluster instead of deploying it.')
5422
+ @click.option(
5423
+ '--context-name',
5424
+ type=str,
5425
+ required=False,
5426
+ help='Name to use for the kubeconfig context. Defaults to "default".')
5422
5427
  @local.command('up', cls=_DocumentedCodeCommand)
5423
5428
  @_add_click_options(_COMMON_OPTIONS)
5424
5429
  @usage_lib.entrypoint
5425
5430
  def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
5426
- cleanup: bool, async_call: bool):
5431
+ cleanup: bool, context_name: Optional[str], async_call: bool):
5427
5432
  """Creates a local or remote cluster."""
5428
5433
 
5429
5434
  def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
@@ -5468,7 +5473,8 @@ def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
5468
5473
  raise click.BadParameter(
5469
5474
  f'Failed to read SSH key file {ssh_key_path}: {str(e)}')
5470
5475
 
5471
- request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup)
5476
+ request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup,
5477
+ context_name)
5472
5478
  _async_call_or_wait(request_id, async_call, request_name='local up')
5473
5479
 
5474
5480
 
@@ -5504,10 +5510,19 @@ def api():
5504
5510
  required=False,
5505
5511
  help=('The host to deploy the SkyPilot API server. To allow '
5506
5512
  'remote access, set this to 0.0.0.0'))
5513
+ @click.option('--foreground',
5514
+ is_flag=True,
5515
+ default=False,
5516
+ required=False,
5517
+ help='Run the SkyPilot API server in the foreground and output '
5518
+ 'its logs to stdout/stderr. Allowing external systems '
5519
+ 'to manage the process lifecycle and collect logs directly. '
5520
+ 'This is useful when the API server is managed by systems '
5521
+ 'like systemd and Kubernetes.')
5507
5522
  @usage_lib.entrypoint
5508
- def api_start(deploy: bool, host: Optional[str]):
5523
+ def api_start(deploy: bool, host: Optional[str], foreground: bool):
5509
5524
  """Starts the SkyPilot API server locally."""
5510
- sdk.api_start(deploy=deploy, host=host)
5525
+ sdk.api_start(deploy=deploy, host=host, foreground=foreground)
5511
5526
 
5512
5527
 
5513
5528
  @api.command('stop', cls=_DocumentedCodeCommand)
sky/client/cli.py CHANGED
@@ -5419,11 +5419,16 @@ def local():
5419
5419
  @click.option('--cleanup',
5420
5420
  is_flag=True,
5421
5421
  help='Clean up the remote cluster instead of deploying it.')
5422
+ @click.option(
5423
+ '--context-name',
5424
+ type=str,
5425
+ required=False,
5426
+ help='Name to use for the kubeconfig context. Defaults to "default".')
5422
5427
  @local.command('up', cls=_DocumentedCodeCommand)
5423
5428
  @_add_click_options(_COMMON_OPTIONS)
5424
5429
  @usage_lib.entrypoint
5425
5430
  def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
5426
- cleanup: bool, async_call: bool):
5431
+ cleanup: bool, context_name: Optional[str], async_call: bool):
5427
5432
  """Creates a local or remote cluster."""
5428
5433
 
5429
5434
  def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
@@ -5468,7 +5473,8 @@ def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
5468
5473
  raise click.BadParameter(
5469
5474
  f'Failed to read SSH key file {ssh_key_path}: {str(e)}')
5470
5475
 
5471
- request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup)
5476
+ request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup,
5477
+ context_name)
5472
5478
  _async_call_or_wait(request_id, async_call, request_name='local up')
5473
5479
 
5474
5480
 
@@ -5504,10 +5510,19 @@ def api():
5504
5510
  required=False,
5505
5511
  help=('The host to deploy the SkyPilot API server. To allow '
5506
5512
  'remote access, set this to 0.0.0.0'))
5513
+ @click.option('--foreground',
5514
+ is_flag=True,
5515
+ default=False,
5516
+ required=False,
5517
+ help='Run the SkyPilot API server in the foreground and output '
5518
+ 'its logs to stdout/stderr. Allowing external systems '
5519
+ 'to manage the process lifecycle and collect logs directly. '
5520
+ 'This is useful when the API server is managed by systems '
5521
+ 'like systemd and Kubernetes.')
5507
5522
  @usage_lib.entrypoint
5508
- def api_start(deploy: bool, host: Optional[str]):
5523
+ def api_start(deploy: bool, host: Optional[str], foreground: bool):
5509
5524
  """Starts the SkyPilot API server locally."""
5510
- sdk.api_start(deploy=deploy, host=host)
5525
+ sdk.api_start(deploy=deploy, host=host, foreground=foreground)
5511
5526
 
5512
5527
 
5513
5528
  @api.command('stop', cls=_DocumentedCodeCommand)
sky/client/sdk.py CHANGED
@@ -1263,8 +1263,12 @@ def storage_delete(name: str) -> server_common.RequestId:
1263
1263
  @usage_lib.entrypoint
1264
1264
  @server_common.check_server_healthy_or_start
1265
1265
  @annotations.client_api
1266
- def local_up(gpus: bool, ips: Optional[List[str]], ssh_user: Optional[str],
1267
- ssh_key: Optional[str], cleanup: bool) -> server_common.RequestId:
1266
+ def local_up(gpus: bool,
1267
+ ips: Optional[List[str]],
1268
+ ssh_user: Optional[str],
1269
+ ssh_key: Optional[str],
1270
+ cleanup: bool,
1271
+ context_name: Optional[str] = None) -> server_common.RequestId:
1268
1272
  """Launches a Kubernetes cluster on local machines.
1269
1273
 
1270
1274
  Returns:
@@ -1282,7 +1286,8 @@ def local_up(gpus: bool, ips: Optional[List[str]], ssh_user: Optional[str],
1282
1286
  ips=ips,
1283
1287
  ssh_user=ssh_user,
1284
1288
  ssh_key=ssh_key,
1285
- cleanup=cleanup)
1289
+ cleanup=cleanup,
1290
+ context_name=context_name)
1286
1291
  response = requests.post(f'{server_common.get_server_url()}/local_up',
1287
1292
  json=json.loads(body.model_dump_json()))
1288
1293
  return server_common.get_request_id(response)
@@ -1611,6 +1616,7 @@ def api_start(
1611
1616
  *,
1612
1617
  deploy: bool = False,
1613
1618
  host: str = '127.0.0.1',
1619
+ foreground: bool = False,
1614
1620
  ) -> None:
1615
1621
  """Starts the API server.
1616
1622
 
@@ -1622,7 +1628,8 @@ def api_start(
1622
1628
  resources of the machine.
1623
1629
  host: The host to deploy the API server. It will be set to 0.0.0.0
1624
1630
  if deploy is True, to allow remote access.
1625
-
1631
+ foreground: Whether to run the API server in the foreground (run in
1632
+ the current process).
1626
1633
  Returns:
1627
1634
  None
1628
1635
  """
@@ -1641,7 +1648,10 @@ def api_start(
1641
1648
  'from the config file and/or unset the '
1642
1649
  'SKYPILOT_API_SERVER_ENDPOINT environment '
1643
1650
  'variable.')
1644
- server_common.check_server_healthy_or_start_fn(deploy, host)
1651
+ server_common.check_server_healthy_or_start_fn(deploy, host, foreground)
1652
+ if foreground:
1653
+ # Explain why current process exited
1654
+ logger.info('API server is already running:')
1645
1655
  logger.info(f'{ux_utils.INDENT_SYMBOL}SkyPilot API server: '
1646
1656
  f'{server_common.get_server_url(host)}\n'
1647
1657
  f'{ux_utils.INDENT_LAST_SYMBOL}'
sky/cloud_stores.py CHANGED
@@ -54,8 +54,9 @@ class S3CloudStorage(CloudStorage):
54
54
 
55
55
  # List of commands to install AWS CLI
56
56
  _GET_AWSCLI = [
57
- 'aws --version >/dev/null 2>&1 || '
58
- f'{constants.SKY_UV_PIP_CMD} install awscli',
57
+ 'awscli_path=$(which aws) || '
58
+ f'{{ {constants.SKY_UV_PIP_CMD} install awscli && '
59
+ f'awscli_path={constants.SKY_REMOTE_PYTHON_ENV}/bin/aws; }}',
59
60
  ]
60
61
 
61
62
  def is_directory(self, url: str) -> bool:
@@ -85,8 +86,7 @@ class S3CloudStorage(CloudStorage):
85
86
  # AWS Sync by default uses 10 threads to upload files to the bucket.
86
87
  # To increase parallelism, modify max_concurrent_requests in your
87
88
  # aws config file (Default path: ~/.aws/config).
88
- download_via_awscli = (f'{constants.SKY_REMOTE_PYTHON_ENV}/bin/aws s3 '
89
- 'sync --no-follow-symlinks '
89
+ download_via_awscli = (f'$awscli_path s3 sync --no-follow-symlinks '
90
90
  f'{source} {destination}')
91
91
 
92
92
  all_commands = list(self._GET_AWSCLI)
@@ -95,8 +95,7 @@ class S3CloudStorage(CloudStorage):
95
95
 
96
96
  def make_sync_file_command(self, source: str, destination: str) -> str:
97
97
  """Downloads a file using AWS CLI."""
98
- download_via_awscli = (f'{constants.SKY_REMOTE_PYTHON_ENV}/bin/aws s3 '
99
- f'cp {source} {destination}')
98
+ download_via_awscli = (f'$awscli_path s3 cp {source} {destination}')
100
99
 
101
100
  all_commands = list(self._GET_AWSCLI)
102
101
  all_commands.append(download_via_awscli)
sky/clouds/gcp.py CHANGED
@@ -67,8 +67,20 @@ _GCLOUD_VERSION = '424.0.0'
67
67
  GOOGLE_SDK_INSTALLATION_COMMAND: str = f'pushd /tmp &>/dev/null && \
68
68
  {{ gcloud --help > /dev/null 2>&1 || \
69
69
  {{ mkdir -p {os.path.dirname(_GCLOUD_INSTALLATION_LOG)} && \
70
- wget --quiet https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-{_GCLOUD_VERSION}-linux-x86_64.tar.gz > {_GCLOUD_INSTALLATION_LOG} && \
71
- tar xzf google-cloud-sdk-{_GCLOUD_VERSION}-linux-x86_64.tar.gz >> {_GCLOUD_INSTALLATION_LOG} && \
70
+ ARCH=$(uname -m) && \
71
+ if [ "$ARCH" = "x86_64" ]; then \
72
+ echo "Installing Google Cloud SDK for $ARCH" > {_GCLOUD_INSTALLATION_LOG} && \
73
+ ARCH_SUFFIX="x86_64"; \
74
+ elif [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then \
75
+ echo "Installing Google Cloud SDK for $ARCH" > {_GCLOUD_INSTALLATION_LOG} && \
76
+ ARCH_SUFFIX="arm"; \
77
+ else \
78
+ echo "Architecture $ARCH not supported by Google Cloud SDK. Defaulting to x86_64." > {_GCLOUD_INSTALLATION_LOG} && \
79
+ ARCH_SUFFIX="x86_64"; \
80
+ fi && \
81
+ echo "Detected architecture: $ARCH, using package: $ARCH_SUFFIX" >> {_GCLOUD_INSTALLATION_LOG} && \
82
+ wget --quiet https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-{_GCLOUD_VERSION}-linux-${{ARCH_SUFFIX}}.tar.gz >> {_GCLOUD_INSTALLATION_LOG} && \
83
+ tar xzf google-cloud-sdk-{_GCLOUD_VERSION}-linux-${{ARCH_SUFFIX}}.tar.gz >> {_GCLOUD_INSTALLATION_LOG} && \
72
84
  rm -rf ~/google-cloud-sdk >> {_GCLOUD_INSTALLATION_LOG} && \
73
85
  mv google-cloud-sdk ~/ && \
74
86
  ~/google-cloud-sdk/install.sh -q >> {_GCLOUD_INSTALLATION_LOG} 2>&1 && \
sky/core.py CHANGED
@@ -1007,8 +1007,12 @@ def realtime_kubernetes_gpu_availability(
1007
1007
  # = Local Cluster =
1008
1008
  # =================
1009
1009
  @usage_lib.entrypoint
1010
- def local_up(gpus: bool, ips: Optional[List[str]], ssh_user: Optional[str],
1011
- ssh_key: Optional[str], cleanup: bool) -> None:
1010
+ def local_up(gpus: bool,
1011
+ ips: Optional[List[str]],
1012
+ ssh_user: Optional[str],
1013
+ ssh_key: Optional[str],
1014
+ cleanup: bool,
1015
+ context_name: Optional[str] = None) -> None:
1012
1016
  """Creates a local or remote cluster."""
1013
1017
 
1014
1018
  def _validate_args(ips, ssh_user, ssh_key, cleanup):
@@ -1034,7 +1038,7 @@ def local_up(gpus: bool, ips: Optional[List[str]], ssh_user: Optional[str],
1034
1038
  if ips:
1035
1039
  assert ssh_user is not None and ssh_key is not None
1036
1040
  kubernetes_deploy_utils.deploy_remote_cluster(ips, ssh_user, ssh_key,
1037
- cleanup)
1041
+ cleanup, context_name)
1038
1042
  else:
1039
1043
  # Run local deployment (kind) if no remote args are specified
1040
1044
  kubernetes_deploy_utils.deploy_local_cluster(gpus)
sky/data/mounting_utils.py CHANGED
@@ -24,7 +24,14 @@ RCLONE_VERSION = 'v1.68.2'
24
24
 
25
25
  def get_s3_mount_install_cmd() -> str:
26
26
  """Returns a command to install S3 mount utility goofys."""
27
- install_cmd = ('sudo wget -nc https://github.com/romilbhardwaj/goofys/'
27
+ install_cmd = ('ARCH=$(uname -m) && '
28
+ 'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
29
+ ' echo "goofys is not supported on $ARCH" && '
30
+ f' exit {exceptions.ARCH_NOT_SUPPORTED_EXIT_CODE}; '
31
+ 'else '
32
+ ' ARCH_SUFFIX="amd64"; '
33
+ 'fi && '
34
+ 'sudo wget -nc https://github.com/romilbhardwaj/goofys/'
28
35
  'releases/download/0.24.0-romilb-upstream/goofys '
29
36
  '-O /usr/local/bin/goofys && '
30
37
  'sudo chmod 755 /usr/local/bin/goofys')
@@ -49,9 +56,15 @@ def get_s3_mount_cmd(bucket_name: str,
49
56
 
50
57
  def get_gcs_mount_install_cmd() -> str:
51
58
  """Returns a command to install GCS mount utility gcsfuse."""
52
- install_cmd = ('wget -nc https://github.com/GoogleCloudPlatform/gcsfuse'
59
+ install_cmd = ('ARCH=$(uname -m) && '
60
+ 'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
61
+ ' ARCH_SUFFIX="arm64"; '
62
+ 'else '
63
+ ' ARCH_SUFFIX="amd64"; '
64
+ 'fi && '
65
+ 'wget -nc https://github.com/GoogleCloudPlatform/gcsfuse'
53
66
  f'/releases/download/v{GCSFUSE_VERSION}/'
54
- f'gcsfuse_{GCSFUSE_VERSION}_amd64.deb '
67
+ f'gcsfuse_{GCSFUSE_VERSION}_${{ARCH_SUFFIX}}.deb '
55
68
  '-O /tmp/gcsfuse.deb && '
56
69
  'sudo dpkg --install /tmp/gcsfuse.deb')
57
70
  return install_cmd
@@ -77,16 +90,24 @@ def get_gcs_mount_cmd(bucket_name: str,
77
90
 
78
91
  def get_az_mount_install_cmd() -> str:
79
92
  """Returns a command to install AZ Container mount utility blobfuse2."""
80
- install_cmd = ('sudo apt-get update; '
81
- 'sudo apt-get install -y '
82
- '-o Dpkg::Options::="--force-confdef" '
83
- 'fuse3 libfuse3-dev && '
84
- 'wget -nc https://github.com/Azure/azure-storage-fuse'
85
- f'/releases/download/blobfuse2-{BLOBFUSE2_VERSION}'
86
- f'/blobfuse2-{BLOBFUSE2_VERSION}-Debian-11.0.x86_64.deb '
87
- '-O /tmp/blobfuse2.deb && '
88
- 'sudo dpkg --install /tmp/blobfuse2.deb && '
89
- f'mkdir -p {_BLOBFUSE_CACHE_ROOT_DIR};')
93
+ install_cmd = (
94
+ 'sudo apt-get update; '
95
+ 'sudo apt-get install -y '
96
+ '-o Dpkg::Options::="--force-confdef" '
97
+ 'fuse3 libfuse3-dev && '
98
+ 'ARCH=$(uname -m) && '
99
+ 'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
100
+ ' echo "blobfuse2 is not supported on $ARCH" && '
101
+ f' exit {exceptions.ARCH_NOT_SUPPORTED_EXIT_CODE}; '
102
+ 'else '
103
+ ' ARCH_SUFFIX="x86_64"; '
104
+ 'fi && '
105
+ 'wget -nc https://github.com/Azure/azure-storage-fuse'
106
+ f'/releases/download/blobfuse2-{BLOBFUSE2_VERSION}'
107
+ f'/blobfuse2-{BLOBFUSE2_VERSION}-Debian-11.0.${{ARCH_SUFFIX}}.deb '
108
+ '-O /tmp/blobfuse2.deb && '
109
+ 'sudo dpkg --install /tmp/blobfuse2.deb && '
110
+ f'mkdir -p {_BLOBFUSE_CACHE_ROOT_DIR};')
90
111
 
91
112
  return install_cmd
92
113
 
@@ -207,14 +228,20 @@ def get_rclone_install_cmd() -> str:
207
228
  """
208
229
  # pylint: disable=line-too-long
209
230
  install_cmd = (
231
+ 'ARCH=$(uname -m) && '
232
+ 'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
233
+ ' ARCH_SUFFIX="arm"; '
234
+ 'else '
235
+ ' ARCH_SUFFIX="amd64"; '
236
+ 'fi && '
210
237
  f'(which dpkg > /dev/null 2>&1 && (which rclone > /dev/null || (cd ~ > /dev/null'
211
- f' && curl -O https://downloads.rclone.org/{RCLONE_VERSION}/rclone-{RCLONE_VERSION}-linux-amd64.deb'
212
- f' && sudo dpkg -i rclone-{RCLONE_VERSION}-linux-amd64.deb'
213
- f' && rm -f rclone-{RCLONE_VERSION}-linux-amd64.deb)))'
238
+ f' && curl -O https://downloads.rclone.org/{RCLONE_VERSION}/rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb'
239
+ f' && sudo dpkg -i rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb'
240
+ f' && rm -f rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb)))'
214
241
  f' || (which rclone > /dev/null || (cd ~ > /dev/null'
215
- f' && curl -O https://downloads.rclone.org/{RCLONE_VERSION}/rclone-{RCLONE_VERSION}-linux-amd64.rpm'
216
- f' && sudo yum --nogpgcheck install rclone-{RCLONE_VERSION}-linux-amd64.rpm -y'
217
- f' && rm -f rclone-{RCLONE_VERSION}-linux-amd64.rpm))')
242
+ f' && curl -O https://downloads.rclone.org/{RCLONE_VERSION}/rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm'
243
+ f' && sudo yum --nogpgcheck install rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm -y'
244
+ f' && rm -f rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm))')
218
245
  return install_cmd
219
246
 
220
247
 
sky/exceptions.py CHANGED
@@ -22,6 +22,8 @@ MOUNT_PATH_NON_EMPTY_CODE = 42
22
22
  INSUFFICIENT_PRIVILEGES_CODE = 52
23
23
  # Return code when git command is ran in a dir that is not git repo
24
24
  GIT_FATAL_EXIT_CODE = 128
25
+ # Architecture, such as arm64, not supported by the dependency
26
+ ARCH_NOT_SUPPORTED_EXIT_CODE = 133
25
27
 
26
28
 
27
29
  def is_safe_exception(exc: Exception) -> bool:
sky/server/common.py CHANGED
@@ -145,57 +145,6 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
145
145
  return ApiServerInfo(status=ApiServerStatus.UNHEALTHY, api_version=None)
146
146
 
147
147
 
148
- def start_api_server_in_background(deploy: bool = False,
149
- host: str = '127.0.0.1'):
150
- if not is_api_server_local():
151
- raise RuntimeError(
152
- f'Cannot start API server: {get_server_url()} is not a local URL')
153
-
154
- # Check available memory before starting the server.
155
- avail_mem_size_gb: float = common_utils.get_mem_size_gb()
156
- if avail_mem_size_gb <= server_constants.MIN_AVAIL_MEM_GB:
157
- logger.warning(
158
- f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only has '
159
- f'{avail_mem_size_gb:.1f}GB memory available. '
160
- f'At least {server_constants.MIN_AVAIL_MEM_GB}GB is recommended to '
161
- f'support higher load with better performance.'
162
- f'{colorama.Style.RESET_ALL}')
163
- log_path = os.path.expanduser(constants.API_SERVER_LOGS)
164
- os.makedirs(os.path.dirname(log_path), exist_ok=True)
165
-
166
- api_server_cmd = API_SERVER_CMD
167
- if deploy:
168
- api_server_cmd += ' --deploy'
169
- if host is not None:
170
- api_server_cmd += f' --host {host}'
171
- cmd = f'{sys.executable} {api_server_cmd} > {log_path} 2>&1 < /dev/null'
172
-
173
- # Start the API server process in the background and don't wait for it.
174
- # If this is called from a CLI invocation, we need start_new_session=True so
175
- # that SIGINT on the CLI will not also kill the API server.
176
- subprocess.Popen(cmd, shell=True, start_new_session=True)
177
-
178
- # Wait for the server to start until timeout.
179
- # Conservative upper time bound for starting the server based on profiling.
180
- timeout_sec = 12
181
- start_time = time.time()
182
- while True:
183
- api_server_info = get_api_server_status()
184
- assert api_server_info.status != ApiServerStatus.VERSION_MISMATCH, (
185
- f'API server version mismatch when starting the server. '
186
- f'Server version: {api_server_info.api_version} '
187
- f'Client version: {server_constants.API_VERSION}')
188
- if api_server_info.status == ApiServerStatus.HEALTHY:
189
- break
190
- elif time.time() - start_time >= timeout_sec:
191
- with ux_utils.print_exception_no_traceback():
192
- raise RuntimeError(
193
- 'Failed to start SkyPilot API server at '
194
- f'{get_server_url(host)}'
195
- f'\nView logs at: {constants.API_SERVER_LOGS}')
196
- time.sleep(0.5)
197
-
198
-
199
148
  def handle_request_error(response: requests.Response) -> None:
200
149
  if response.status_code != 200:
201
150
  with ux_utils.print_exception_no_traceback():
@@ -218,10 +167,10 @@ def get_request_id(response: requests.Response) -> RequestId:
218
167
  return request_id
219
168
 
220
169
 
221
- def _start_api_server(deploy: bool = False, host: str = '127.0.0.1'):
170
+ def _start_api_server(deploy: bool = False,
171
+ host: str = '127.0.0.1',
172
+ foreground: bool = False):
222
173
  """Starts a SkyPilot API server locally."""
223
- # Lock to prevent multiple processes from starting the server at the
224
- # same time, causing issues with database initialization.
225
174
  server_url = get_server_url(host)
226
175
  assert server_url in AVAILABLE_LOCAL_API_SERVER_URLS, (
227
176
  f'server url {server_url} is not a local url')
@@ -230,7 +179,60 @@ def _start_api_server(deploy: bool = False, host: str = '127.0.0.1'):
230
179
  f'SkyPilot API server at {server_url}. '
231
180
  'Starting a local server.'
232
181
  f'{colorama.Style.RESET_ALL}')
233
- start_api_server_in_background(deploy=deploy, host=host)
182
+ if not is_api_server_local():
183
+ raise RuntimeError(f'Cannot start API server: {get_server_url()} '
184
+ 'is not a local URL')
185
+
186
+ # Check available memory before starting the server.
187
+ avail_mem_size_gb: float = common_utils.get_mem_size_gb()
188
+ if avail_mem_size_gb <= server_constants.MIN_AVAIL_MEM_GB:
189
+ logger.warning(
190
+ f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '
191
+ f'has {avail_mem_size_gb:.1f}GB memory available. '
192
+ f'At least {server_constants.MIN_AVAIL_MEM_GB}GB is '
193
+ 'recommended to support higher load with better performance.'
194
+ f'{colorama.Style.RESET_ALL}')
195
+
196
+ args = [sys.executable, *API_SERVER_CMD.split()]
197
+ if deploy:
198
+ args += ['--deploy']
199
+ if host is not None:
200
+ args += [f'--host={host}']
201
+
202
+ if foreground:
203
+ # Replaces the current process with the API server
204
+ os.execvp(args[0], args)
205
+
206
+ log_path = os.path.expanduser(constants.API_SERVER_LOGS)
207
+ os.makedirs(os.path.dirname(log_path), exist_ok=True)
208
+ cmd = f'{" ".join(args)} > {log_path} 2>&1 < /dev/null'
209
+
210
+ # Start the API server process in the background and don't wait for it.
211
+ # If this is called from a CLI invocation, we need
212
+ # start_new_session=True so that SIGINT on the CLI will not also kill
213
+ # the API server.
214
+ subprocess.Popen(cmd, shell=True, start_new_session=True)
215
+
216
+ # Wait for the server to start until timeout.
217
+ # Conservative upper time bound for starting the server based on
218
+ # profiling.
219
+ timeout_sec = 12
220
+ start_time = time.time()
221
+ while True:
222
+ api_server_info = get_api_server_status()
223
+ assert api_server_info.status != ApiServerStatus.VERSION_MISMATCH, (
224
+ f'API server version mismatch when starting the server. '
225
+ f'Server version: {api_server_info.api_version} '
226
+ f'Client version: {server_constants.API_VERSION}')
227
+ if api_server_info.status == ApiServerStatus.HEALTHY:
228
+ break
229
+ elif time.time() - start_time >= timeout_sec:
230
+ with ux_utils.print_exception_no_traceback():
231
+ raise RuntimeError(
232
+ 'Failed to start SkyPilot API server at '
233
+ f'{get_server_url(host)}'
234
+ f'\nView logs at: {constants.API_SERVER_LOGS}')
235
+ time.sleep(0.5)
234
236
  logger.info(ux_utils.finishing_message('SkyPilot API server started.'))
235
237
 
236
238
 
@@ -260,7 +262,8 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:
260
262
 
261
263
 
262
264
  def check_server_healthy_or_start_fn(deploy: bool = False,
263
- host: str = '127.0.0.1'):
265
+ host: str = '127.0.0.1',
266
+ foreground: bool = False):
264
267
  try:
265
268
  check_server_healthy()
266
269
  except exceptions.ApiServerConnectionError as exc:
@@ -268,13 +271,15 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
268
271
  if not is_api_server_local():
269
272
  with ux_utils.print_exception_no_traceback():
270
273
  raise exceptions.ApiServerConnectionError(endpoint) from exc
274
+ # Lock to prevent multiple processes from starting the server at the
275
+ # same time, causing issues with database initialization.
271
276
  with filelock.FileLock(
272
277
  os.path.expanduser(constants.API_SERVER_CREATION_LOCK_PATH)):
273
278
  # Check again if server is already running. Other processes may
274
279
  # have started the server while we were waiting for the lock.
275
280
  api_server_info = get_api_server_status(endpoint)
276
281
  if api_server_info.status == ApiServerStatus.UNHEALTHY:
277
- _start_api_server(deploy, host)
282
+ _start_api_server(deploy, host, foreground)
278
283
 
279
284
 
280
285
  def check_server_healthy_or_start(func):
sky/server/requests/executor.py CHANGED
@@ -371,7 +371,8 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
371
371
  # TODO(aylei): crash the API server or recreate the worker process
372
372
  # to avoid broken state.
373
373
  logger.error(f'[{worker}] Worker process interrupted')
374
- raise
374
+ with ux_utils.print_exception_no_traceback():
375
+ raise
375
376
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
376
377
  # Catch any other exceptions to avoid crashing the worker process.
377
378
  logger.error(
@@ -408,6 +409,7 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
408
409
  f'long requests and will allow at max '
409
410
  f'{max_parallel_for_short} short requests in parallel.')
410
411
 
412
+ sub_procs = []
411
413
  # Setup the queues.
412
414
  if queue_backend == QueueBackend.MULTIPROCESSING:
413
415
  logger.info('Creating shared request queues')
@@ -424,27 +426,26 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
424
426
  queue_server = multiprocessing.Process(
425
427
  target=mp_queue.start_queue_manager, args=(queue_names, port))
426
428
  queue_server.start()
427
-
429
+ sub_procs.append(queue_server)
428
430
  mp_queue.wait_for_queues_to_be_ready(queue_names, port=port)
429
431
 
430
432
  logger.info('Request queues created')
431
433
 
432
- worker_procs = []
433
434
  for worker_id in range(max_parallel_for_long):
434
435
  worker = RequestWorker(id=worker_id,
435
436
  schedule_type=api_requests.ScheduleType.LONG)
436
437
  worker_proc = multiprocessing.Process(target=request_worker,
437
438
  args=(worker, 1))
438
439
  worker_proc.start()
439
- worker_procs.append(worker_proc)
440
+ sub_procs.append(worker_proc)
440
441
 
441
442
  # Start a worker for short requests.
442
443
  worker = RequestWorker(id=1, schedule_type=api_requests.ScheduleType.SHORT)
443
444
  worker_proc = multiprocessing.Process(target=request_worker,
444
445
  args=(worker, max_parallel_for_short))
445
446
  worker_proc.start()
446
- worker_procs.append(worker_proc)
447
- return worker_procs
447
+ sub_procs.append(worker_proc)
448
+ return sub_procs
448
449
 
449
450
 
450
451
  @annotations.lru_cache(scope='global', maxsize=1)
sky/server/requests/payloads.py CHANGED
@@ -450,6 +450,7 @@ class LocalUpBody(RequestBody):
450
450
  ssh_user: Optional[str] = None
451
451
  ssh_key: Optional[str] = None
452
452
  cleanup: bool = False
453
+ context_name: Optional[str] = None
453
454
 
454
455
 
455
456
  class ServeTerminateReplicaBody(RequestBody):
sky/server/requests/serializers/decoders.py CHANGED
@@ -172,12 +172,15 @@ def decode_storage_ls(
172
172
 
173
173
  @register_decoders('job_status')
174
174
  def decode_job_status(
175
- return_value: Dict[int, Optional[str]]
175
+ return_value: Dict[str, Optional[str]]
176
176
  ) -> Dict[int, Optional['job_lib.JobStatus']]:
177
177
  job_statuses: Dict[int, Optional['job_lib.JobStatus']] = {}
178
- for job_id in return_value.keys():
179
- if return_value[job_id] is not None:
180
- job_statuses[job_id] = job_lib.JobStatus(return_value[job_id])
178
+ for job_id_str, status_str in return_value.items():
179
+ # When we json serialize the job ID for storing in the requests db,
180
+ # the job_id gets converted to a string. Here we convert it back to int.
181
+ job_id = int(job_id_str)
182
+ if status_str is not None:
183
+ job_statuses[job_id] = job_lib.JobStatus(status_str)
181
184
  else:
182
185
  job_statuses[job_id] = None
183
186
  return job_statuses
sky/server/server.py CHANGED
@@ -1081,14 +1081,14 @@ if __name__ == '__main__':
1081
1081
  # that it is shown only when the API server is started.
1082
1082
  usage_lib.maybe_show_privacy_policy()
1083
1083
 
1084
- num_workers = None
1084
+ num_workers = 1
1085
1085
  if cmd_args.deploy:
1086
- num_workers = os.cpu_count()
1086
+ num_workers = common_utils.get_cpu_count()
1087
1087
 
1088
- workers = []
1088
+ sub_procs = []
1089
1089
  try:
1090
- workers = executor.start(cmd_args.deploy)
1091
- logger.info('Starting SkyPilot API server')
1090
+ sub_procs = executor.start(cmd_args.deploy)
1091
+ logger.info(f'Starting SkyPilot API server, workers={num_workers}')
1092
1092
  # We don't support reload for now, since it may cause leakage of request
1093
1093
  # workers or interrupt running requests.
1094
1094
  uvicorn.run('sky.server.server:app',
@@ -1101,5 +1101,6 @@ if __name__ == '__main__':
1101
1101
  raise
1102
1102
  finally:
1103
1103
  logger.info('Shutting down SkyPilot API server...')
1104
- for worker in workers:
1105
- worker.terminate()
1104
+ for sub_proc in sub_procs:
1105
+ sub_proc.terminate()
1106
+ sub_proc.join()
sky/setup_files/dependencies.py CHANGED
@@ -123,7 +123,7 @@ extras_require: Dict[str, List[str]] = {
123
123
  'ibm-cloud-sdk-core', 'ibm-vpc', 'ibm-platform-services', 'ibm-cos-sdk'
124
124
  ] + local_ray,
125
125
  'docker': ['docker'] + local_ray,
126
- 'lambda': local_ray,
126
+ 'lambda': [], # No dependencies needed for lambda
127
127
  'cloudflare': aws_dependencies,
128
128
  'scp': local_ray,
129
129
  'oci': ['oci'] + local_ray,
sky/skylet/constants.py CHANGED
@@ -144,12 +144,14 @@ DISABLE_GPU_ECC_COMMAND = (
144
144
  CONDA_INSTALLATION_COMMANDS = (
145
145
  'which conda > /dev/null 2>&1 || '
146
146
  '{ '
147
- 'curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long
147
+ # Use uname -m to get the architecture of the machine and download the
148
+ # corresponding Miniconda3-Linux.sh script.
149
+ 'curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-$(uname -m).sh -o Miniconda3-Linux.sh && ' # pylint: disable=line-too-long
148
150
  # We do not use && for installation of conda and the following init commands
149
151
  # because for some images, conda is already installed, but not initialized.
150
152
  # In this case, we need to initialize conda and set auto_activate_base to
151
153
  # true.
152
- '{ bash Miniconda3-Linux-x86_64.sh -b; '
154
+ '{ bash Miniconda3-Linux.sh -b; '
153
155
  'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
154
156
  # Caller should replace {conda_auto_activate} with either true or false.
155
157
  'conda config --set auto_activate_base {conda_auto_activate} && '
sky/utils/controller_utils.py CHANGED
@@ -124,7 +124,7 @@ class Controllers(enum.Enum):
124
124
  cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
125
125
  in_progress_hint=(
126
126
  f'* To see detailed service status: {colorama.Style.BRIGHT}'
127
- f'sky serve status -a{colorama.Style.RESET_ALL}'),
127
+ f'sky serve status -v{colorama.Style.RESET_ALL}'),
128
128
  decline_cancel_hint=(
129
129
  'Cancelling the sky serve controller\'s jobs is not allowed.'),
130
130
  _decline_down_when_failed_to_fetch_status_hint=(
@@ -246,11 +246,15 @@ def _get_cloud_dependencies_installation_commands(
246
246
  'apt install curl socat netcat -y &> /dev/null; '
247
247
  'fi" && '
248
248
  # Install kubectl
249
+ 'ARCH=$(uname -m) && '
250
+ 'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
251
+ ' ARCH="arm64"; '
252
+ 'else '
253
+ ' ARCH="amd64"; '
254
+ 'fi && '
249
255
  '(command -v kubectl &>/dev/null || '
250
- '(curl -s -LO "https://dl.k8s.io/release/'
251
- '$(curl -L -s https://dl.k8s.io/release/stable.txt)'
252
- '/bin/linux/amd64/kubectl" && '
253
- 'sudo install -o root -g root -m 0755 '
256
+ '("https://dl.k8s.io/release/v1.31.6/bin/linux/$ARCH/kubectl" '
257
+ '&& sudo install -o root -g root -m 0755 '
254
258
  'kubectl /usr/local/bin/kubectl))')
255
259
  elif isinstance(cloud, clouds.Cudo):
256
260
  step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
@@ -9,21 +9,38 @@ YELLOW='\033[1;33m'
9
9
  NC='\033[0m' # No color
10
10
 
11
11
  # Variables
12
+ CLEANUP=false
13
+ INSTALL_GPU=false
14
+ POSITIONAL_ARGS=()
15
+
16
+ # Process all arguments
17
+ while [[ $# -gt 0 ]]; do
18
+ case $1 in
19
+ --cleanup)
20
+ CLEANUP=true
21
+ shift
22
+ ;;
23
+ *)
24
+ POSITIONAL_ARGS+=("$1")
25
+ shift
26
+ ;;
27
+ esac
28
+ done
29
+
30
+ # Restore positional arguments in correct order
31
+ set -- "${POSITIONAL_ARGS[@]}"
32
+
33
+ # Assign positional arguments to variables
12
34
  IPS_FILE=$1
13
35
  USER=$2
14
36
  SSH_KEY=$3
37
+ CONTEXT_NAME=${4:-default}
15
38
  K3S_TOKEN=mytoken # Any string can be used as the token
16
- CLEANUP=false
17
- INSTALL_GPU=false
18
-
19
- if [[ "$4" == "--cleanup" ]]; then
20
- CLEANUP=true
21
- fi
22
39
 
23
40
  # Basic argument checks
24
41
  if [ -z "$IPS_FILE" ] || [ -z "$USER" ] || [ -z "$SSH_KEY" ]; then
25
42
  >&2 echo -e "${RED}Error: Missing required arguments.${NC}"
26
- >&2 echo "Usage: ./deploy_remote_cluster.sh ips.txt username path/to/ssh/key [--cleanup]"
43
+ >&2 echo "Usage: ./deploy_remote_cluster.sh ips.txt username path/to/ssh/key [context-name] [--cleanup]"
27
44
  exit 1
28
45
  fi
29
46
 
@@ -116,6 +133,17 @@ if [ "$CLEANUP" == "true" ]; then
116
133
  cleanup_agent_node "$NODE"
117
134
  done
118
135
 
136
+ # Remove the context from local kubeconfig if it exists
137
+ if [ -f "$HOME/.kube/config" ]; then
138
+ progress_message "Removing context '$CONTEXT_NAME' from local kubeconfig..."
139
+ kubectl config delete-context "$CONTEXT_NAME" 2>/dev/null || true
140
+ kubectl config delete-cluster "$CONTEXT_NAME" 2>/dev/null || true
141
+ kubectl config delete-user "$CONTEXT_NAME" 2>/dev/null || true
142
+ # Update the current context to the first available context
143
+ kubectl config use-context $(kubectl config view -o jsonpath='{.contexts[0].name}') 2>/dev/null || true
144
+ success_message "Context '$CONTEXT_NAME' removed from local kubeconfig."
145
+ fi
146
+
119
147
  echo -e "${GREEN}Cleanup completed successfully.${NC}"
120
148
  exit 0
121
149
  fi
@@ -167,22 +195,25 @@ for NODE in $WORKER_NODES; do
167
195
  done
168
196
  # Step 3: Configure local kubectl to connect to the cluster
169
197
  progress_message "Configuring local kubectl to connect to the cluster..."
170
- scp -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$HEAD_NODE":~/.kube/config ~/.kube/config
171
198
 
172
- # Back up the original kubeconfig file if it exists
199
+ # Create temporary directory for kubeconfig operations
200
+ TEMP_DIR=$(mktemp -d)
201
+ TEMP_KUBECONFIG="$TEMP_DIR/kubeconfig"
202
+
203
+ # Get the kubeconfig from remote server
204
+ scp -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$HEAD_NODE":~/.kube/config "$TEMP_KUBECONFIG"
205
+
206
+ # Create .kube directory if it doesn't exist
207
+ mkdir -p "$HOME/.kube"
208
+
209
+ # Create empty kubeconfig if it doesn't exist
173
210
  KUBECONFIG_FILE="$HOME/.kube/config"
174
- if [[ -f "$KUBECONFIG_FILE" ]]; then
175
- echo "Backing up existing kubeconfig to $KUBECONFIG_FILE.bak"
176
- cp "$KUBECONFIG_FILE" "$KUBECONFIG_FILE.bak"
211
+ if [[ ! -f "$KUBECONFIG_FILE" ]]; then
212
+ touch "$KUBECONFIG_FILE"
177
213
  fi
178
214
 
179
- # Update kubeconfig for the local machine to use the master node's IP
180
- # Temporary file to hold the modified kubeconfig
181
- TEMP_FILE=$(mktemp)
182
-
183
- # Remove the certificate-authority-data, and replace the server with the master address
184
- awk '
185
- BEGIN { in_cluster = 0 }
215
+ # Modify the temporary kubeconfig to update server address and context name
216
+ awk -v context="$CONTEXT_NAME" '
186
217
  /^clusters:/ { in_cluster = 1 }
187
218
  /^users:/ { in_cluster = 0 }
188
219
  in_cluster && /^ *certificate-authority-data:/ { next }
@@ -191,13 +222,24 @@ awk '
191
222
  print " insecure-skip-tls-verify: true"
192
223
  next
193
224
  }
225
+ /name: default/ { sub("name: default", "name: " context) }
226
+ /cluster: default/ { sub("cluster: default", "cluster: " context) }
227
+ /user: default/ { sub("user: default", "user: " context) }
228
+ /current-context: default/ { sub("current-context: default", "current-context: " context) }
194
229
  { print }
195
- ' "$KUBECONFIG_FILE" > "$TEMP_FILE"
230
+ ' "$TEMP_KUBECONFIG" > "$TEMP_DIR/modified_config"
231
+
232
+ # Merge the configurations using kubectl
233
+ KUBECONFIG="$KUBECONFIG_FILE:$TEMP_DIR/modified_config" kubectl config view --flatten > "$TEMP_DIR/merged_config"
234
+ mv "$TEMP_DIR/merged_config" "$KUBECONFIG_FILE"
235
+
236
+ # Set the new context as the current context
237
+ kubectl config use-context "$CONTEXT_NAME"
196
238
 
197
- # Replace the original kubeconfig with the modified one
198
- mv "$TEMP_FILE" "$KUBECONFIG_FILE"
239
+ # Clean up temporary files
240
+ rm -rf "$TEMP_DIR"
199
241
 
200
- success_message "kubectl configured to connect to the cluster."
242
+ success_message "kubectl configured with new context '$CONTEXT_NAME'."
201
243
 
202
244
  echo "Cluster deployment completed. You can now run 'kubectl get nodes' to verify the setup."
203
245
 
@@ -3,7 +3,7 @@ import os
3
3
  import shlex
4
4
  import subprocess
5
5
  import tempfile
6
- from typing import List
6
+ from typing import List, Optional
7
7
 
8
8
  from sky import check as sky_check
9
9
  from sky import sky_logging
@@ -19,8 +19,11 @@ from sky.utils import ux_utils
19
19
  logger = sky_logging.init_logger(__name__)
20
20
 
21
21
 
22
- def deploy_remote_cluster(ip_list: List[str], ssh_user: str, ssh_key: str,
23
- cleanup: bool):
22
+ def deploy_remote_cluster(ip_list: List[str],
23
+ ssh_user: str,
24
+ ssh_key: str,
25
+ cleanup: bool,
26
+ context_name: Optional[str] = None):
24
27
  success = False
25
28
  path_to_package = os.path.dirname(__file__)
26
29
  up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.sh')
@@ -41,6 +44,8 @@ def deploy_remote_cluster(ip_list: List[str], ssh_user: str, ssh_key: str,
41
44
 
42
45
  deploy_command = (f'{up_script_path} {ip_file.name} '
43
46
  f'{ssh_user} {key_file.name}')
47
+ if context_name is not None:
48
+ deploy_command += f' {context_name}'
44
49
  if cleanup:
45
50
  deploy_command += ' --cleanup'
46
51
 
@@ -52,10 +57,6 @@ def deploy_remote_cluster(ip_list: List[str], ssh_user: str, ssh_key: str,
52
57
  log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
53
58
  'local_up.log')
54
59
 
55
- # Check if ~/.kube/config exists:
56
- if os.path.exists(os.path.expanduser('~/.kube/config')):
57
- logger.info('Found existing kube config. '
58
- 'It will be backed up to ~/.kube/config.bak.')
59
60
  if cleanup:
60
61
  msg_str = 'Cleaning up remote cluster...'
61
62
  else:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250228
3
+ Version: 1.0.0.dev20250302
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -75,7 +75,6 @@ Provides-Extra: docker
75
75
  Requires-Dist: docker; extra == "docker"
76
76
  Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "docker"
77
77
  Provides-Extra: lambda
78
- Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "lambda"
79
78
  Provides-Extra: cloudflare
80
79
  Requires-Dist: urllib3<2; extra == "cloudflare"
81
80
  Requires-Dist: awscli>=1.27.10; extra == "cloudflare"
@@ -132,7 +131,6 @@ Requires-Dist: ibm-cos-sdk; extra == "all"
132
131
  Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
133
132
  Requires-Dist: docker; extra == "all"
134
133
  Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
135
- Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
136
134
  Requires-Dist: urllib3<2; extra == "all"
137
135
  Requires-Dist: awscli>=1.27.10; extra == "all"
138
136
  Requires-Dist: botocore>=1.29.10; extra == "all"
@@ -1,12 +1,12 @@
1
- sky/__init__.py,sha256=jLsjhG2RasaqPtD_RYBd_LfTFzhUYIB8j6WkrjZVbKY,6428
1
+ sky/__init__.py,sha256=I1eDvAaLP1jLIfIf2_Y64Cz9tEvM6WDxAniUymSbjN8,6428
2
2
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
3
3
  sky/authentication.py,sha256=hCEqi77nprQEg3ktfRL51xiiw16zwZOmFEDB_Z7fWVU,22384
4
4
  sky/check.py,sha256=NDKx_Zm7YRxPjMv82wz3ESLnGIPljaACyqVdVNM0PzY,11258
5
- sky/cli.py,sha256=K28Bowflmjhir2e3hIPbZhsnfqvBsANSeBSnEXFoy10,219929
6
- sky/cloud_stores.py,sha256=-95XIqi_ouo7hvoN5mQNP6bGm07MyF6Yk-YP4Txb5wg,24034
7
- sky/core.py,sha256=X83hdpPTiWyEJLamrguCd03PUjkRiGgqTFfEBEQkzWc,45471
5
+ sky/cli.py,sha256=-3dv9uu0jp3uRgZYsdI_1SQTE6o0CD1XBql6gMydgHE,220651
6
+ sky/cloud_stores.py,sha256=kEHXd2divyra-1c3EusHxKyM5yTQlTXc6cKVXofsefA,23978
7
+ sky/core.py,sha256=fDCit8n9BAXxJ9Dexq_CCOTmY8uZGK73wEaouNCRAbA,45573
8
8
  sky/dag.py,sha256=Yl7Ry26Vql5cv4YMz8g9kOUgtoCihJnw7c8NgZYakMY,3242
9
- sky/exceptions.py,sha256=cGepNlBkjjgvi3fazc3DbdYLKhhF_sHCuGX0-hu_QMQ,13685
9
+ sky/exceptions.py,sha256=RCAojf2r9sUOdFYEhOoB05G-VP0xwkEVfsYdh2TZypA,13783
10
10
  sky/execution.py,sha256=0M4RTEzWn-B9oz221XdZOIGH12XOACmNq0j-WGUT_No,28023
11
11
  sky/global_user_state.py,sha256=sUDdSsJeiJkbgmZNwy8YGFK0XeNh-RBr1VDUvbmjf0g,33246
12
12
  sky/models.py,sha256=4xSW05BdDPEjW8Ubvj3VlVOVnzv0TbrolsFvR5R5v1U,638
@@ -33,7 +33,7 @@ sky/adaptors/vast.py,sha256=tpvmHi7IkQNzbbHVkeo04kUSajoEpSzXr2XgeO_I1LU,695
33
33
  sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
34
34
  sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
35
35
  sky/backends/backend.py,sha256=4BOqKZ-bwBTpjNnZF4JAHX2m2Iga7EmEn8Ao3tEivaM,7527
36
- sky/backends/backend_utils.py,sha256=egi4xOu1NKZi8HBUTz8hhCOLx-QtEokcM_v6Ix6OUdA,133199
36
+ sky/backends/backend_utils.py,sha256=B_46tG9PyrppxLWdg4mWGuuIr3TEcWTz6qhYXjAY2bw,133452
37
37
  sky/backends/cloud_vm_ray_backend.py,sha256=B8sH-m5pujvkTN_sLENsHw-SWpbaWwlHda26yI_1wnk,247459
38
38
  sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
39
39
  sky/backends/local_docker_backend.py,sha256=nSYCjms3HOPjPNOrcCqsUKm1WV3AAovRFjEQ7hcEXW4,17021
@@ -43,9 +43,9 @@ sky/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
43
43
  sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG2yg,8723
44
44
  sky/benchmark/benchmark_utils.py,sha256=o4RymqSceq5mLEZL0upQM6NVEzJJQzj9s9tTm49uUTc,26365
45
45
  sky/client/__init__.py,sha256=pz6xvVSd9X-gwqbsDL0E9QOojYqM0KAD0j-NCyCIF1k,38
46
- sky/client/cli.py,sha256=K28Bowflmjhir2e3hIPbZhsnfqvBsANSeBSnEXFoy10,219929
46
+ sky/client/cli.py,sha256=-3dv9uu0jp3uRgZYsdI_1SQTE6o0CD1XBql6gMydgHE,220651
47
47
  sky/client/common.py,sha256=axDic7WOG1e78SdFm5XIwdhX7YNvf3g4k7INrsW3X4s,14611
48
- sky/client/sdk.py,sha256=U4v8Khu1lf1oUoBuJUhIFnjsFhYM9x8XcKsnVRMtihI,66990
48
+ sky/client/sdk.py,sha256=zNjZFDTecloxdkGqmSq9wrdKjX9pEMJLghu5bUThJWk,67406
49
49
  sky/clouds/__init__.py,sha256=OW6mJ-9hpJSBORCgt2LippLQEYZHNfnBW1mooRNNvxo,1416
50
50
  sky/clouds/aws.py,sha256=J8tczaTDL239UowN9tUlhI92SeHw01wtFucSckvG63w,54112
51
51
  sky/clouds/azure.py,sha256=bawEw6wOLAVyrjxMD-4UjLCuMj1H5_jH8qggpfZYS54,31703
@@ -53,7 +53,7 @@ sky/clouds/cloud.py,sha256=Ej6WH6VElYdG3PG1-Sp6lFVsJ42uskV4dAg7kmoY4JA,35376
53
53
  sky/clouds/cudo.py,sha256=femv17IUM1TOXuCAg6zljqyFcBGfofbXCNGckpXFHzc,13127
54
54
  sky/clouds/do.py,sha256=hmksx0XML0dVHUZBMV2Wr3a5VilOsYfxX2dSBV_XK5o,11487
55
55
  sky/clouds/fluidstack.py,sha256=Eb0nlfU_EwTtGtV0nPKS2ueBlB0nYiDAN9swA-jjQV0,12446
56
- sky/clouds/gcp.py,sha256=0QpsI0Dso1xs3LhGlUq-Sq6WK-u11wN-57-vfcyhI5I,55154
56
+ sky/clouds/gcp.py,sha256=FUCUq94yGUZ_yyKxA3prRKTqetObbIMkfjAPTPbhXyA,55824
57
57
  sky/clouds/ibm.py,sha256=R4JR96YfXstZ2B_IgFNVEX2SBAq3q0lSWz4y7FoFoeE,21474
58
58
  sky/clouds/kubernetes.py,sha256=7ki_zJZKnkhOPrHgVFq6azy5UhNKeeBOCSTjKCgj3vk,31709
59
59
  sky/clouds/lambda_cloud.py,sha256=ejqA_Wj5-325Y_QjQ__FY4HMO8sv_2tSRsufmaldcmI,12699
@@ -103,7 +103,7 @@ sky/clouds/utils/scp_utils.py,sha256=r4lhRLtNgoz5nmkfN2ctAXYugF_-Et8TYH6ZlbbFfo8
103
103
  sky/data/__init__.py,sha256=Nhaf1NURisXpZuwWANa2IuCyppIuc720FRwqSE2oEwY,184
104
104
  sky/data/data_transfer.py,sha256=wixC4_3_JaeJFdGKOp-O5ulcsMugDSgrCR0SnPpugGc,8946
105
105
  sky/data/data_utils.py,sha256=HjcgMDuWRR_fNQ9gjuROi9GgPVvTGApiJwxGtdb2_UU,28860
106
- sky/data/mounting_utils.py,sha256=la21kp7k51zGoFp9WxT5hf38P_XTqcq-Hm1bJZsPnkg,14949
106
+ sky/data/mounting_utils.py,sha256=i79Y-DZXVR88fjG_MBPB8EgsZBnHdpf1LGnJSm_VhAg,16063
107
107
  sky/data/storage.py,sha256=mTgMGdfSV6Gia076Dvgmc18ZlqF6eObima558UShiXA,207165
108
108
  sky/data/storage_utils.py,sha256=zB99nRTJjh8isU0UmqERmlwwRNgfig91IwrwVH8CcNw,12383
109
109
  sky/jobs/__init__.py,sha256=qoI53-xXE0-SOkrLWigvhgFXjk7dWE0OTqGPYIk-kmM,1458
@@ -228,29 +228,29 @@ sky/serve/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
228
228
  sky/serve/server/core.py,sha256=pRvFadEIH_WTUkTtSmuFoPBP4JFq8Obt68ifi9DWuog,36865
229
229
  sky/serve/server/server.py,sha256=gQGVU9nHYdGbaLhGjIUNIYn4xwKjRASRJkiiTL5AI1Y,3283
230
230
  sky/server/__init__.py,sha256=MPPBqFzXz6Jv5QSk6td_IcvnfXfNErDZVcizu4MLRow,27
231
- sky/server/common.py,sha256=0LphKrp89_sGI-xDakK2uEqI-zKuvbc4OTcuLCiKfmQ,17560
231
+ sky/server/common.py,sha256=pEa-q3P5aOm6RMlit0pVzlDoJnZU_6zViO7aK_7htn0,17843
232
232
  sky/server/constants.py,sha256=89jKE3SIe1T3_7j6ECTy4pZnhZZD7fBwsWOCOkTban8,770
233
- sky/server/server.py,sha256=4ipJG67sBFWylNYdPD1FUhth36yX23XbcROXipRSZsw,42438
233
+ sky/server/server.py,sha256=IDVx6qHl75PRSyc23NLMi0JLjqrTjOHWiCeNvK1D60U,42511
234
234
  sky/server/stream_utils.py,sha256=-3IX1YCgxAFfcvQIV0TCvOn1wbRLWovAx3ckCrsExWU,5651
235
235
  sky/server/html/log.html,sha256=TSGZktua9Ysl_ysg3w60rjxAxhH61AJnsYDHdtqrjmI,6929
236
236
  sky/server/requests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
237
- sky/server/requests/executor.py,sha256=TWX2jYkkstgRyRkWNE19Mgw4_CfzadebOW30iTGdK_Q,19693
238
- sky/server/requests/payloads.py,sha256=QYgEz85jswXkEYxO1mkwPA8MWXD_pETs-g_JH_Tlm_w,16038
237
+ sky/server/requests/executor.py,sha256=Jk8RJoQlicDqaHhgVWMH3UiL-dJS7lGSGd05GPv-Lrc,19781
238
+ sky/server/requests/payloads.py,sha256=75bHfIJ5TR8z3BVVKMYLKxK5BzjtxwawCpwN8B88-FA,16077
239
239
  sky/server/requests/requests.py,sha256=aMdjiK5kjSYP36pxdXFU6qgKOXcOmtViHbFm3V8Dvf8,19590
240
240
  sky/server/requests/queues/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
241
241
  sky/server/requests/queues/mp_queue.py,sha256=_7AFas__0b1L8e7Bwy4lu0VYU18R85YwMlDHPhQCfh0,2998
242
242
  sky/server/requests/serializers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
243
- sky/server/requests/serializers/decoders.py,sha256=iChUnvvXwnAArWm0AVT5eTLt5ZmPfkr4TT32orCSJCM,6624
243
+ sky/server/requests/serializers/decoders.py,sha256=0cpg80uAqkdK_LqcQPkpKswhcNUUztG9luDLm_0eUow,6811
244
244
  sky/server/requests/serializers/encoders.py,sha256=i4SAb5Oyp00CyMkyidbdA9dtxAzxZl40KTpL_x6pH0w,5679
245
245
  sky/setup_files/MANIFEST.in,sha256=cHYG6IdIp7RsDapL4Lrs-WTeYJftHn6qystSolmyyk8,581
246
- sky/setup_files/dependencies.py,sha256=CbuSc7D1YFlonN-TF8qI0khq3jE0K7ueUEPG10RUNIY,6283
246
+ sky/setup_files/dependencies.py,sha256=cm2qeYFeVDv5tHlcSw0xtZzTI6uz4m_WaF2GUIFIgCk,6313
247
247
  sky/setup_files/setup.py,sha256=Q9f0RvsdPC0FLvyTKW-upQtRuA81jRO4TtN3VK-mP-Y,7436
248
248
  sky/skylet/LICENSE,sha256=BnFrJSvUFpMUoH5mOpWnEvaC5R6Uux8W6WXgrte8iYg,12381
249
249
  sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
250
250
  sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,2136
251
251
  sky/skylet/autostop_lib.py,sha256=W4CtMira6QnmYToFT5kYTGjNPRZNC-bZPfsF1k3tluE,4480
252
252
  sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
253
- sky/skylet/constants.py,sha256=WB7bKWvMkgyEf9tAzfKwtM8eYR9OEgJtqwgBT3BEoEM,17923
253
+ sky/skylet/constants.py,sha256=wELTCOQLIafwtl7_L8vjDwdqYQQxG-Sup3H7vxmcyB4,18037
254
254
  sky/skylet/events.py,sha256=pnV3ZiwWhXqTHpU5B5Y9Xwam_7FQDI6IrxgSx7X_NVA,12743
255
255
  sky/skylet/job_lib.py,sha256=8W6GM2zxqGMIoD3AGiXcKsK_7-qouuTojiVL6upSeoA,43728
256
256
  sky/skylet/log_lib.py,sha256=DzOrgY8C7RdEMLC9O9kEKV-iLMb9wVMPSnDha8eMx28,20900
@@ -313,7 +313,7 @@ sky/utils/common.py,sha256=P4oVXFATUYgkruHX92cN12SJBtfb8DiOOYZtbN1kvP0,1927
313
313
  sky/utils/common_utils.py,sha256=1KfqF_hgH9l1ieyV-_3fJd6ma41-tOstj-ihAQcEDIM,31162
314
314
  sky/utils/config_utils.py,sha256=VQ2E3DQ2XysD-kul-diSrxn_pXWsDMfKAev91OiJQ1Q,9041
315
315
  sky/utils/control_master_utils.py,sha256=iD4M0onjYOdZ2RuxjwMBl4KhafHXJzuHjvqlBUnu-VE,1450
316
- sky/utils/controller_utils.py,sha256=Wth_esy2NX9nco-MK01bgQMIChAYky0Uq4T35jQkXxY,48472
316
+ sky/utils/controller_utils.py,sha256=FETdTx1om3Qmxw5W9BNazhV0dWoNdMWYopIEQJRcKSA,48614
317
317
  sky/utils/dag_utils.py,sha256=sAus0aL1wtuuFZSDnpO4LY-6WK4u5iJY952oWQzHo3Y,7532
318
318
  sky/utils/db_utils.py,sha256=K2-OHPg0FeHCarevMdWe0IWzm6wWumViEeYeJuGoFUE,3747
319
319
  sky/utils/env_options.py,sha256=aaD6GoYK0LaZIqjOEZ-R7eccQuiRriW3EuLWtOI5En8,1578
@@ -334,19 +334,19 @@ sky/utils/cli_utils/status_utils.py,sha256=LwGXzMgvnQeGR1fCC24q38hRLuAPeeSDkQ387
334
334
  sky/utils/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
335
335
  sky/utils/kubernetes/create_cluster.sh,sha256=VLXfazav9XCMQmeKVqhuOQzt2vM6G1jgnvvb0SHUFno,7773
336
336
  sky/utils/kubernetes/delete_cluster.sh,sha256=BSccHF43GyepDNf-FZcenzHzpXXATkVD92vgn1lWPgk,927
337
- sky/utils/kubernetes/deploy_remote_cluster.sh,sha256=7FN1Y10fE24GntL_sbQU7z_103rKvXrneru0rjuDhkU,8595
337
+ sky/utils/kubernetes/deploy_remote_cluster.sh,sha256=SGnqa5ks5Og-F96S_PIUbKzuKAtTF5cYOrXIg5XWx_E,10179
338
338
  sky/utils/kubernetes/exec_kubeconfig_converter.py,sha256=fE1SnteoxI05EaugnWeV82hXwZTVHmbXsh1aaZAgF3c,2548
339
339
  sky/utils/kubernetes/generate_kind_config.py,sha256=_TNLnifA_r7-CRq083IP1xjelYqiLjzQX9ohuqYpDH8,3187
340
340
  sky/utils/kubernetes/generate_kubeconfig.sh,sha256=MBvXJio0PeujZSCXiRKE_pa6HCTiU9qBzR1WrXccVSY,10477
341
341
  sky/utils/kubernetes/gpu_labeler.py,sha256=4px7FyfsukacPEvKwTLUNb3WwacMIUrHWjP93qTi3kE,6998
342
342
  sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7ZWF5gdVIZPupCCo9A,1224
343
343
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
344
- sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=iAjfyPclOs8qlALACcfxLpRAO9CZ-h16leFqXZ6tNaY,10096
344
+ sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=otzHzpliHDCpzYT-nU9Q0ZExbiFpDPWvhxwkvchZj7k,10073
345
345
  sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
346
346
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
347
- skypilot_nightly-1.0.0.dev20250228.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
348
- skypilot_nightly-1.0.0.dev20250228.dist-info/METADATA,sha256=IBTEc-5NtZ9KP63RfsWsMmXI6qzs7ZBSdi9bAN8QfAc,19236
349
- skypilot_nightly-1.0.0.dev20250228.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
350
- skypilot_nightly-1.0.0.dev20250228.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
351
- skypilot_nightly-1.0.0.dev20250228.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
352
- skypilot_nightly-1.0.0.dev20250228.dist-info/RECORD,,
347
+ skypilot_nightly-1.0.0.dev20250302.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
348
+ skypilot_nightly-1.0.0.dev20250302.dist-info/METADATA,sha256=KBkT-GZ-9phSYbZBfrXZt0Rx_Ikvyt1o8K0P_iDw4iE,19115
349
+ skypilot_nightly-1.0.0.dev20250302.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
350
+ skypilot_nightly-1.0.0.dev20250302.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
351
+ skypilot_nightly-1.0.0.dev20250302.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
352
+ skypilot_nightly-1.0.0.dev20250302.dist-info/RECORD,,