skypilot-nightly 1.0.0.dev20241019__py3-none-any.whl → 1.0.0.dev20241021__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '9201def0ff1ac73681a82a26d46f56d0b027b03b'
8
+ _SKYPILOT_COMMIT_SHA = '3c3bcee5cfe720a96ab67f4049a557a79e7f077f'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241019'
38
+ __version__ = '1.0.0.dev20241021'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
sky/cli.py CHANGED
@@ -4380,9 +4380,14 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]):
4380
4380
  default=False,
4381
4381
  required=False,
4382
4382
  help='Skip confirmation prompt.')
4383
+ @click.option('--replica-id',
4384
+ default=None,
4385
+ type=int,
4386
+ help='Tear down a given replica')
4383
4387
  # pylint: disable=redefined-builtin
4384
- def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool):
4385
- """Teardown service(s).
4388
+ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool,
4389
+ replica_id: Optional[int]):
4390
+ """Teardown service(s) or a replica.
4386
4391
 
4387
4392
  SERVICE_NAMES is the name of the service (or glob pattern) to tear down. If
4388
4393
  both SERVICE_NAMES and ``--all`` are supplied, the latter takes precedence.
@@ -4408,6 +4413,12 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool):
4408
4413
  \b
4409
4414
  # Forcefully tear down a service in failed status.
4410
4415
  sky serve down failed-service --purge
4416
+ \b
4417
+ # Tear down a specific replica
4418
+ sky serve down my-service --replica-id 1
4419
+ \b
4420
+ # Forcefully tear down a specific replica, even in failed status.
4421
+ sky serve down my-service --replica-id 1 --purge
4411
4422
  """
4412
4423
  if sum([len(service_names) > 0, all]) != 1:
4413
4424
  argument_str = f'SERVICE_NAMES={",".join(service_names)}' if len(
@@ -4417,22 +4428,45 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool):
4417
4428
  'Can only specify one of SERVICE_NAMES or --all. '
4418
4429
  f'Provided {argument_str!r}.')
4419
4430
 
4431
+ replica_id_is_defined = replica_id is not None
4432
+ if replica_id_is_defined:
4433
+ if len(service_names) != 1:
4434
+ service_names_str = ', '.join(service_names)
4435
+ raise click.UsageError(f'The --replica-id option can only be used '
4436
+ f'with a single service name. Got: '
4437
+ f'{service_names_str}.')
4438
+ if all:
4439
+ raise click.UsageError('The --replica-id option cannot be used '
4440
+ 'with the --all option.')
4441
+
4420
4442
  backend_utils.is_controller_accessible(
4421
4443
  controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
4422
4444
  stopped_message='All services should have been terminated.',
4423
4445
  exit_if_not_accessible=True)
4424
4446
 
4425
4447
  if not yes:
4426
- quoted_service_names = [f'{name!r}' for name in service_names]
4427
- service_identity_str = f'service(s) {", ".join(quoted_service_names)}'
4428
- if all:
4429
- service_identity_str = 'all services'
4430
- click.confirm(f'Terminating {service_identity_str}. Proceed?',
4431
- default=True,
4432
- abort=True,
4433
- show_default=True)
4434
-
4435
- serve_lib.down(service_names=service_names, all=all, purge=purge)
4448
+ if replica_id_is_defined:
4449
+ click.confirm(
4450
+ f'Terminating replica ID {replica_id} in '
4451
+ f'{service_names[0]!r}. Proceed?',
4452
+ default=True,
4453
+ abort=True,
4454
+ show_default=True)
4455
+ else:
4456
+ quoted_service_names = [f'{name!r}' for name in service_names]
4457
+ service_identity_str = (f'service(s) '
4458
+ f'{", ".join(quoted_service_names)}')
4459
+ if all:
4460
+ service_identity_str = 'all services'
4461
+ click.confirm(f'Terminating {service_identity_str}. Proceed?',
4462
+ default=True,
4463
+ abort=True,
4464
+ show_default=True)
4465
+
4466
+ if replica_id_is_defined:
4467
+ serve_lib.terminate_replica(service_names[0], replica_id, purge)
4468
+ else:
4469
+ serve_lib.down(service_names=service_names, all=all, purge=purge)
4436
4470
 
4437
4471
 
4438
4472
  @serve.command('logs', cls=_DocumentedCodeCommand)
sky/clouds/azure.py CHANGED
@@ -329,7 +329,6 @@ class Azure(clouds.Cloud):
329
329
  runcmd:
330
330
  - sed -i 's/#Banner none/Banner none/' /etc/ssh/sshd_config
331
331
  - echo '\\nif [ ! -f "/tmp/__restarted" ]; then\\n sudo systemctl restart ssh\\n sleep 2\\n touch /tmp/__restarted\\nfi' >> /home/skypilot:ssh_user/.bashrc
332
- - usermod -aG docker skypilot:ssh_user
333
332
  write_files:
334
333
  - path: /etc/apt/apt.conf.d/20auto-upgrades
335
334
  content: |
sky/clouds/gcp.py CHANGED
@@ -477,6 +477,9 @@ class GCP(clouds.Cloud):
477
477
  'runtime_version']
478
478
  resources_vars['tpu_node_name'] = r.accelerator_args.get(
479
479
  'tpu_name')
480
+ # TPU VMs require privileged mode for docker containers to
481
+ # access TPU devices.
482
+ resources_vars['docker_run_options'] = ['--privileged']
480
483
  else:
481
484
  # Convert to GCP names:
482
485
  # https://cloud.google.com/compute/docs/gpus
@@ -37,10 +37,6 @@ class Lambda(clouds.Cloud):
37
37
  _CLOUD_UNSUPPORTED_FEATURES = {
38
38
  clouds.CloudImplementationFeatures.STOP: 'Lambda cloud does not support stopping VMs.',
39
39
  clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: f'Migrating disk is currently not supported on {_REPR}.',
40
- clouds.CloudImplementationFeatures.DOCKER_IMAGE: (
41
- f'Docker image is currently not supported on {_REPR}. '
42
- 'You can try running docker command inside the `run` section in task.yaml.'
43
- ),
44
40
  clouds.CloudImplementationFeatures.SPOT_INSTANCE: f'Spot instances are not supported in {_REPR}.',
45
41
  clouds.CloudImplementationFeatures.IMAGE_ID: f'Specifying image ID is not supported in {_REPR}.',
46
42
  clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.',
@@ -173,12 +169,20 @@ class Lambda(clouds.Cloud):
173
169
  else:
174
170
  custom_resources = None
175
171
 
176
- return {
172
+ resources_vars = {
177
173
  'instance_type': resources.instance_type,
178
174
  'custom_resources': custom_resources,
179
175
  'region': region.name,
180
176
  }
181
177
 
178
+ if acc_dict is not None:
179
+ # Lambda cloud's docker runtime information does not contain
180
+ # 'nvidia-container-runtime', so no GPU option is added to
181
+ # the docker run command. We patch this by adding it here.
182
+ resources_vars['docker_run_options'] = ['--gpus all']
183
+
184
+ return resources_vars
185
+
182
186
  def _get_feasible_launchable_resources(
183
187
  self, resources: 'resources_lib.Resources'
184
188
  ) -> 'resources_utils.FeasibleResources':
sky/clouds/oci.py CHANGED
@@ -17,6 +17,8 @@ History:
17
17
  make_deploy_resources_variables(): Bug fix for specify the image_id as
18
18
  the ocid of the image in the task.yaml file, in this case the image_id
19
19
  for the node config should be set to the ocid instead of a dict.
20
+ - Hysun He (hysun.he@oracle.com) @ Oct 13, 2024:
21
+ Support more OS types in addition to ubuntu for OCI resources.
20
22
  """
21
23
  import json
22
24
  import logging
@@ -295,10 +297,21 @@ class OCI(clouds.Cloud):
295
297
  cpus=None if cpus is None else float(cpus),
296
298
  disk_tier=resources.disk_tier)
297
299
 
300
+ image_str = self._get_image_str(image_id=resources.image_id,
301
+ instance_type=resources.instance_type,
302
+ region=region.name)
303
+
304
+ # pylint: disable=import-outside-toplevel
305
+ from sky.clouds.service_catalog import oci_catalog
306
+ os_type = oci_catalog.get_image_os_from_tag(tag=image_str,
307
+ region=region.name)
308
+ logger.debug(f'OS type for the image {image_str} is {os_type}')
309
+
298
310
  return {
299
311
  'instance_type': instance_type,
300
312
  'custom_resources': custom_resources,
301
313
  'region': region.name,
314
+ 'os_type': os_type,
302
315
  'cpus': str(cpus),
303
316
  'memory': resources.memory,
304
317
  'disk_size': resources.disk_size,
@@ -501,59 +514,45 @@ class OCI(clouds.Cloud):
501
514
  region_name: str,
502
515
  instance_type: str,
503
516
  ) -> str:
504
- if image_id is None:
505
- return self._get_default_image(region_name=region_name,
506
- instance_type=instance_type)
507
- if None in image_id:
508
- image_id_str = image_id[None]
509
- else:
510
- assert region_name in image_id, image_id
511
- image_id_str = image_id[region_name]
517
+ image_id_str = self._get_image_str(image_id=image_id,
518
+ instance_type=instance_type,
519
+ region=region_name)
520
+
512
521
  if image_id_str.startswith('skypilot:'):
513
522
  image_id_str = service_catalog.get_image_id_from_tag(image_id_str,
514
523
  region_name,
515
524
  clouds='oci')
516
- if image_id_str is None:
517
- logger.critical(
518
- '! Real image_id not found! - {region_name}:{image_id}')
519
- # Raise ResourcesUnavailableError to make sure the failover
520
- # in CloudVMRayBackend will be correctly triggered.
521
- # TODO(zhwu): This is a information leakage to the cloud
522
- # implementor, we need to find a better way to handle this.
523
- raise exceptions.ResourcesUnavailableError(
524
- '! ERR: No image found in catalog for region '
525
- f'{region_name}. Try setting a valid image_id.')
525
+
526
+ # The image ID should never be None here, except when the user
527
+ # specifies an image tag that does not exist in the image.csv
528
+ # catalog file, which is only possible in the "test" / "evaluation" phase.
529
+ # Therefore, we use assert here.
530
+ assert image_id_str is not None
526
531
 
527
532
  logger.debug(f'Got real image_id {image_id_str}')
528
533
  return image_id_str
529
534
 
530
- def _get_default_image(self, region_name: str, instance_type: str) -> str:
535
+ def _get_image_str(self, image_id: Optional[Dict[Optional[str], str]],
536
+ instance_type: str, region: str):
537
+ if image_id is None:
538
+ image_str = self._get_default_image_tag(instance_type)
539
+ elif None in image_id:
540
+ image_str = image_id[None]
541
+ else:
542
+ assert region in image_id, image_id
543
+ image_str = image_id[region]
544
+ return image_str
545
+
546
+ def _get_default_image_tag(self, instance_type: str) -> str:
531
547
  acc = self.get_accelerators_from_instance_type(instance_type)
532
548
 
533
549
  if acc is None:
534
550
  image_tag = oci_utils.oci_config.get_default_image_tag()
535
- image_id_str = service_catalog.get_image_id_from_tag(image_tag,
536
- region_name,
537
- clouds='oci')
538
551
  else:
539
552
  assert len(acc) == 1, acc
540
553
  image_tag = oci_utils.oci_config.get_default_gpu_image_tag()
541
- image_id_str = service_catalog.get_image_id_from_tag(image_tag,
542
- region_name,
543
- clouds='oci')
544
554
 
545
- if image_id_str is not None:
546
- logger.debug(
547
- f'Got default image_id {image_id_str} from tag {image_tag}')
548
- return image_id_str
549
-
550
- # Raise ResourcesUnavailableError to make sure the failover in
551
- # CloudVMRayBackend will be correctly triggered.
552
- # TODO(zhwu): This is a information leakage to the cloud implementor,
553
- # we need to find a better way to handle this.
554
- raise exceptions.ResourcesUnavailableError(
555
- 'ERR: No image found in catalog for region '
556
- f'{region_name}. Try update your default image_id settings.')
555
+ return image_tag
557
556
 
558
557
  def get_vpu_from_disktier(
559
558
  self, cpus: Optional[float],
@@ -7,6 +7,8 @@ History:
7
7
  - Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
8
8
  - Hysun He (hysun.he@oracle.com) @ Jun, 2023: Reduce retry times by
9
9
  excluding those unsubscribed regions.
10
+ - Hysun He (hysun.he@oracle.com) @ Oct 14, 2024: Bug fix for validation
11
+ of the Marketplace images
10
12
  """
11
13
 
12
14
  import logging
@@ -206,4 +208,24 @@ def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
206
208
 
207
209
  def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
208
210
  """Returns whether the image tag is valid."""
211
+ # Oct.14, 2024 by Hysun He: Marketplace images are region neutral, so don't
212
+ # check with region for the Marketplace images.
213
+ df = _image_df[_image_df['Tag'].str.fullmatch(tag)]
214
+ if df.empty:
215
+ return False
216
+ app_catalog_listing_id = df['AppCatalogListingId'].iloc[0]
217
+ if app_catalog_listing_id:
218
+ return True
209
219
  return common.is_image_tag_valid_impl(_image_df, tag, region)
220
+
221
+
222
+ def get_image_os_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
223
+ del region
224
+ df = _image_df[_image_df['Tag'].str.fullmatch(tag)]
225
+ if df.empty:
226
+ os_type = oci_utils.oci_config.get_default_image_os()
227
+ else:
228
+ os_type = df['OS'].iloc[0]
229
+
230
+ logger.debug(f'Operation system for the image {tag} is {os_type}')
231
+ return os_type
@@ -1,7 +1,9 @@
1
1
  """OCI Configuration.
2
2
  History:
3
- - Zhanghao Wu @ Oct 2023: Formatting and refactoring
4
3
  - Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
4
+ - Zhanghao Wu @ Oct 2023: Formatting and refactoring
5
+ - Hysun He (hysun.he@oracle.com) @ Oct, 2024: Add default image OS
6
+ configuration.
5
7
  """
6
8
  import logging
7
9
  import os
@@ -121,5 +123,13 @@ class OCIConfig:
121
123
  return skypilot_config.get_nested(
122
124
  ('oci', 'default', 'oci_config_profile'), 'DEFAULT')
123
125
 
126
+ @classmethod
127
+ def get_default_image_os(cls) -> str:
128
+ # Get the default image OS. Instead of hardcoding, we give a choice to
129
+ # set the default image OS type in the sky's user-config file. (if not
130
+ # specified, fall back to the hardcoded default)
131
+ return skypilot_config.get_nested(('oci', 'default', 'image_os_type'),
132
+ 'ubuntu')
133
+
124
134
 
125
135
  oci_config = OCIConfig()
@@ -253,12 +253,13 @@ class DockerInitializer:
253
253
  # issue with nvidia container toolkit:
254
254
  # https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
255
255
  self._run(
256
- '[ -f /etc/docker/daemon.json ] || '
256
+ '{ which jq || sudo apt update && sudo apt install -y jq; } && '
257
+ '{ [ -f /etc/docker/daemon.json ] || '
257
258
  'echo "{}" | sudo tee /etc/docker/daemon.json;'
258
259
  'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '
259
260
  '/etc/docker/daemon.json > /tmp/daemon.json;'
260
261
  'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
261
- 'sudo systemctl restart docker')
262
+ 'sudo systemctl restart docker; } || true')
262
263
  user_docker_run_options = self.docker_config.get('run_options', [])
263
264
  start_command = docker_start_cmds(
264
265
  specific_image,
@@ -335,7 +336,11 @@ class DockerInitializer:
335
336
 
336
337
  def _check_docker_installed(self):
337
338
  no_exist = 'NoExist'
339
+ # SkyPilot: Add the current user to the docker group first (if needed),
340
+ # before checking if docker is installed to avoid permission issues.
338
341
  cleaned_output = self._run(
342
+ 'id -nG $USER | grep -qw docker || '
343
+ 'sudo usermod -aG docker $USER > /dev/null 2>&1;'
339
344
  f'command -v {self.docker_cmd} || echo {no_exist!r}')
340
345
  if no_exist in cleaned_output or 'docker' not in cleaned_output:
341
346
  logger.error(
@@ -424,8 +429,8 @@ class DockerInitializer:
424
429
  def _check_container_exited(self) -> bool:
425
430
  if self.initialized:
426
431
  return True
427
- output = (self._run(check_docker_running_cmd(self.container_name,
428
- self.docker_cmd),
429
- wait_for_docker_daemon=True))
430
- return 'false' in output.lower(
431
- ) and 'no such object' not in output.lower()
432
+ output = self._run(check_docker_running_cmd(self.container_name,
433
+ self.docker_cmd),
434
+ wait_for_docker_daemon=True)
435
+ return ('false' in output.lower() and
436
+ 'no such object' not in output.lower())
@@ -132,6 +132,8 @@ class PaperspaceCloudClient:
132
132
  'apt-get update \n'
133
133
  'apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin \n' # pylint: disable=line-too-long
134
134
  'fi \n'
135
+ # TODO(tian): Maybe remove this as well since we are now adding
136
+ # users to docker group in the DockerInitializer. Need to test.
135
137
  'usermod -aG docker paperspace \n'
136
138
  f'echo "{public_key}" >> /home/paperspace/.ssh/authorized_keys \n')
137
139
  try:
sky/resources.py CHANGED
@@ -842,12 +842,6 @@ class Resources:
842
842
 
843
843
  if self.extract_docker_image() is not None:
844
844
  # TODO(tian): validate the docker image exists / of reasonable size
845
- if self.accelerators is not None:
846
- for acc in self.accelerators.keys():
847
- if acc.lower().startswith('tpu'):
848
- with ux_utils.print_exception_no_traceback():
849
- raise ValueError(
850
- 'Docker image is not supported for TPU VM.')
851
845
  if self.cloud is not None:
852
846
  self.cloud.check_features_are_supported(
853
847
  self, {clouds.CloudImplementationFeatures.DOCKER_IMAGE})
@@ -1032,6 +1026,12 @@ class Resources:
1032
1026
  self.accelerators is not None):
1033
1027
  initial_setup_commands = [constants.DISABLE_GPU_ECC_COMMAND]
1034
1028
 
1029
+ docker_image = self.extract_docker_image()
1030
+
1031
+ # Cloud specific variables
1032
+ cloud_specific_variables = self.cloud.make_deploy_resources_variables(
1033
+ self, cluster_name, region, zones, dryrun)
1034
+
1035
1035
  # Docker run options
1036
1036
  docker_run_options = skypilot_config.get_nested(
1037
1037
  ('docker', 'run_options'),
@@ -1039,18 +1039,17 @@ class Resources:
1039
1039
  override_configs=self.cluster_config_overrides)
1040
1040
  if isinstance(docker_run_options, str):
1041
1041
  docker_run_options = [docker_run_options]
1042
+ # Special accelerator runtime might require additional docker run
1043
+ # options. e.g., for TPU, we need --privileged.
1044
+ if 'docker_run_options' in cloud_specific_variables:
1045
+ docker_run_options.extend(
1046
+ cloud_specific_variables['docker_run_options'])
1042
1047
  if docker_run_options and isinstance(self.cloud, clouds.Kubernetes):
1043
1048
  logger.warning(
1044
1049
  f'{colorama.Style.DIM}Docker run options are specified, '
1045
1050
  'but ignored for Kubernetes: '
1046
1051
  f'{" ".join(docker_run_options)}'
1047
1052
  f'{colorama.Style.RESET_ALL}')
1048
-
1049
- docker_image = self.extract_docker_image()
1050
-
1051
- # Cloud specific variables
1052
- cloud_specific_variables = self.cloud.make_deploy_resources_variables(
1053
- self, cluster_name, region, zones, dryrun)
1054
1053
  return dict(
1055
1054
  cloud_specific_variables,
1056
1055
  **{
sky/serve/__init__.py CHANGED
@@ -8,6 +8,7 @@ from sky.serve.constants import SKYSERVE_METADATA_DIR
8
8
  from sky.serve.core import down
9
9
  from sky.serve.core import status
10
10
  from sky.serve.core import tail_logs
11
+ from sky.serve.core import terminate_replica
11
12
  from sky.serve.core import up
12
13
  from sky.serve.core import update
13
14
  from sky.serve.serve_state import ReplicaStatus
@@ -42,6 +43,7 @@ __all__ = [
42
43
  'SKY_SERVE_CONTROLLER_NAME',
43
44
  'SKYSERVE_METADATA_DIR',
44
45
  'status',
46
+ 'terminate_replica',
45
47
  'tail_logs',
46
48
  'up',
47
49
  'update',
sky/serve/constants.py CHANGED
@@ -92,4 +92,11 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
92
92
  # change for the serve_utils.ServeCodeGen, we need to bump this version, so that
93
93
  # the user can be notified to update their SkyPilot serve version on the remote
94
94
  # cluster.
95
- SERVE_VERSION = 1
95
+ # Changelog:
96
+ # v1.0 - Introduce rolling update.
97
+ # v2.0 - Added terminate-replica feature.
98
+ SERVE_VERSION = 2
99
+
100
+ TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
101
+ 'The version of service is outdated and does not support manually '
102
+ 'terminating replicas. Please terminate the service and spin up again.')
sky/serve/controller.py CHANGED
@@ -9,6 +9,7 @@ import time
9
9
  import traceback
10
10
  from typing import Any, Dict, List
11
11
 
12
+ import colorama
12
13
  import fastapi
13
14
  from fastapi import responses
14
15
  import uvicorn
@@ -157,6 +158,75 @@ class SkyServeController:
157
158
  return responses.JSONResponse(content={'message': 'Error'},
158
159
  status_code=500)
159
160
 
161
+ @self._app.post('/controller/terminate_replica')
162
+ async def terminate_replica(
163
+ request: fastapi.Request) -> fastapi.Response:
164
+ request_data = await request.json()
165
+ replica_id = request_data['replica_id']
166
+ assert isinstance(replica_id,
167
+ int), 'Error: replica ID must be an integer.'
168
+ purge = request_data['purge']
169
+ assert isinstance(purge, bool), 'Error: purge must be a boolean.'
170
+ replica_info = serve_state.get_replica_info_from_id(
171
+ self._service_name, replica_id)
172
+ assert replica_info is not None, (f'Error: replica '
173
+ f'{replica_id} does not exist.')
174
+ replica_status = replica_info.status
175
+
176
+ if replica_status == serve_state.ReplicaStatus.SHUTTING_DOWN:
177
+ return responses.JSONResponse(
178
+ status_code=409,
179
+ content={
180
+ 'message':
181
+ f'Replica {replica_id} of service '
182
+ f'{self._service_name!r} is already in the process '
183
+ f'of terminating. Skip terminating now.'
184
+ })
185
+
186
+ if (replica_status in serve_state.ReplicaStatus.failed_statuses()
187
+ and not purge):
188
+ return responses.JSONResponse(
189
+ status_code=409,
190
+ content={
191
+ 'message': f'{colorama.Fore.YELLOW}Replica '
192
+ f'{replica_id} of service '
193
+ f'{self._service_name!r} is in failed '
194
+ f'status ({replica_info.status}). '
195
+ f'Skipping its termination as it could '
196
+ f'lead to a resource leak. '
197
+ f'(Use `sky serve down '
198
+ f'{self._service_name!r} --replica-id '
199
+ f'{replica_id} --purge` to '
200
+ 'forcefully terminate the replica.)'
201
+ f'{colorama.Style.RESET_ALL}'
202
+ })
203
+
204
+ self._replica_manager.scale_down(replica_id, purge=purge)
205
+
206
+ action = 'terminated' if not purge else 'purged'
207
+ message = (f'{colorama.Fore.GREEN}Replica {replica_id} of service '
208
+ f'{self._service_name!r} is scheduled to be '
209
+ f'{action}.{colorama.Style.RESET_ALL}\n'
210
+ f'Please use {ux_utils.BOLD}sky serve status '
211
+ f'{self._service_name}{ux_utils.RESET_BOLD} '
212
+ f'to check the latest status.')
213
+ return responses.JSONResponse(status_code=200,
214
+ content={'message': message})
215
+
216
+ @self._app.exception_handler(Exception)
217
+ async def validation_exception_handler(
218
+ request: fastapi.Request, exc: Exception) -> fastapi.Response:
219
+ with ux_utils.enable_traceback():
220
+ logger.error(f'Error in controller: {exc!r}')
221
+ return responses.JSONResponse(
222
+ status_code=500,
223
+ content={
224
+ 'message':
225
+ (f'Failed method {request.method} at URL {request.url}.'
226
+ f' Exception message is {exc!r}.')
227
+ },
228
+ )
229
+
160
230
  threading.Thread(target=self._run_autoscaler).start()
161
231
 
162
232
  logger.info('SkyServe Controller started on '
sky/serve/core.py CHANGED
@@ -503,6 +503,53 @@ def down(
503
503
  sky_logging.print(stdout)
504
504
 
505
505
 
506
+ @usage_lib.entrypoint
507
+ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> None:
508
+ """Tear down a specific replica for the given service.
509
+
510
+ Args:
511
+ service_name: Name of the service.
512
+ replica_id: ID of replica to terminate.
513
+ purge: Whether to terminate replicas in a failed status. These replicas
514
+ may lead to resource leaks, so we require the user to explicitly
515
+ specify this flag to make sure they are aware of this potential
516
+ resource leak.
517
+
518
+ Raises:
519
+ sky.exceptions.ClusterNotUpError: if the sky serve controller is not up.
520
+ RuntimeError: if failed to terminate the replica.
521
+ """
522
+ handle = backend_utils.is_controller_accessible(
523
+ controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
524
+ stopped_message=
525
+ 'No service is running now. Please spin up a service first.',
526
+ non_existent_message='No service is running now. '
527
+ 'Please spin up a service first.',
528
+ )
529
+
530
+ backend = backend_utils.get_backend_from_handle(handle)
531
+ assert isinstance(backend, backends.CloudVmRayBackend)
532
+
533
+ code = serve_utils.ServeCodeGen.terminate_replica(service_name, replica_id,
534
+ purge)
535
+ returncode, stdout, stderr = backend.run_on_head(handle,
536
+ code,
537
+ require_outputs=True,
538
+ stream_logs=False,
539
+ separate_stderr=True)
540
+
541
+ try:
542
+ subprocess_utils.handle_returncode(returncode,
543
+ code,
544
+ 'Failed to terminate the replica',
545
+ stderr,
546
+ stream_logs=True)
547
+ except exceptions.CommandError as e:
548
+ raise RuntimeError(e.error_msg) from e
549
+
550
+ sky_logging.print(stdout)
551
+
552
+
506
553
  @usage_lib.entrypoint
507
554
  def status(
508
555
  service_names: Optional[Union[str,
@@ -247,6 +247,8 @@ class ReplicaStatusProperty:
247
247
  is_scale_down: bool = False
248
248
  # The replica's spot instance was preempted.
249
249
  preempted: bool = False
250
+ # Whether the replica is purged.
251
+ purged: bool = False
250
252
 
251
253
  def remove_terminated_replica(self) -> bool:
252
254
  """Whether to remove the replica record from the replica table.
@@ -307,6 +309,8 @@ class ReplicaStatusProperty:
307
309
  return False
308
310
  if self.preempted:
309
311
  return False
312
+ if self.purged:
313
+ return False
310
314
  return True
311
315
 
312
316
  def to_replica_status(self) -> serve_state.ReplicaStatus:
@@ -590,7 +594,7 @@ class ReplicaManager:
590
594
  """
591
595
  raise NotImplementedError
592
596
 
593
- def scale_down(self, replica_id: int) -> None:
597
+ def scale_down(self, replica_id: int, purge: bool = False) -> None:
594
598
  """Scale down replica with replica_id."""
595
599
  raise NotImplementedError
596
600
 
@@ -679,7 +683,8 @@ class SkyPilotReplicaManager(ReplicaManager):
679
683
  replica_id: int,
680
684
  sync_down_logs: bool,
681
685
  replica_drain_delay_seconds: int,
682
- is_scale_down: bool = False) -> None:
686
+ is_scale_down: bool = False,
687
+ purge: bool = False) -> None:
683
688
 
684
689
  if replica_id in self._launch_process_pool:
685
690
  info = serve_state.get_replica_info_from_id(self._service_name,
@@ -763,16 +768,18 @@ class SkyPilotReplicaManager(ReplicaManager):
763
768
  )
764
769
  info.status_property.sky_down_status = ProcessStatus.RUNNING
765
770
  info.status_property.is_scale_down = is_scale_down
771
+ info.status_property.purged = purge
766
772
  serve_state.add_or_update_replica(self._service_name, replica_id, info)
767
773
  p.start()
768
774
  self._down_process_pool[replica_id] = p
769
775
 
770
- def scale_down(self, replica_id: int) -> None:
776
+ def scale_down(self, replica_id: int, purge: bool = False) -> None:
771
777
  self._terminate_replica(
772
778
  replica_id,
773
779
  sync_down_logs=False,
774
780
  replica_drain_delay_seconds=_DEFAULT_DRAIN_SECONDS,
775
- is_scale_down=True)
781
+ is_scale_down=True,
782
+ purge=purge)
776
783
 
777
784
  def _handle_preemption(self, info: ReplicaInfo) -> bool:
778
785
  """Handle preemption of the replica if any error happened.
@@ -911,6 +918,8 @@ class SkyPilotReplicaManager(ReplicaManager):
911
918
  # since user should fixed the error before update.
912
919
  elif info.version != self.latest_version:
913
920
  removal_reason = 'for version outdated'
921
+ elif info.status_property.purged:
922
+ removal_reason = 'for purge'
914
923
  else:
915
924
  logger.info(f'Termination of replica {replica_id} '
916
925
  'finished. Replica info is kept since some '
sky/serve/serve_utils.py CHANGED
@@ -246,9 +246,11 @@ def set_service_status_and_active_versions_from_replica(
246
246
  update_mode: UpdateMode) -> None:
247
247
  record = serve_state.get_service_from_name(service_name)
248
248
  if record is None:
249
- raise ValueError('The service is up-ed in an old version and does not '
250
- 'support update. Please `sky serve down` '
251
- 'it first and relaunch the service.')
249
+ with ux_utils.print_exception_no_traceback():
250
+ raise ValueError(
251
+ 'The service is up-ed in an old version and does not '
252
+ 'support update. Please `sky serve down` '
253
+ 'it first and relaunch the service.')
252
254
  if record['status'] == serve_state.ServiceStatus.SHUTTING_DOWN:
253
255
  # When the service is shutting down, there is a period of time which the
254
256
  # controller still responds to the request, and the replica is not
@@ -289,7 +291,8 @@ def update_service_status() -> None:
289
291
  def update_service_encoded(service_name: str, version: int, mode: str) -> str:
290
292
  service_status = _get_service_status(service_name)
291
293
  if service_status is None:
292
- raise ValueError(f'Service {service_name!r} does not exist.')
294
+ with ux_utils.print_exception_no_traceback():
295
+ raise ValueError(f'Service {service_name!r} does not exist.')
293
296
  controller_port = service_status['controller_port']
294
297
  resp = requests.post(
295
298
  _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
@@ -299,20 +302,56 @@ def update_service_encoded(service_name: str, version: int, mode: str) -> str:
299
302
  'mode': mode,
300
303
  })
301
304
  if resp.status_code == 404:
302
- raise ValueError('The service is up-ed in an old version and does not '
303
- 'support update. Please `sky serve down` '
304
- 'it first and relaunch the service. ')
305
+ with ux_utils.print_exception_no_traceback():
306
+ raise ValueError(
307
+ 'The service is up-ed in an old version and does not '
308
+ 'support update. Please `sky serve down` '
309
+ 'it first and relaunch the service. ')
305
310
  elif resp.status_code == 400:
306
- raise ValueError(f'Client error during service update: {resp.text}')
311
+ with ux_utils.print_exception_no_traceback():
312
+ raise ValueError(f'Client error during service update: {resp.text}')
307
313
  elif resp.status_code == 500:
308
- raise RuntimeError(f'Server error during service update: {resp.text}')
314
+ with ux_utils.print_exception_no_traceback():
315
+ raise RuntimeError(
316
+ f'Server error during service update: {resp.text}')
309
317
  elif resp.status_code != 200:
310
- raise ValueError(f'Failed to update service: {resp.text}')
318
+ with ux_utils.print_exception_no_traceback():
319
+ raise ValueError(f'Failed to update service: {resp.text}')
311
320
 
312
321
  service_msg = resp.json()['message']
313
322
  return common_utils.encode_payload(service_msg)
314
323
 
315
324
 
325
+ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
326
+ service_status = _get_service_status(service_name)
327
+ if service_status is None:
328
+ with ux_utils.print_exception_no_traceback():
329
+ raise ValueError(f'Service {service_name!r} does not exist.')
330
+ replica_info = serve_state.get_replica_info_from_id(service_name,
331
+ replica_id)
332
+ if replica_info is None:
333
+ with ux_utils.print_exception_no_traceback():
334
+ raise ValueError(
335
+ f'Replica {replica_id} for service {service_name} does not '
336
+ 'exist.')
337
+
338
+ controller_port = service_status['controller_port']
339
+ resp = requests.post(
340
+ _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
341
+ '/controller/terminate_replica',
342
+ json={
343
+ 'replica_id': replica_id,
344
+ 'purge': purge,
345
+ })
346
+
347
+ message: str = resp.json()['message']
348
+ if resp.status_code != 200:
349
+ with ux_utils.print_exception_no_traceback():
350
+ raise ValueError(f'Failed to terminate replica {replica_id} '
351
+ f'in {service_name}. Reason:\n{message}')
352
+ return message
353
+
354
+
316
355
  def _get_service_status(
317
356
  service_name: str,
318
357
  with_replica_info: bool = True) -> Optional[Dict[str, Any]]:
@@ -644,8 +683,9 @@ def stream_replica_logs(service_name: str, replica_id: int,
644
683
  for info in replica_info:
645
684
  if info.replica_id == replica_id:
646
685
  return info.status
647
- raise ValueError(
648
- _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id))
686
+ with ux_utils.print_exception_no_traceback():
687
+ raise ValueError(
688
+ _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id))
649
689
 
650
690
  finish_stream = (
651
691
  lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
@@ -735,7 +775,7 @@ def _get_replicas(service_record: Dict[str, Any]) -> str:
735
775
 
736
776
 
737
777
  def get_endpoint(service_record: Dict[str, Any]) -> str:
738
- # Don't use backend_utils.is_controller_up since it is too slow.
778
+ # Don't use backend_utils.is_controller_accessible since it is too slow.
739
779
  handle = global_user_state.get_handle_from_cluster_name(
740
780
  SKY_SERVE_CONTROLLER_NAME)
741
781
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
@@ -915,6 +955,18 @@ class ServeCodeGen:
915
955
  ]
916
956
  return cls._build(code)
917
957
 
958
+ @classmethod
959
+ def terminate_replica(cls, service_name: str, replica_id: int,
960
+ purge: bool) -> str:
961
+ code = [
962
+ f'(lambda: print(serve_utils.terminate_replica({service_name!r}, '
963
+ f'{replica_id}, {purge}), end="", flush=True) '
964
+ 'if getattr(constants, "SERVE_VERSION", 0) >= 2 else '
965
+ f'exec("raise RuntimeError('
966
+ f'{constants.TERMINATE_REPLICA_VERSION_MISMATCH_ERROR!r})"))()'
967
+ ]
968
+ return cls._build(code)
969
+
918
970
  @classmethod
919
971
  def wait_service_registration(cls, service_name: str, job_id: int) -> str:
920
972
  code = [
@@ -5,6 +5,26 @@ max_workers: {{num_nodes - 1}}
5
5
  upscaling_speed: {{num_nodes - 1}}
6
6
  idle_timeout_minutes: 60
7
7
 
8
+ {%- if docker_image is not none %}
9
+ docker:
10
+ image: {{docker_image}}
11
+ container_name: {{docker_container_name}}
12
+ run_options:
13
+ - --ulimit nofile=1048576:1048576
14
+ {%- for run_option in docker_run_options %}
15
+ - {{run_option}}
16
+ {%- endfor %}
17
+ {%- if docker_login_config is not none %}
18
+ docker_login_config:
19
+ username: |-
20
+ {{docker_login_config.username}}
21
+ password: |-
22
+ {{docker_login_config.password}}
23
+ server: |-
24
+ {{docker_login_config.server}}
25
+ {%- endif %}
26
+ {%- endif %}
27
+
8
28
  provider:
9
29
  type: external
10
30
  module: sky.provision.lambda
@@ -16,7 +16,11 @@ provider:
16
16
  disable_launch_config_check: true
17
17
 
18
18
  auth:
19
+ {% if os_type == "ubuntu" %}
19
20
  ssh_user: ubuntu
21
+ {% else %}
22
+ ssh_user: opc
23
+ {% endif %}
20
24
  ssh_private_key: {{ssh_private_key}}
21
25
 
22
26
  available_node_types:
@@ -85,14 +89,20 @@ setup_commands:
85
89
  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
86
90
  # Line 'mkdir -p ..': disable host key check
87
91
  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
88
- - sudo systemctl stop unattended-upgrades || true;
92
+ - echo "setup commands runs at $(date)" > /tmp/provision.tmp.out || true;
93
+ {%- if os_type == "ubuntu" %}
94
+ sudo systemctl stop unattended-upgrades || true;
89
95
  sudo systemctl disable unattended-upgrades || true;
90
96
  sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
91
97
  sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
92
98
  sudo pkill -9 apt-get;
93
99
  sudo pkill -9 dpkg;
94
100
  sudo dpkg --configure -a;
95
- ([ `sudo lshw -class display | grep "NVIDIA Corporation" | wc -l` -gt 0 ]) && (sudo which nvidia-smi > /dev/null || ( sudo apt-get install nvidia-driver-530-open -y && sudo apt-get install nvidia-driver-525-server -y ) || true);
101
+ {%- else %}
102
+ sudo /usr/libexec/oci-growfs -y || true;
103
+ sudo systemctl stop firewalld || true;
104
+ sudo systemctl disable firewalld || true;
105
+ {%- endif %}
96
106
  mkdir -p ~/.ssh; touch ~/.ssh/config;
97
107
  {{ conda_installation_commands }}
98
108
  {{ ray_skypilot_installation_commands }}
@@ -502,8 +502,10 @@ class SSHCommandRunner(CommandRunner):
502
502
  if self.ssh_control_name is not None:
503
503
  control_path = _ssh_control_path(self.ssh_control_name)
504
504
  if control_path is not None:
505
+ # Suppress the `Exit request sent.` output for this command
506
+ # which would interrupt the CLI spinner.
505
507
  cmd = (f'ssh -O exit -S {control_path}/%C '
506
- f'{self.ssh_user}@{self.ip}')
508
+ f'{self.ssh_user}@{self.ip} > /dev/null 2>&1')
507
509
  logger.debug(f'Closing cached connection {control_path!r} with '
508
510
  f'cmd: {cmd}')
509
511
  log_lib.run_with_log(cmd,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20241019
3
+ Version: 1.0.0.dev20241021
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -1,8 +1,8 @@
1
- sky/__init__.py,sha256=LKUWX7CbWSSP_7RKe0BbpUtwfW9AyCM0C6t-6csG3dI,5854
1
+ sky/__init__.py,sha256=3RKD64rxAs9PlurlTvQY9SSAILIPEznrfVw4n_oyctk,5854
2
2
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
3
3
  sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
4
4
  sky/check.py,sha256=jLMIIJrseaZj1_o5WkbaD9XdyXIlCaT6pyAaIFdhdmA,9079
5
- sky/cli.py,sha256=PJR6W92twf89j17OWLQJ9RawdazJcGslfW2L_fLB2PM,208545
5
+ sky/cli.py,sha256=XcQeVtH5J7xcARGagYicmTUfd2145jN2nvnZaZXtZlI,209981
6
6
  sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
7
7
  sky/core.py,sha256=DW9OGE2kS2CmsvQ1grrpRnNFS3woMGWSHu5GE99e-I4,38190
8
8
  sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
@@ -10,7 +10,7 @@ sky/exceptions.py,sha256=D7WARzYRt4dGjXo6gI-gzkoodZbKF1D-qncm_DbHB28,8846
10
10
  sky/execution.py,sha256=CbrKMgfc2JgLqZqwPvmYKxbWAQKYqHpOLpUEOb-k2m0,24679
11
11
  sky/global_user_state.py,sha256=PywEmUutF97XBgRMClR6IS5_KM8JJC0oA1LsPUZebp0,28681
12
12
  sky/optimizer.py,sha256=OzxWiA6ZC0tyJ1eNMy4e72vitjfLKfbOLF9ywZOccXU,59343
13
- sky/resources.py,sha256=b9yaZvZkL-QZdElQLHsEZ2jhKgId2ixG8M2Z8DLBBKU,67450
13
+ sky/resources.py,sha256=Qk_CYvLO8OFsnRLqXu-nG6qXfJEZ2aBMzxFJHYaXTvE,67398
14
14
  sky/sky_logging.py,sha256=oLmTmwkuucIto3LHXLJfMcyRpYSkmZAZa5XzQPA5IHk,4434
15
15
  sky/skypilot_config.py,sha256=E3g65cX3P3dT9b5N0GgFBG6yB0FXwIGpisKoozmJmWU,9094
16
16
  sky/status_lib.py,sha256=J7Jb4_Dz0v2T64ttOdyUgpokvl4S0sBJrMfH7Fvo51A,1457
@@ -41,16 +41,16 @@ sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG
41
41
  sky/benchmark/benchmark_utils.py,sha256=eb-i6zYoo-Zkod-T9qtCu1FcYLw--Yyos1SyibUPZNE,26194
42
42
  sky/clouds/__init__.py,sha256=WuNIJEnZmBO72tU5awgaaL3rdvFRSkgaYNNeuY68dXo,1356
43
43
  sky/clouds/aws.py,sha256=XJVbOSkVVUHp9HbHDp0rFdHX113JHbY-3sgokGdNJVE,49527
44
- sky/clouds/azure.py,sha256=Yp_a1Lzvq4s47eRMeyVheDv9pC0hSPogCiTMYf-a5ZE,28687
44
+ sky/clouds/azure.py,sha256=jTgynKU5tuOyBe97n2I7_k9P0Sw0QFU-6wLDLFwQhfM,28634
45
45
  sky/clouds/cloud.py,sha256=PPk-Cbf1YbJT8bswcQLtPBtko02OWrRGJKkLzDpytTI,34858
46
46
  sky/clouds/cloud_registry.py,sha256=4yQMv-iBSgyN5aNL4Qxbn0JVE-dkVoEUIgj7S1z9S_Q,955
47
47
  sky/clouds/cudo.py,sha256=H4VyMo5wWGAv2MXZ3xsbWjlZA_cZYnt4ecNlTOOao8Y,13147
48
48
  sky/clouds/fluidstack.py,sha256=iOmoOx52yTrHKMzwBDaxFJCfNo79M61d5tj-Np24Lyc,12436
49
- sky/clouds/gcp.py,sha256=lUImS2WJIcUOtrgrVz8zaR4yPGqALqZ0lSmLbjN9xLU,54470
49
+ sky/clouds/gcp.py,sha256=m_dH04HqgU-DdW4R9wrSr66IpPt9JMKHEvHEGGFpeRo,54655
50
50
  sky/clouds/ibm.py,sha256=M8QdjeSFlwssfoY2aOodxG4q5R3eT9K-4lTPDHYvEYI,21476
51
51
  sky/clouds/kubernetes.py,sha256=aWoXWR-S4puZHzuUHroLKxLdTpkqU7j75dQlXECnsmE,28679
52
- sky/clouds/lambda_cloud.py,sha256=VtJ2mmwMT1X4zrzgt3FXM61zmrrgoELZHFgsdYVesPY,12562
53
- sky/clouds/oci.py,sha256=WXtxKwDBgi3He4ayi4qzJ4Y659Bi6xU8hWmYLHwiQYs,27371
52
+ sky/clouds/lambda_cloud.py,sha256=11dKUSunHUgaPZ1t8O85X29_NJ-o26sCt5DjwAPFgl4,12697
53
+ sky/clouds/oci.py,sha256=ecVgcbCVJwDLtaYXs-yGDzwPYRr23KvjnzFOXwaY2O0,26914
54
54
  sky/clouds/paperspace.py,sha256=lmUZPYAblaqiBmGQwCunccMiTF_dVA1o3vqY9Q_Nc28,10921
55
55
  sky/clouds/runpod.py,sha256=lstUC6f4JDhtcH9NfwkbpCJMmfmvMigoanhPXPbTYds,11540
56
56
  sky/clouds/scp.py,sha256=2KLTuNSMdBzK8CLwSesv7efOuiLidIMoyNG4AOt5Sqw,15870
@@ -67,7 +67,7 @@ sky/clouds/service_catalog/gcp_catalog.py,sha256=v_5fsB3dB9oD8U7lBKnCe5ii6AUWEOi
67
67
  sky/clouds/service_catalog/ibm_catalog.py,sha256=0dzjmXABFECzaAuIa0E6pVINhVK6-G6U52Mj-L45gK8,4472
68
68
  sky/clouds/service_catalog/kubernetes_catalog.py,sha256=6OocEUkgyJtBgHwzu4RPsvru6pj6RwGU-4uSFNQmsSM,8254
69
69
  sky/clouds/service_catalog/lambda_catalog.py,sha256=BAhUGqHj8aVe1zUhEQNO7bQUhcd9jAespGvPyQubTJY,5281
70
- sky/clouds/service_catalog/oci_catalog.py,sha256=tcV8_rsv_7_aTlcfTkq0XKdKRTFgwh8-rjyxVzPiYwQ,7744
70
+ sky/clouds/service_catalog/oci_catalog.py,sha256=AG1mOgc-iWaX4zapONWMZPNd2RCKCsaNOyFc0eq_LFU,8551
71
71
  sky/clouds/service_catalog/paperspace_catalog.py,sha256=W8GgGlPbbWViELQ8EZfmIkxSbeQcCmMRUX4ecIIYDsk,3768
72
72
  sky/clouds/service_catalog/runpod_catalog.py,sha256=NwZlolzihZeRxQKYIDhoXeUkJ3BSH1S6B_DszNDXT1g,4184
73
73
  sky/clouds/service_catalog/scp_catalog.py,sha256=4XnaZE5Q4XrrNnDnVhsHkH6jxmWXBeQqa9QqKqHKjSI,5174
@@ -84,7 +84,7 @@ sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=SF_gTU74qg6L-DS
84
84
  sky/clouds/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
85
85
  sky/clouds/utils/aws_utils.py,sha256=W5BRC-2F_VY4BymRA1kS6-MufsI3V8cfY_hv--4gJBU,1986
86
86
  sky/clouds/utils/gcp_utils.py,sha256=Xc_COjJfDt__oqVwrCw7ejY2B7ptHjMjDVb8obcpJ6s,6968
87
- sky/clouds/utils/oci_utils.py,sha256=LT_RtPQ2B1wlSF0e9PSD3NWxFFIzovcZeDjO-dyOghU,4482
87
+ sky/clouds/utils/oci_utils.py,sha256=t-5QEQEs8swN683AAp-oDD6yQJOQqVBbsVcHkNyqnbU,4968
88
88
  sky/clouds/utils/scp_utils.py,sha256=RUp7NwyhKygOoVOwvdAOGdoQNSJjryOG6WSExCf-yas,15812
89
89
  sky/data/__init__.py,sha256=Nhaf1NURisXpZuwWANa2IuCyppIuc720FRwqSE2oEwY,184
90
90
  sky/data/data_transfer.py,sha256=MBmjey9_p2L3IKNKTi8um09SlZe32n4wK3CkVnlTVvo,7346
@@ -105,7 +105,7 @@ sky/jobs/dashboard/templates/index.html,sha256=DBKMYEkkJ6sgLYod9ro7drgL8Y_neDsCx
105
105
  sky/provision/__init__.py,sha256=UhYsGRribEyK1--PPT0Dom9051jlpdn8UCNhO8qpPOc,6262
106
106
  sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
107
107
  sky/provision/constants.py,sha256=DvHj3wpqdpaSBHMOGIfVWLLWGJoz0eOQAx73DwYMNEk,531
108
- sky/provision/docker_utils.py,sha256=Z7vDUs9Yjqks_CsWrACcTgABIZuFi3EJVFwkU0WsdD0,18832
108
+ sky/provision/docker_utils.py,sha256=cKYasCwbMf6C2_0vTxg2GvbrnhFvko-xDl1frfm7wxc,19199
109
109
  sky/provision/instance_setup.py,sha256=n1Px_KOYZl7Rf1WLXrfTTHyqxyA8_5QTN9BNLjQRkgc,22427
110
110
  sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
111
111
  sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
@@ -151,7 +151,7 @@ sky/provision/paperspace/__init__.py,sha256=1nbUPWio7UA5gCQkO_rfEDfgXT17u5OtuByx
151
151
  sky/provision/paperspace/config.py,sha256=oNmffSt-V466pE0DmML8hOCX1CiA24jAqE5JEKuqpyI,1541
152
152
  sky/provision/paperspace/constants.py,sha256=NcLJGivJxshJwhR28yVHysWQ2gtMAkTVmHC91d3kyKM,957
153
153
  sky/provision/paperspace/instance.py,sha256=q_V01DZSMXLfy63Zwt6AQotq02JuXQZb5CHS_JttlwE,12046
154
- sky/provision/paperspace/utils.py,sha256=Bl3POslZjtZU_wbBIXid7ubhRy2j5kpsesR85q7MN5w,9428
154
+ sky/provision/paperspace/utils.py,sha256=uOmxbDKjV6skFizC4gYXSxRuEqso5ck2kF7MbtNmhEs,9580
155
155
  sky/provision/runpod/__init__.py,sha256=6HYvHI27EaLrX1SS0vWVhdLu5HDBeZCdvAeDJuwM5pk,556
156
156
  sky/provision/runpod/config.py,sha256=9ulZJVL7nHuxhTdoj8D7lNn7SdicJ5zc6FIcHIG9tcg,321
157
157
  sky/provision/runpod/instance.py,sha256=ucmFQEzapbxylsl6K9EUo7bHTZYzvfECo6tpJc-MFrw,9577
@@ -171,16 +171,16 @@ sky/provision/vsphere/common/service_manager_factory.py,sha256=YkvfHiRXFK_Nb406z
171
171
  sky/provision/vsphere/common/ssl_helper.py,sha256=TYzN9K0i_Mk_17PKGyGPgvOGfoizysuuIeYapcy_tWE,795
172
172
  sky/provision/vsphere/common/vapiconnect.py,sha256=R2I1ZWBA19d11fZ_FrIzQT8E1aLl1HU4Rdcj8Z5r3NE,2932
173
173
  sky/provision/vsphere/common/vim_utils.py,sha256=EMWLS8ILpdx6XwUZ9I53y0B_1yFrRrlr4jjIMT84hAc,17877
174
- sky/serve/__init__.py,sha256=Qg_XPOtQsUxiN-Q3njHZRfzoMcQ_KKU1QthkiTbESDw,1661
174
+ sky/serve/__init__.py,sha256=gFZt7W3UPMi4qvYe2xgkHg1VxbR1WGavKyWLBUD3mpg,1731
175
175
  sky/serve/autoscalers.py,sha256=khY1oZ22PRaUQNsLCoNKH178X_NiJw0LSLOKr7_LNgY,30275
176
- sky/serve/constants.py,sha256=OansIC7a0Pwat-Y5SF43T9phad_EvyjKO3peZgKFEHk,4367
177
- sky/serve/controller.py,sha256=gfE_gB7wxE1VxvnYqw_-KcMGc6X2kufl-NLR7sWdzdY,8172
178
- sky/serve/core.py,sha256=yebcgmafGwKppXA1vyJdnbWdOg5BSlh87pKL9gkzHPE,29066
176
+ sky/serve/constants.py,sha256=7MflfgTHO9gDSux93U4BmNeEMWXxZB4q7I54KUwgp-s,4651
177
+ sky/serve/controller.py,sha256=R5iIEGEEFtbm_6MvSGelYZP-vSmW0cSFuy64OexUc4g,11719
178
+ sky/serve/core.py,sha256=pz62ERWyHcg2p-rtzVjBZaWmKrK6Hx213YPoa_J5Tlo,31097
179
179
  sky/serve/load_balancer.py,sha256=aUfDsgUT_fYrchCwJCeunMPXmAkwJAY58BEu-IN2FaA,11571
180
180
  sky/serve/load_balancing_policies.py,sha256=ExdwH_pxPYpJ6CkoTQCOPSa4lzwbq1LFFMKzmIu8ryk,2331
181
- sky/serve/replica_managers.py,sha256=dO962WZ_6YWRDpyNemY7SzC7fZHlNfoL4kUS3MaKwDo,57405
181
+ sky/serve/replica_managers.py,sha256=1xYDK9Te5wFEF5hUK0gyNIUib0MY-HScLHUBDlTSl-k,57774
182
182
  sky/serve/serve_state.py,sha256=5BZSKKKxQRk-0mku17Ch4Veu4qOhaFvaOJY3zrZCkLw,19315
183
- sky/serve/serve_utils.py,sha256=im_1cJoJmufFxkBVnhK4nI6XlHvEXersQyIivNruJJc,38009
183
+ sky/serve/serve_utils.py,sha256=9tqh7i-99Kll-24sKhfjEzjTOnGXWJQdeqIyNkFVoMo,40180
184
184
  sky/serve/service.py,sha256=fkfJvNJ2BO6rfV0TblZG-QkOXaCyZlpkwbGgrsTzf2w,11872
185
185
  sky/serve/service_spec.py,sha256=iRhW95SERvb4NWtV10uCuhgvW31HuSAmZZ55OX0WK8s,15309
186
186
  sky/setup_files/MANIFEST.in,sha256=CXz8lIJMgWlH9TvYgzIL3vPFtSDoQq-UMfD9K62rtH4,590
@@ -230,9 +230,9 @@ sky/templates/kubernetes-loadbalancer.yml.j2,sha256=IxrNYM366N01bbkJEbZ_UPYxUP8w
230
230
  sky/templates/kubernetes-port-forward-proxy-command.sh,sha256=HlG7CPBBedCVBlL9qv0erW_eKm6Irj0LFyaAWuJW_lc,3148
231
231
  sky/templates/kubernetes-ray.yml.j2,sha256=Wq9luXc6-t141uyHbtOy1IDmLMM0PBbePTZfZEtAKw0,18160
232
232
  sky/templates/kubernetes-ssh-jump.yml.j2,sha256=k5W5sOIMppU7dDkJMwPlqsUcb92y7L5_TVG3hkgMy8M,2747
233
- sky/templates/lambda-ray.yml.j2,sha256=oMbrfv3zHoD1v1XXMLCLK1vB7wLBU1Z_jNpC4-5lGVo,3985
233
+ sky/templates/lambda-ray.yml.j2,sha256=HyvO_tX2vxwSsc4IFVSqGuIbjLMk0bevP9bcxb8ZQII,4498
234
234
  sky/templates/local-ray.yml.j2,sha256=FNHeyHF6nW9nU9QLIZceUWfvrFTTcO51KqhTnYCEFaA,1185
235
- sky/templates/oci-ray.yml.j2,sha256=5XfIobW9XuspIpEhI4vFIEcJEFCdtFJqEGfX03zL6DE,7032
235
+ sky/templates/oci-ray.yml.j2,sha256=E-xnadts-x88vYRI1QGFzgfGGKFospmo2N9d_0cPN5I,7144
236
236
  sky/templates/paperspace-ray.yml.j2,sha256=HQjZNamrB_a4fOMCxQXSVdV5JIHtbGtAE0JzEO8uuVQ,4021
237
237
  sky/templates/runpod-ray.yml.j2,sha256=p3BtYBHzROtNJqnjEo1xCmGSJQfCZYdarWszhDYyl0Q,3697
238
238
  sky/templates/scp-ray.yml.j2,sha256=I9u8Ax-lit-d6UrCC9BVU8avst8w1cwK6TrzZBcz_JM,5608
@@ -245,7 +245,7 @@ sky/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
245
245
  sky/utils/accelerator_registry.py,sha256=BO4iYH5bV80Xyp4EPfO0n1D3LL0FvESCy7xm59Je3_o,3798
246
246
  sky/utils/admin_policy_utils.py,sha256=zFCu1OFIrZRfQNY0JFRO1502WFfdqZhwAU_QgM4fO9U,5943
247
247
  sky/utils/cluster_yaml_utils.py,sha256=1wRRYqI1kI-eFs1pMW4r_FFjHJ0zamq6v2RRI-Gtx5E,849
248
- sky/utils/command_runner.py,sha256=ZIu4aur4yxtjHu60Na9o90Iu-g48_yeWXo-NNfmzs-w,34634
248
+ sky/utils/command_runner.py,sha256=TEFJlmIGzlZxZppcBdwDK4AscM0-08L2XRFwQIRK9OA,34784
249
249
  sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
250
250
  sky/utils/common_utils.py,sha256=MwFhIcvCEMBo7kbENUjN3qRNO5SoMV0fzAORc65c5x0,24525
251
251
  sky/utils/controller_utils.py,sha256=V05hiLJIjqqXssYzs_Gchk4-tijgpMgLJsRW8ymhS-E,40625
@@ -274,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=KPqp23B-zQ2SZK03jdHeF9fLTog
274
274
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
275
275
  sky/utils/kubernetes/rsync_helper.sh,sha256=aRMa_0JRHtXFOPtEg4rFAwR1t57wvvAoGZhn3H3BtGk,1059
276
276
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
277
- skypilot_nightly-1.0.0.dev20241019.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
278
- skypilot_nightly-1.0.0.dev20241019.dist-info/METADATA,sha256=AE2fCPLtATmQ_7yujx2cd9zAwVowsMapi0bTc7-Gk6A,19540
279
- skypilot_nightly-1.0.0.dev20241019.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
280
- skypilot_nightly-1.0.0.dev20241019.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
281
- skypilot_nightly-1.0.0.dev20241019.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
282
- skypilot_nightly-1.0.0.dev20241019.dist-info/RECORD,,
277
+ skypilot_nightly-1.0.0.dev20241021.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
278
+ skypilot_nightly-1.0.0.dev20241021.dist-info/METADATA,sha256=mv3zgaXoDB_-9jX9Sk5tEP5oiCfv5xZ76gMbFothu4g,19540
279
+ skypilot_nightly-1.0.0.dev20241021.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
280
+ skypilot_nightly-1.0.0.dev20241021.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
281
+ skypilot_nightly-1.0.0.dev20241021.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
282
+ skypilot_nightly-1.0.0.dev20241021.dist-info/RECORD,,