skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/common.py +15 -9
  3. sky/adaptors/do.py +20 -0
  4. sky/adaptors/oci.py +32 -1
  5. sky/authentication.py +20 -8
  6. sky/backends/backend_utils.py +44 -0
  7. sky/backends/cloud_vm_ray_backend.py +202 -41
  8. sky/backends/wheel_utils.py +4 -1
  9. sky/check.py +31 -1
  10. sky/cli.py +39 -43
  11. sky/cloud_stores.py +71 -2
  12. sky/clouds/__init__.py +2 -0
  13. sky/clouds/aws.py +137 -50
  14. sky/clouds/cloud.py +4 -0
  15. sky/clouds/do.py +303 -0
  16. sky/clouds/gcp.py +9 -0
  17. sky/clouds/kubernetes.py +3 -3
  18. sky/clouds/oci.py +20 -9
  19. sky/clouds/service_catalog/__init__.py +7 -3
  20. sky/clouds/service_catalog/constants.py +1 -1
  21. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
  22. sky/clouds/service_catalog/do_catalog.py +111 -0
  23. sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
  24. sky/clouds/utils/oci_utils.py +15 -2
  25. sky/core.py +8 -5
  26. sky/data/data_transfer.py +37 -0
  27. sky/data/data_utils.py +19 -4
  28. sky/data/mounting_utils.py +99 -15
  29. sky/data/storage.py +961 -130
  30. sky/global_user_state.py +1 -1
  31. sky/jobs/__init__.py +2 -0
  32. sky/jobs/constants.py +8 -7
  33. sky/jobs/controller.py +19 -22
  34. sky/jobs/core.py +46 -2
  35. sky/jobs/recovery_strategy.py +114 -143
  36. sky/jobs/scheduler.py +283 -0
  37. sky/jobs/state.py +290 -21
  38. sky/jobs/utils.py +346 -95
  39. sky/optimizer.py +6 -3
  40. sky/provision/aws/config.py +59 -29
  41. sky/provision/azure/instance.py +1 -1
  42. sky/provision/do/__init__.py +11 -0
  43. sky/provision/do/config.py +14 -0
  44. sky/provision/do/constants.py +10 -0
  45. sky/provision/do/instance.py +287 -0
  46. sky/provision/do/utils.py +306 -0
  47. sky/provision/docker_utils.py +22 -11
  48. sky/provision/gcp/instance_utils.py +15 -9
  49. sky/provision/kubernetes/instance.py +3 -2
  50. sky/provision/kubernetes/utils.py +125 -20
  51. sky/provision/oci/query_utils.py +17 -14
  52. sky/provision/provisioner.py +0 -1
  53. sky/provision/runpod/instance.py +10 -1
  54. sky/provision/runpod/utils.py +170 -13
  55. sky/resources.py +1 -1
  56. sky/serve/autoscalers.py +359 -301
  57. sky/serve/controller.py +10 -8
  58. sky/serve/core.py +84 -7
  59. sky/serve/load_balancer.py +27 -10
  60. sky/serve/replica_managers.py +1 -3
  61. sky/serve/serve_state.py +10 -5
  62. sky/serve/serve_utils.py +28 -1
  63. sky/serve/service.py +4 -3
  64. sky/serve/service_spec.py +31 -0
  65. sky/setup_files/dependencies.py +4 -1
  66. sky/skylet/constants.py +8 -4
  67. sky/skylet/events.py +7 -3
  68. sky/skylet/job_lib.py +10 -30
  69. sky/skylet/log_lib.py +8 -8
  70. sky/skylet/log_lib.pyi +3 -0
  71. sky/skylet/providers/command_runner.py +5 -7
  72. sky/skylet/skylet.py +1 -1
  73. sky/task.py +28 -1
  74. sky/templates/do-ray.yml.j2 +98 -0
  75. sky/templates/jobs-controller.yaml.j2 +41 -7
  76. sky/templates/runpod-ray.yml.j2 +13 -0
  77. sky/templates/sky-serve-controller.yaml.j2 +4 -0
  78. sky/usage/usage_lib.py +10 -2
  79. sky/utils/accelerator_registry.py +12 -8
  80. sky/utils/controller_utils.py +114 -39
  81. sky/utils/db_utils.py +18 -4
  82. sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
  83. sky/utils/log_utils.py +2 -0
  84. sky/utils/resources_utils.py +25 -21
  85. sky/utils/schemas.py +27 -0
  86. sky/utils/subprocess_utils.py +54 -10
  87. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
  88. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
  89. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
  90. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
  91. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
  92. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/skylet/log_lib.py CHANGED
@@ -25,9 +25,9 @@ from sky.utils import log_utils
25
25
  from sky.utils import subprocess_utils
26
26
  from sky.utils import ux_utils
27
27
 
28
- _SKY_LOG_WAITING_GAP_SECONDS = 1
29
- _SKY_LOG_WAITING_MAX_RETRY = 5
30
- _SKY_LOG_TAILING_GAP_SECONDS = 0.2
28
+ SKY_LOG_WAITING_GAP_SECONDS = 1
29
+ SKY_LOG_WAITING_MAX_RETRY = 5
30
+ SKY_LOG_TAILING_GAP_SECONDS = 0.2
31
31
  # Peek the head of the lines to check if we need to start
32
32
  # streaming when tail > 0.
33
33
  PEEK_HEAD_LINES_FOR_START_STREAM = 20
@@ -336,7 +336,7 @@ def _follow_job_logs(file,
336
336
  ]:
337
337
  if wait_last_logs:
338
338
  # Wait all the logs are printed before exit.
339
- time.sleep(1 + _SKY_LOG_TAILING_GAP_SECONDS)
339
+ time.sleep(1 + SKY_LOG_TAILING_GAP_SECONDS)
340
340
  wait_last_logs = False
341
341
  continue
342
342
  status_str = status.value if status is not None else 'None'
@@ -345,7 +345,7 @@ def _follow_job_logs(file,
345
345
  f'Job finished (status: {status_str}).'))
346
346
  return
347
347
 
348
- time.sleep(_SKY_LOG_TAILING_GAP_SECONDS)
348
+ time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
349
349
  status = job_lib.get_status_no_lock(job_id)
350
350
 
351
351
 
@@ -426,15 +426,15 @@ def tail_logs(job_id: Optional[int],
426
426
  retry_cnt += 1
427
427
  if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
428
428
  break
429
- if retry_cnt >= _SKY_LOG_WAITING_MAX_RETRY:
429
+ if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
430
430
  print(
431
431
  f'{colorama.Fore.RED}ERROR: Logs for '
432
432
  f'{job_str} (status: {status.value}) does not exist '
433
433
  f'after retrying {retry_cnt} times.{colorama.Style.RESET_ALL}')
434
434
  return
435
- print(f'INFO: Waiting {_SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
435
+ print(f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
436
436
  'to be written...')
437
- time.sleep(_SKY_LOG_WAITING_GAP_SECONDS)
437
+ time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
438
438
  status = job_lib.update_job_status([job_id], silent=True)[0]
439
439
 
440
440
  start_stream_at = LOG_FILE_START_STREAMING_AT
sky/skylet/log_lib.pyi CHANGED
@@ -13,6 +13,9 @@ from sky.skylet import constants as constants
13
13
  from sky.skylet import job_lib as job_lib
14
14
  from sky.utils import log_utils as log_utils
15
15
 
16
+ SKY_LOG_WAITING_GAP_SECONDS: int = ...
17
+ SKY_LOG_WAITING_MAX_RETRY: int = ...
18
+ SKY_LOG_TAILING_GAP_SECONDS: float = ...
16
19
  LOG_FILE_START_STREAMING_AT: str = ...
17
20
 
18
21
 
@@ -25,7 +25,7 @@ def docker_start_cmds(
25
25
  docker_cmd,
26
26
  ):
27
27
  """Generating docker start command without --rm.
28
-
28
+
29
29
  The code is borrowed from `ray.autoscaler._private.docker`.
30
30
 
31
31
  Changes we made:
@@ -159,19 +159,17 @@ class SkyDockerCommandRunner(DockerCommandRunner):
159
159
  return True
160
160
 
161
161
  # SkyPilot: Docker login if user specified a private docker registry.
162
- if "docker_login_config" in self.docker_config:
162
+ if 'docker_login_config' in self.docker_config:
163
163
  # TODO(tian): Maybe support a command to get the login password?
164
- docker_login_config: docker_utils.DockerLoginConfig = self.docker_config[
165
- "docker_login_config"]
164
+ docker_login_config: docker_utils.DockerLoginConfig = (
165
+ self.docker_config['docker_login_config'])
166
166
  self._run_with_retry(
167
167
  f'{self.docker_cmd} login --username '
168
168
  f'{docker_login_config.username} --password '
169
169
  f'{docker_login_config.password} {docker_login_config.server}')
170
170
  # We automatically add the server prefix to the image name if
171
171
  # the user did not add it.
172
- server_prefix = f'{docker_login_config.server}/'
173
- if not specific_image.startswith(server_prefix):
174
- specific_image = f'{server_prefix}{specific_image}'
172
+ specific_image = docker_login_config.format_image(specific_image)
175
173
 
176
174
  if self.docker_config.get('pull_before_run', True):
177
175
  assert specific_image, ('Image must be included in config if '
sky/skylet/skylet.py CHANGED
@@ -20,7 +20,7 @@ EVENTS = [
20
20
  # The managed job update event should be after the job update event.
21
21
  # Otherwise, the abnormal managed job status update will be delayed
22
22
  # until the next job update event.
23
- events.ManagedJobUpdateEvent(),
23
+ events.ManagedJobEvent(),
24
24
  # This is for monitoring controller job status. If it becomes
25
25
  # unhealthy, this event will correctly update the controller
26
26
  # status to CONTROLLER_FAILED.
sky/task.py CHANGED
@@ -948,12 +948,22 @@ class Task:
948
948
  store_type = storage_lib.StoreType.from_cloud(storage_cloud_str)
949
949
  return store_type, storage_region
950
950
 
951
- def sync_storage_mounts(self) -> None:
951
+ def sync_storage_mounts(self, force_sync: bool = False) -> None:
952
952
  """(INTERNAL) Eagerly syncs storage mounts to cloud storage.
953
953
 
954
954
  After syncing up, COPY-mode storage mounts are translated into regular
955
955
  file_mounts of the form ``{ /remote/path: {s3,gs,..}://<bucket path>
956
956
  }``.
957
+
958
+ Args:
959
+ force_sync: If True, forces the synchronization of storage mounts.
960
+ If the store object is added via storage.add_store(),
961
+ the sync will happen automatically via add_store.
962
+ However, if it is passed via the construction function
963
+ of storage, it is usually because the user passed an
964
+ intermediate bucket name in the config and we need to
965
+ construct from the user config. In this case, set
966
+ force_sync to True.
957
967
  """
958
968
  for storage in self.storage_mounts.values():
959
969
  if not storage.stores:
@@ -961,6 +971,8 @@ class Task:
961
971
  self.storage_plans[storage] = store_type
962
972
  storage.add_store(store_type, store_region)
963
973
  else:
974
+ if force_sync:
975
+ storage.sync_all_stores()
964
976
  # We will download the first store that is added to remote.
965
977
  self.storage_plans[storage] = list(storage.stores.keys())[0]
966
978
 
@@ -977,6 +989,7 @@ class Task:
977
989
  else:
978
990
  assert storage.name is not None, storage
979
991
  blob_path = 's3://' + storage.name
992
+ blob_path = storage.get_bucket_sub_path_prefix(blob_path)
980
993
  self.update_file_mounts({
981
994
  mnt_path: blob_path,
982
995
  })
@@ -987,6 +1000,7 @@ class Task:
987
1000
  else:
988
1001
  assert storage.name is not None, storage
989
1002
  blob_path = 'gs://' + storage.name
1003
+ blob_path = storage.get_bucket_sub_path_prefix(blob_path)
990
1004
  self.update_file_mounts({
991
1005
  mnt_path: blob_path,
992
1006
  })
@@ -1005,6 +1019,7 @@ class Task:
1005
1019
  blob_path = data_utils.AZURE_CONTAINER_URL.format(
1006
1020
  storage_account_name=storage_account_name,
1007
1021
  container_name=storage.name)
1022
+ blob_path = storage.get_bucket_sub_path_prefix(blob_path)
1008
1023
  self.update_file_mounts({
1009
1024
  mnt_path: blob_path,
1010
1025
  })
@@ -1015,6 +1030,7 @@ class Task:
1015
1030
  blob_path = storage.source
1016
1031
  else:
1017
1032
  blob_path = 'r2://' + storage.name
1033
+ blob_path = storage.get_bucket_sub_path_prefix(blob_path)
1018
1034
  self.update_file_mounts({
1019
1035
  mnt_path: blob_path,
1020
1036
  })
@@ -1030,7 +1046,18 @@ class Task:
1030
1046
  cos_region = data_utils.Rclone.get_region_from_rclone(
1031
1047
  storage.name, data_utils.Rclone.RcloneClouds.IBM)
1032
1048
  blob_path = f'cos://{cos_region}/{storage.name}'
1049
+ blob_path = storage.get_bucket_sub_path_prefix(blob_path)
1033
1050
  self.update_file_mounts({mnt_path: blob_path})
1051
+ elif store_type is storage_lib.StoreType.OCI:
1052
+ if storage.source is not None and not isinstance(
1053
+ storage.source,
1054
+ list) and storage.source.startswith('oci://'):
1055
+ blob_path = storage.source
1056
+ else:
1057
+ blob_path = 'oci://' + storage.name
1058
+ self.update_file_mounts({
1059
+ mnt_path: blob_path,
1060
+ })
1034
1061
  else:
1035
1062
  with ux_utils.print_exception_no_traceback():
1036
1063
  raise ValueError(f'Storage Type {store_type} '
@@ -0,0 +1,98 @@
1
+ cluster_name: {{cluster_name_on_cloud}}
2
+
3
+ # The maximum number of workers nodes to launch in addition to the head node.
4
+ max_workers: {{num_nodes - 1}}
5
+ upscaling_speed: {{num_nodes - 1}}
6
+ idle_timeout_minutes: 60
7
+
8
+ {%- if docker_image is not none %}
9
+ docker:
10
+ image: {{docker_image}}
11
+ container_name: {{docker_container_name}}
12
+ run_options:
13
+ - --ulimit nofile=1048576:1048576
14
+ {%- for run_option in docker_run_options %}
15
+ - {{run_option}}
16
+ {%- endfor %}
17
+ {%- if docker_login_config is not none %}
18
+ docker_login_config:
19
+ username: |-
20
+ {{docker_login_config.username}}
21
+ password: |-
22
+ {{docker_login_config.password}}
23
+ server: |-
24
+ {{docker_login_config.server}}
25
+ {%- endif %}
26
+ {%- endif %}
27
+
28
+ provider:
29
+ type: external
30
+ module: sky.provision.do
31
+ region: "{{region}}"
32
+
33
+ auth:
34
+ ssh_user: root
35
+ ssh_private_key: {{ssh_private_key}}
36
+ ssh_public_key: |-
37
+ skypilot:ssh_public_key_content
38
+
39
+ available_node_types:
40
+ ray_head_default:
41
+ resources: {}
42
+ node_config:
43
+ InstanceType: {{instance_type}}
44
+ DiskSize: {{disk_size}}
45
+ {%- if image_id is not none %}
46
+ ImageId: {{image_id}}
47
+ {%- endif %}
48
+
49
+ head_node_type: ray_head_default
50
+
51
+ # Format: `REMOTE_PATH : LOCAL_PATH`
52
+ file_mounts: {
53
+ "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
54
+ "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
55
+ {%- for remote_path, local_path in credentials.items() %}
56
+ "{{remote_path}}": "{{local_path}}",
57
+ {%- endfor %}
58
+ }
59
+
60
+ rsync_exclude: []
61
+
62
+ initialization_commands: []
63
+
64
+ # List of shell commands to run to set up nodes.
65
+ # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
66
+ # connection, which is expensive. Try your best to co-locate commands into fewer
67
+ # items!
68
+ #
69
+ # Increment the following for catching performance bugs easier:
70
+ # current num items (num SSH connections): 1
71
+ setup_commands:
72
+ # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
73
+ # Create ~/.ssh/config file in case the file does not exist in the image.
74
+ # Line 'rm ..': there is another installation of pip.
75
+ # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
76
+ # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
77
+ # Line 'mkdir -p ..': disable host key check
78
+ # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
79
+ - {%- for initial_setup_command in initial_setup_commands %}
80
+ {{ initial_setup_command }}
81
+ {%- endfor %}
82
+ sudo systemctl stop unattended-upgrades || true;
83
+ sudo systemctl disable unattended-upgrades || true;
84
+ sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
85
+ sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
86
+ sudo pkill -9 apt-get;
87
+ sudo pkill -9 dpkg;
88
+ sudo dpkg --configure -a;
89
+ mkdir -p ~/.ssh; touch ~/.ssh/config;
90
+ {{ conda_installation_commands }}
91
+ {{ ray_skypilot_installation_commands }}
92
+ sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
93
+ sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
94
+ mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
95
+ [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
96
+
97
+ # Command to start ray clusters are now placed in `sky.provision.instance_setup`.
98
+ # We do not need to list it here anymore.
@@ -26,16 +26,50 @@ setup: |
26
26
  echo 'export SKYPILOT_DEV=1' >> ~/.bashrc
27
27
  {% endif %}
28
28
 
29
- # Dashboard.
30
- ps aux | grep -v nohup | grep -v grep | grep -- "-m sky.spot.dashboard" | awk '{print $2}' | xargs kill > /dev/null 2>&1 || true
31
- pip list | grep flask > /dev/null 2>&1 || pip install flask 2>&1 > /dev/null
32
- ((ps aux | grep -v nohup | grep -v grep | grep -q -- "-m sky.jobs.dashboard.dashboard") || (nohup {{ sky_python_cmd }} -m sky.jobs.dashboard.dashboard >> ~/.sky/job-dashboard.log 2>&1 &));
29
+ # Create systemd service file
30
+ mkdir -p ~/.config/systemd/user/
31
+
32
+ # Create systemd user service file
33
+ cat << EOF > ~/.config/systemd/user/skypilot-dashboard.service
34
+ [Unit]
35
+ Description=SkyPilot Jobs Dashboard
36
+ After=network.target
37
+
38
+ [Service]
39
+ Environment="PATH={{ sky_python_env_path }}:\$PATH"
40
+ Environment="SKYPILOT_USER_ID={{controller_envs.SKYPILOT_USER_ID}}"
41
+ Environment="SKYPILOT_USER={{controller_envs.SKYPILOT_USER}}"
42
+ Restart=always
43
+ StandardOutput=append:/home/$USER/.sky/job-dashboard.log
44
+ StandardError=append:/home/$USER/.sky/job-dashboard.log
45
+ ExecStart={{ sky_python_cmd }} -m sky.jobs.dashboard.dashboard
46
+
47
+ [Install]
48
+ WantedBy=default.target
49
+ EOF
50
+
51
+ if command -v systemctl &>/dev/null && systemctl --user show &>/dev/null; then
52
+ systemctl --user daemon-reload
53
+ systemctl --user enable --now skypilot-dashboard
54
+ else
55
+ echo "Systemd user services not found. Setting up SkyPilot dashboard manually."
56
+ # Kill any old dashboard processes
57
+ ps aux | grep -v nohup | grep -v grep | grep -- '-m sky.jobs.dashboard.dashboard' \
58
+ | awk '{print $2}' | xargs kill > /dev/null 2>&1 || true
59
+ # Launch the dashboard in the background if not already running
60
+ (ps aux | grep -v nohup | grep -v grep | grep -q -- '-m sky.jobs.dashboard.dashboard') || \
61
+ (nohup {{ sky_python_cmd }} -m sky.jobs.dashboard.dashboard >> ~/.sky/job-dashboard.log 2>&1 &)
62
+ fi
33
63
 
34
64
  run: |
35
65
  {{ sky_activate_python_env }}
36
- # Start the controller for the current managed job.
37
- python -u -m sky.jobs.controller {{remote_user_yaml_path}} \
38
- --job-id $SKYPILOT_INTERNAL_JOB_ID {% if retry_until_up %}--retry-until-up{% endif %}
66
+ # Submit the job to the scheduler.
67
+ # Note: The job is already in the `spot` table, marked as PENDING.
68
+ # CloudVmRayBackend._exec_code_on_head() calls
69
+ # managed_job_codegen.set_pending() before we get here.
70
+ python -u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
71
+ --job-id $SKYPILOT_INTERNAL_JOB_ID
72
+
39
73
 
40
74
  envs:
41
75
  {%- for env_name, env_value in controller_envs.items() %}
@@ -10,6 +10,19 @@ provider:
10
10
  module: sky.provision.runpod
11
11
  region: "{{region}}"
12
12
  disable_launch_config_check: true
13
+ # For RunPod, we directly set the image id for the docker as runtime environment
14
+ # support, thus we need to avoid the DockerInitializer detects the docker field
15
+ # and performs the initialization. Therefore we put the docker login config in
16
+ # the provider config here.
17
+ {%- if docker_login_config is not none %}
18
+ docker_login_config:
19
+ username: |-
20
+ {{docker_login_config.username}}
21
+ password: |-
22
+ {{docker_login_config.password}}
23
+ server: |-
24
+ {{docker_login_config.server}}
25
+ {%- endif %}
13
26
 
14
27
  auth:
15
28
  ssh_user: root
@@ -29,6 +29,10 @@ file_mounts:
29
29
  {%- for remote_catalog_path, local_catalog_path in modified_catalogs.items() %}
30
30
  {{remote_catalog_path}}: {{local_catalog_path}}
31
31
  {%- endfor %}
32
+ {%- if use_tls %}
33
+ {{remote_tls_keyfile}}: {{local_tls_keyfile}}
34
+ {{remote_tls_certfile}}: {{local_tls_certfile}}
35
+ {%- endif %}
32
36
 
33
37
  run: |
34
38
  # Activate the Python environment, so that cloud SDKs can be found in the
sky/usage/usage_lib.py CHANGED
@@ -3,7 +3,6 @@
3
3
  import contextlib
4
4
  import datetime
5
5
  import enum
6
- import inspect
7
6
  import json
8
7
  import os
9
8
  import time
@@ -12,19 +11,28 @@ import typing
12
11
  from typing import Any, Callable, Dict, List, Optional, Union
13
12
 
14
13
  import click
15
- import requests
16
14
 
17
15
  import sky
18
16
  from sky import sky_logging
17
+ from sky.adaptors import common as adaptors_common
19
18
  from sky.usage import constants
20
19
  from sky.utils import common_utils
21
20
  from sky.utils import env_options
22
21
  from sky.utils import ux_utils
23
22
 
24
23
  if typing.TYPE_CHECKING:
24
+ import inspect
25
+
26
+ import requests
27
+
25
28
  from sky import resources as resources_lib
26
29
  from sky import status_lib
27
30
  from sky import task as task_lib
31
+ else:
32
+ # requests and inspect cost ~100ms to load, which can be postponed to
33
+ # collection phase or skipped if user specifies no collection
34
+ requests = adaptors_common.LazyImport('requests')
35
+ inspect = adaptors_common.LazyImport('inspect')
28
36
 
29
37
  logger = sky_logging.init_logger(__name__)
30
38
 
@@ -3,6 +3,7 @@ import typing
3
3
  from typing import Optional
4
4
 
5
5
  from sky.clouds import service_catalog
6
+ from sky.utils import rich_utils
6
7
  from sky.utils import ux_utils
7
8
 
8
9
  if typing.TYPE_CHECKING:
@@ -88,14 +89,17 @@ def canonicalize_accelerator_name(accelerator: str,
88
89
  if accelerator.lower() in mapping:
89
90
  return mapping[accelerator.lower()]
90
91
 
91
- # _ACCELERATORS may not be comprehensive.
92
- # Users may manually add new accelerators to the catalogs, or download new
93
- # catalogs (that have new accelerators) without upgrading SkyPilot.
94
- # To cover such cases, we should search the accelerator name
95
- # in the service catalog.
96
- searched = service_catalog.list_accelerators(name_filter=accelerator,
97
- case_sensitive=False,
98
- clouds=cloud_str)
92
+ # Listing accelerators can be time-consuming since canonicalizing usually
93
+ # involves catalog reading with cache not warmed up.
94
+ with rich_utils.safe_status('Listing accelerators...'):
95
+ # _ACCELERATORS may not be comprehensive.
96
+ # Users may manually add new accelerators to the catalogs, or download
97
+ # new catalogs (that have new accelerators) without upgrading SkyPilot.
98
+ # To cover such cases, we should search the accelerator name
99
+ # in the service catalog.
100
+ searched = service_catalog.list_accelerators(name_filter=accelerator,
101
+ case_sensitive=False,
102
+ clouds=cloud_str)
99
103
  names = list(searched.keys())
100
104
 
101
105
  # Exact match.