skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/skylet/constants.py CHANGED
@@ -1,10 +1,15 @@
1
1
  """Constants for SkyPilot."""
2
+ from typing import List, Tuple
3
+
2
4
  from packaging import version
3
5
 
4
6
  import sky
7
+ from sky.setup_files import dependencies
5
8
 
6
9
  SKY_LOGS_DIRECTORY = '~/sky_logs'
7
10
  SKY_REMOTE_WORKDIR = '~/sky_workdir'
11
+ SKY_IGNORE_FILE = '.skyignore'
12
+ GIT_IGNORE_FILE = '.gitignore'
8
13
 
9
14
  # Default Ray port is 6379. Default Ray dashboard port is 8265.
10
15
  # Default Ray tempdir is /tmp/ray.
@@ -35,32 +40,47 @@ SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
35
40
  'which python3')
36
41
  # Python executable, e.g., /opt/conda/bin/python3
37
42
  SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
43
+ # Prefer SKY_UV_PIP_CMD, which is faster.
44
+ # TODO(cooperc): remove remaining usage (GCP TPU setup).
38
45
  SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
39
46
  # Ray executable, e.g., /opt/conda/bin/ray
40
47
  # We need to add SKY_PYTHON_CMD before ray executable because:
41
48
  # The ray executable is a python script with a header like:
42
49
  # #!/opt/conda/bin/python3
43
- # When we create the skypilot-runtime venv, the previously installed ray
44
- # executable will be reused (due to --system-site-packages), and that will cause
45
- # running ray CLI commands to use the wrong python executable.
46
50
  SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
47
51
  f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')
48
52
  # Separate env for SkyPilot runtime dependencies.
49
53
  SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
50
54
  SKY_REMOTE_PYTHON_ENV = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
51
55
  ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
56
+ # uv is used for venv and pip, much faster than python implementations.
57
+ SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
58
+ SKY_UV_CMD = f'UV_SYSTEM_PYTHON=false {SKY_UV_INSTALL_DIR}/uv'
59
+ # This won't reinstall uv if it's already installed, so it's safe to re-run.
60
+ SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
61
+ 'curl -LsSf https://astral.sh/uv/install.sh '
62
+ f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
63
+ SKY_UV_PIP_CMD = f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip'
64
+ # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH to deactivate the
65
+ # environment. `deactivate` command does not work when conda is used.
66
+ DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
67
+ 'export PATH='
68
+ f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||")')
69
+
70
+ # Prefix for SkyPilot environment variables
71
+ SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
52
72
 
53
73
  # The name for the environment variable that stores the unique ID of the
54
74
  # current task. This will stay the same across multiple recoveries of the
55
75
  # same managed task.
56
- TASK_ID_ENV_VAR = 'SKYPILOT_TASK_ID'
76
+ TASK_ID_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_ID'
57
77
  # This environment variable stores a '\n'-separated list of task IDs that
58
78
  # are within the same managed job (DAG). This can be used by the user to
59
79
  # retrieve the task IDs of any tasks that are within the same managed job.
60
80
  # This environment variable is pre-assigned before any task starts
61
81
  # running within the same job, and will remain constant throughout the
62
82
  # lifetime of the job.
63
- TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS'
83
+ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
64
84
 
65
85
  # The version of skylet. MUST bump this version whenever we need the skylet to
66
86
  # be restarted on existing clusters updated with the new version of SkyPilot,
@@ -69,11 +89,11 @@ TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS'
69
89
  # cluster yaml is updated.
70
90
  #
71
91
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
72
- SKYLET_VERSION = '8'
92
+ SKYLET_VERSION = '12'
73
93
  # The version of the lib files that skylet/jobs use. Whenever there is an API
74
94
  # change for the job_lib or log_lib, we need to bump this version, so that the
75
95
  # user can be notified to update their SkyPilot version on the remote cluster.
76
- SKYLET_LIB_VERSION = 1
96
+ SKYLET_LIB_VERSION = 2
77
97
  SKYLET_VERSION_FILE = '~/.sky/skylet_version'
78
98
 
79
99
  # `sky jobs dashboard`-related
@@ -84,15 +104,37 @@ SPOT_DASHBOARD_REMOTE_PORT = 5000
84
104
  # Docker default options
85
105
  DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
86
106
  DEFAULT_DOCKER_PORT = 10022
87
- DOCKER_USERNAME_ENV_VAR = 'SKYPILOT_DOCKER_USERNAME'
88
- DOCKER_PASSWORD_ENV_VAR = 'SKYPILOT_DOCKER_PASSWORD'
89
- DOCKER_SERVER_ENV_VAR = 'SKYPILOT_DOCKER_SERVER'
107
+ DOCKER_USERNAME_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}DOCKER_USERNAME'
108
+ DOCKER_PASSWORD_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}DOCKER_PASSWORD'
109
+ DOCKER_SERVER_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}DOCKER_SERVER'
90
110
  DOCKER_LOGIN_ENV_VARS = {
91
111
  DOCKER_USERNAME_ENV_VAR,
92
112
  DOCKER_PASSWORD_ENV_VAR,
93
113
  DOCKER_SERVER_ENV_VAR,
94
114
  }
95
115
 
116
+ RUNPOD_DOCKER_USERNAME_ENV_VAR = 'SKYPILOT_RUNPOD_DOCKER_USERNAME'
117
+
118
+ # Commands for disabling GPU ECC, which can improve the performance of the GPU
119
+ # for some workloads by 30%. This will only be applied when a user specifies
120
+ # `nvidia_gpus.disable_ecc: true` in ~/.sky/config.yaml.
121
+ # Running this command will reboot the machine, introducing overhead for
122
+ # provisioning the machine.
123
+ # https://portal.nutanix.com/page/documents/kbs/details?targetId=kA00e000000LKjOCAW
124
+ DISABLE_GPU_ECC_COMMAND = (
125
+ # Check if the GPU ECC is enabled. We use `sudo which` to check nvidia-smi
126
+ # because in some environments, nvidia-smi is not in path for sudo and we
127
+ # should skip disabling ECC in this case.
128
+ 'sudo which nvidia-smi && echo "Checking Nvidia ECC Mode" && '
129
+ 'out=$(nvidia-smi -q | grep "ECC Mode" -A2) && '
130
+ 'echo "$out" && echo "$out" | grep Current | grep Enabled && '
131
+ 'echo "Disabling Nvidia ECC" && '
132
+ # Disable the GPU ECC.
133
+ 'sudo nvidia-smi -e 0 && '
134
+ # Reboot the machine to apply the changes.
135
+ '{ sudo reboot || echo "Failed to reboot. ECC mode may not be disabled"; } '
136
+ '|| true; ')
137
+
96
138
  # Install conda on the remote cluster if it is not already installed.
97
139
  # We use conda with python 3.10 to be consistent across multiple clouds with
98
140
  # best effort.
@@ -101,40 +143,51 @@ DOCKER_LOGIN_ENV_VARS = {
101
143
  # AWS's Deep Learning AMI's default conda environment.
102
144
  CONDA_INSTALLATION_COMMANDS = (
103
145
  'which conda > /dev/null 2>&1 || '
104
- '{ curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long
105
- 'bash Miniconda3-Linux-x86_64.sh -b && '
146
+ '{ '
147
+ 'curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long
148
+ # We do not use && for installation of conda and the following init commands
149
+ # because for some images, conda is already installed, but not initialized.
150
+ # In this case, we need to initialize conda and set auto_activate_base to
151
+ # true.
152
+ '{ bash Miniconda3-Linux-x86_64.sh -b; '
106
153
  'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
107
- 'conda config --set auto_activate_base true && '
108
- f'conda activate base; }}; '
154
+ # Caller should replace {conda_auto_activate} with either true or false.
155
+ 'conda config --set auto_activate_base {conda_auto_activate} && '
156
+ 'conda activate base; }; '
157
+ '}; '
109
158
  'grep "# >>> conda initialize >>>" ~/.bashrc || '
110
159
  '{ conda init && source ~/.bashrc; };'
111
- # If Python version is larger then equal to 3.12, create a new conda env
112
- # with Python 3.10.
113
- # We don't use a separate conda env for SkyPilot dependencies because it is
114
- # costly to create a new conda env, and venv should be a lightweight and
115
- # faster alternative when the python version satisfies the requirement.
116
- '[[ $(python3 --version | cut -d " " -f 2 | cut -d "." -f 2) -ge 12 ]] && '
117
- f'echo "Creating conda env with Python 3.10" && '
118
- f'conda create -y -n {SKY_REMOTE_PYTHON_ENV_NAME} python=3.10 && '
119
- f'conda activate {SKY_REMOTE_PYTHON_ENV_NAME};'
160
+ # Install uv for venv management and pip installation.
161
+ f'{SKY_UV_INSTALL_CMD};'
120
162
  # Create a separate virtual environment (venv) for SkyPilot dependencies.
121
163
  f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
122
- f'{{ {SKY_PYTHON_CMD} -m venv {SKY_REMOTE_PYTHON_ENV} --system-site-packages && '
123
- f'echo "$(echo {SKY_REMOTE_PYTHON_ENV})/bin/python" > {SKY_PYTHON_PATH_FILE}; }};'
164
+ # Do NOT use --system-site-packages here, because if users upgrade any
165
+ # packages in the base env, they interfere with skypilot dependencies.
166
+ # Reference: https://github.com/skypilot-org/skypilot/issues/4097
167
+ # --seed will include pip and setuptools, which are present in venvs created
168
+ # with python -m venv.
169
+ # --python 3.10 will ensure the specific python version is downloaded
170
+ # and installed in the venv. SkyPilot requires Python<3.12, and 3.10 is
171
+ # preferred. We have to always pass in `--python` to avoid the issue when a
172
+ # user has `.python_version` file in their home directory, which will cause
173
+ # uv to use the python version specified in the `.python_version` file.
174
+ # TODO(zhwu): consider adding --python-preference only-managed to avoid
175
+ # using the system python, if a user reports such an issue.
176
+ f'{SKY_UV_CMD} venv --seed {SKY_REMOTE_PYTHON_ENV} --python 3.10;'
177
+ f'echo "$(echo {SKY_REMOTE_PYTHON_ENV})/bin/python" > {SKY_PYTHON_PATH_FILE};'
124
178
  )
125
179
 
126
180
  _sky_version = str(version.parse(sky.__version__))
127
181
  RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status'
128
- # Install ray and skypilot on the remote cluster if they are not already
129
- # installed. {var} will be replaced with the actual value in
130
- # backend_utils.write_cluster_config.
131
- RAY_SKYPILOT_INSTALLATION_COMMANDS = (
182
+ RAY_INSTALLATION_COMMANDS = (
183
+ f'{SKY_UV_INSTALL_CMD};'
132
184
  'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;'
133
- # Disable the pip version check to avoid the warning message, which makes
134
- # the output hard to read.
135
- 'export PIP_DISABLE_PIP_VERSION_CHECK=1;'
136
185
  # Print the PATH in provision.log to help debug PATH issues.
137
186
  'echo PATH=$PATH; '
187
+ # Install setuptools<=69.5.1 to avoid the issue with the latest setuptools
188
+ # causing the error:
189
+ # ImportError: cannot import name 'packaging' from 'pkg_resources'
190
+ f'{SKY_UV_PIP_CMD} install "setuptools<70"; '
138
191
  # Backward compatibility for ray upgrade (#3248): do not upgrade ray if the
139
192
  # ray cluster is already running, to avoid the ray cluster being restarted.
140
193
  #
@@ -148,10 +201,10 @@ RAY_SKYPILOT_INSTALLATION_COMMANDS = (
148
201
  # latest ray port 6380, but those existing cluster launched before #1790
149
202
  # that has ray cluster on the default port 6379 will be upgraded and
150
203
  # restarted.
151
- f'{SKY_PIP_CMD} list | grep "ray " | '
204
+ f'{SKY_UV_PIP_CMD} list | grep "ray " | '
152
205
  f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
153
206
  f'|| {RAY_STATUS} || '
154
- f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long
207
+ f'{SKY_UV_PIP_CMD} install -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long
155
208
  # In some envs, e.g. pip does not have permission to write under /opt/conda
156
209
  # ray package will be installed under ~/.local/bin. If the user's PATH does
157
210
  # not include ~/.local/bin (the pip install will have the output: `WARNING:
@@ -164,35 +217,54 @@ RAY_SKYPILOT_INSTALLATION_COMMANDS = (
164
217
  # Writes ray path to file if it does not exist or the file is empty.
165
218
  f'[ -s {SKY_RAY_PATH_FILE} ] || '
166
219
  f'{{ {ACTIVATE_SKY_REMOTE_PYTHON_ENV} && '
167
- f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; '
168
- # END ray package check and installation
169
- f'{{ {SKY_PIP_CMD} list | grep "skypilot " && '
220
+ f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
221
+
222
+ SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
223
+ f'{SKY_UV_INSTALL_CMD};'
224
+ f'{{ {SKY_UV_PIP_CMD} list | grep "skypilot " && '
170
225
  '[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || ' # pylint: disable=line-too-long
171
- f'{{ {SKY_PIP_CMD} uninstall skypilot -y; '
172
- f'{SKY_PIP_CMD} install "$(echo ~/.sky/wheels/{{sky_wheel_hash}}/'
226
+ f'{{ {SKY_UV_PIP_CMD} uninstall skypilot; '
227
+ # uv cannot install azure-cli normally, since it depends on pre-release
228
+ # packages. Manually install azure-cli with the --prerelease=allow flag
229
+ # first. This will allow skypilot to successfully install. See
230
+ # https://docs.astral.sh/uv/pip/compatibility/#pre-release-compatibility.
231
+ # We don't want to use --prerelease=allow for all packages, because it will
232
+ # cause uv to use pre-releases for some other packages that have sufficient
233
+ # stable releases.
234
+ 'if [ "{cloud}" = "azure" ]; then '
235
+ f'{SKY_UV_PIP_CMD} install --prerelease=allow "{dependencies.AZURE_CLI}";'
236
+ 'fi;'
237
+ # Install skypilot from wheel
238
+ f'{SKY_UV_PIP_CMD} install "$(echo ~/.sky/wheels/{{sky_wheel_hash}}/'
173
239
  f'skypilot-{_sky_version}*.whl)[{{cloud}}, remote]" && '
174
240
  'echo "{sky_wheel_hash}" > ~/.sky/wheels/current_sky_wheel_hash || '
175
- 'exit 1; }; '
176
- # END SkyPilot package check and installation
241
+ 'exit 1; }; ')
177
242
 
243
+ # Install ray and skypilot on the remote cluster if they are not already
244
+ # installed. {var} will be replaced with the actual value in
245
+ # backend_utils.write_cluster_config.
246
+ RAY_SKYPILOT_INSTALLATION_COMMANDS = (
247
+ f'{RAY_INSTALLATION_COMMANDS} '
248
+ f'{SKYPILOT_WHEEL_INSTALLATION_COMMANDS} '
178
249
  # Only patch ray when the ray version is the same as the expected version.
179
250
  # The ray installation above can be skipped due to the existing ray cluster
180
251
  # for backward compatibility. In this case, we should not patch the ray
181
252
  # files.
182
- f'{SKY_PIP_CMD} list | grep "ray " | grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
183
- f'&& {{ {SKY_PYTHON_CMD} -c "from sky.skylet.ray_patches import patch; patch()" '
184
- '|| exit 1; };')
253
+ f'{SKY_UV_PIP_CMD} list | grep "ray " | '
254
+ f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null && '
255
+ f'{{ {SKY_PYTHON_CMD} -c '
256
+ '"from sky.skylet.ray_patches import patch; patch()" || exit 1; }; ')
185
257
 
186
258
  # The name for the environment variable that stores SkyPilot user hash, which
187
259
  # is mainly used to make sure sky commands runs on a VM launched by SkyPilot
188
260
  # will be recognized as the same user (e.g., jobs controller or sky serve
189
261
  # controller).
190
- USER_ID_ENV_VAR = 'SKYPILOT_USER_ID'
262
+ USER_ID_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}USER_ID'
191
263
 
192
264
  # The name for the environment variable that stores SkyPilot user name.
193
265
  # Similar to USER_ID_ENV_VAR, this is mainly used to make sure sky commands
194
266
  # runs on a VM launched by SkyPilot will be recognized as the same user.
195
- USER_ENV_VAR = 'SKYPILOT_USER'
267
+ USER_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}USER'
196
268
 
197
269
  # In most clouds, cluster names can only contain lowercase letters, numbers
198
270
  # and hyphens. We use this regex to validate the cluster name.
@@ -201,13 +273,25 @@ CLUSTER_NAME_VALID_REGEX = '[a-zA-Z]([-_.a-zA-Z0-9]*[a-zA-Z0-9])?'
201
273
  # Used for translate local file mounts to cloud storage. Please refer to
202
274
  # sky/execution.py::_maybe_translate_local_file_mounts_and_sync_up for
203
275
  # more details.
204
- WORKDIR_BUCKET_NAME = 'skypilot-workdir-{username}-{id}'
205
- FILE_MOUNTS_BUCKET_NAME = 'skypilot-filemounts-folder-{username}-{id}'
206
- FILE_MOUNTS_FILE_ONLY_BUCKET_NAME = 'skypilot-filemounts-files-{username}-{id}'
276
+ FILE_MOUNTS_BUCKET_NAME = 'skypilot-filemounts-{username}-{user_hash}-{id}'
207
277
  FILE_MOUNTS_LOCAL_TMP_DIR = 'skypilot-filemounts-files-{id}'
208
278
  FILE_MOUNTS_REMOTE_TMP_DIR = '/tmp/sky-{}-filemounts-files'
279
+ # For the API server, use a temporary directory in the same path as the upload
280
+ # directory to avoid using a different block device, which may not allow hard
281
+ # linking. E.g., in our API server deployment on k8s, ~/.sky/ is mounted from a
282
+ # persistent volume, so any contents in ~/.sky/ cannot be hard linked elsewhere.
283
+ FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.sky/tmp/'
284
+ # Base path for two-hop file mounts translation. See
285
+ # controller_utils.translate_local_file_mounts_to_two_hop().
286
+ FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.sky/tmp/controller'
287
+
288
+ # Used when managed jobs are created and
289
+ # files are synced up to the cloud.
290
+ FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
291
+ FILE_MOUNTS_SUBPATH = 'job-{run_id}/local-file-mounts/{i}'
292
+ FILE_MOUNTS_TMP_SUBPATH = 'job-{run_id}/tmp-files'
209
293
 
210
- # The default idle timeout for SkyPilot controllers. This include spot
294
+ # The default idle timeout for SkyPilot controllers. This includes jobs
211
295
  # controller and sky serve controller.
212
296
  # TODO(tian): Refactor to controller_utils. Current blocker: circular import.
213
297
  CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10
@@ -220,3 +304,52 @@ CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10
220
304
  # Serve: A default controller with 4 vCPU and 16 GB memory can run up to 16
221
305
  # services.
222
306
  CONTROLLER_PROCESS_CPU_DEMAND = 0.25
307
+ # The log for SkyPilot API server.
308
+ API_SERVER_LOGS = '~/.sky/api_server/server.log'
309
+ # The lock for creating the SkyPilot API server.
310
+ API_SERVER_CREATION_LOCK_PATH = '~/.sky/api_server/.creation.lock'
311
+
312
+ # The name for the environment variable that stores the URL of the SkyPilot
313
+ # API server.
314
+ SKY_API_SERVER_URL_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}API_SERVER_ENDPOINT'
315
+
316
+ # SkyPilot environment variables
317
+ SKYPILOT_NUM_NODES = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_NODES'
318
+ SKYPILOT_NODE_IPS = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_IPS'
319
+ SKYPILOT_NUM_GPUS_PER_NODE = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_GPUS_PER_NODE'
320
+ SKYPILOT_NODE_RANK = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_RANK'
321
+
322
+ # Placeholder for the SSH user in proxy command, replaced when the ssh_user is
323
+ # known after provisioning.
324
+ SKY_SSH_USER_PLACEHOLDER = 'skypilot:ssh_user'
325
+
326
+ # The keys that can be overridden in the `~/.sky/config.yaml` file. The
327
+ # overrides are specified in task YAMLs.
328
+ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
329
+ ('docker', 'run_options'),
330
+ ('nvidia_gpus', 'disable_ecc'),
331
+ ('kubernetes', 'pod_config'),
332
+ ('kubernetes', 'provision_timeout'),
333
+ ('gcp', 'managed_instance_group'),
334
+ ]
335
+ # When overriding the SkyPilot configs on the API server with the client one,
336
+ # we skip the following keys because they are meant to be client-side configs.
337
+ SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('admin_policy',),
338
+ ('api_server',),
339
+ ('allowed_clouds',)]
340
+
341
+ # Constants for Azure blob storage
342
+ WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
343
+ # Observed time for new role assignment to propagate was ~45s
344
+ WAIT_FOR_STORAGE_ACCOUNT_ROLE_ASSIGNMENT = 180
345
+ RETRY_INTERVAL_AFTER_ROLE_ASSIGNMENT = 10
346
+ ROLE_ASSIGNMENT_FAILURE_ERROR_MSG = (
347
+ 'Failed to assign Storage Blob Data Owner role to the '
348
+ 'storage account {storage_account_name}.')
349
+
350
+ # The placeholder for the local skypilot config path in file mounts for
351
+ # controllers.
352
+ LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
353
+
354
+ # Path to the generated cluster config yamls and ssh configs.
355
+ SKY_USER_FILE_PATH = '~/.sky/generated'
sky/skylet/events.py CHANGED
@@ -12,14 +12,17 @@ import yaml
12
12
  from sky import clouds
13
13
  from sky import sky_logging
14
14
  from sky.backends import cloud_vm_ray_backend
15
- from sky.clouds import cloud_registry
15
+ from sky.jobs import scheduler as managed_job_scheduler
16
+ from sky.jobs import state as managed_job_state
16
17
  from sky.jobs import utils as managed_job_utils
17
18
  from sky.serve import serve_utils
18
19
  from sky.skylet import autostop_lib
19
20
  from sky.skylet import constants
20
21
  from sky.skylet import job_lib
21
- from sky.utils import cluster_yaml_utils
22
+ from sky.usage import usage_lib
23
+ from sky.utils import cluster_utils
22
24
  from sky.utils import common_utils
25
+ from sky.utils import registry
23
26
  from sky.utils import ux_utils
24
27
 
25
28
  # Seconds of sleep between the processing of skylet events.
@@ -67,12 +70,13 @@ class JobSchedulerEvent(SkyletEvent):
67
70
  job_lib.scheduler.schedule_step(force_update_jobs=True)
68
71
 
69
72
 
70
- class ManagedJobUpdateEvent(SkyletEvent):
71
- """Skylet event for updating managed job status."""
73
+ class ManagedJobEvent(SkyletEvent):
74
+ """Skylet event for updating and scheduling managed jobs."""
72
75
  EVENT_INTERVAL_SECONDS = 300
73
76
 
74
77
  def _run(self):
75
- managed_job_utils.update_managed_job_status()
78
+ managed_job_utils.update_managed_jobs_statuses()
79
+ managed_job_scheduler.maybe_schedule_next_jobs()
76
80
 
77
81
 
78
82
  class ServiceUpdateEvent(SkyletEvent):
@@ -87,6 +91,14 @@ class ServiceUpdateEvent(SkyletEvent):
87
91
  serve_utils.update_service_status()
88
92
 
89
93
 
94
+ class UsageHeartbeatReportEvent(SkyletEvent):
95
+ """Skylet event for reporting usage."""
96
+ EVENT_INTERVAL_SECONDS = 600
97
+
98
+ def _run(self):
99
+ usage_lib.send_heartbeat(interval_seconds=self.EVENT_INTERVAL_SECONDS)
100
+
101
+
90
102
  class AutostopEvent(SkyletEvent):
91
103
  """Skylet event for autostop.
92
104
 
@@ -116,7 +128,8 @@ class AutostopEvent(SkyletEvent):
116
128
  logger.debug('autostop_config not set. Skipped.')
117
129
  return
118
130
 
119
- if job_lib.is_cluster_idle():
131
+ if (job_lib.is_cluster_idle() and
132
+ not managed_job_state.get_num_alive_jobs()):
120
133
  idle_minutes = (time.time() -
121
134
  autostop_lib.get_last_active_time()) // 60
122
135
  logger.debug(
@@ -140,11 +153,10 @@ class AutostopEvent(SkyletEvent):
140
153
  autostop_lib.set_autostopping_started()
141
154
 
142
155
  config_path = os.path.abspath(
143
- os.path.expanduser(
144
- cluster_yaml_utils.SKY_CLUSTER_YAML_REMOTE_PATH))
156
+ os.path.expanduser(cluster_utils.SKY_CLUSTER_YAML_REMOTE_PATH))
145
157
  config = common_utils.read_yaml(config_path)
146
- provider_name = cluster_yaml_utils.get_provider_name(config)
147
- cloud = cloud_registry.CLOUD_REGISTRY.from_str(provider_name)
158
+ provider_name = cluster_utils.get_provider_name(config)
159
+ cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
148
160
  assert cloud is not None, f'Unknown cloud: {provider_name}'
149
161
 
150
162
  if (cloud.PROVISIONER_VERSION >= clouds.ProvisionerVersion.