skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -12,9 +12,6 @@ from sky.utils import subprocess_utils
12
12
 
13
13
  logger = sky_logging.init_logger(__name__)
14
14
 
15
- DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
16
- 'the Docker daemon socket')
17
-
18
15
  # Configure environment variables. A docker image can have environment variables
19
16
  # set in the Dockerfile with `ENV``. We need to export these variables to the
20
17
  # shell environment, so that our ssh session can access them.
@@ -23,9 +20,16 @@ SETUP_ENV_VARS_CMD = (
23
20
  '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
24
21
  'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long
25
22
  '~/container_env_var.sh && '
26
- '$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh'
23
+ '$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;'
27
24
  )
28
25
 
26
+ # Docker daemon may not be ready when the machine is firstly started. The error
27
+ # message starts with the following string. We should wait for a while and retry
28
+ # the command.
29
+ DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
30
+ 'the Docker daemon socket')
31
+ _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30
32
+
29
33
 
30
34
  @dataclasses.dataclass
31
35
  class DockerLoginConfig:
@@ -34,6 +38,13 @@ class DockerLoginConfig:
34
38
  password: str
35
39
  server: str
36
40
 
41
+ def format_image(self, image: str) -> str:
42
+ """Format the image name with the server prefix."""
43
+ server_prefix = f'{self.server}/'
44
+ if not image.startswith(server_prefix):
45
+ return f'{server_prefix}{image}'
46
+ return image
47
+
37
48
  @classmethod
38
49
  def from_env_vars(cls, d: Dict[str, str]) -> 'DockerLoginConfig':
39
50
  return cls(
@@ -106,8 +117,8 @@ def docker_start_cmds(
106
117
  '--cap-add=SYS_ADMIN',
107
118
  '--device=/dev/fuse',
108
119
  '--security-opt=apparmor:unconfined',
120
+ '--entrypoint=/bin/bash',
109
121
  image,
110
- 'bash',
111
122
  ]
112
123
  return ' '.join(docker_run)
113
124
 
@@ -139,7 +150,9 @@ class DockerInitializer:
139
150
  def _run(self,
140
151
  cmd,
141
152
  run_env='host',
142
- wait_for_docker_daemon: bool = False) -> str:
153
+ wait_for_docker_daemon: bool = False,
154
+ separate_stderr: bool = False,
155
+ log_err_when_fail: bool = True) -> str:
143
156
 
144
157
  if run_env == 'docker':
145
158
  cmd = self._docker_expand_user(cmd, any_char=True)
@@ -152,29 +165,38 @@ class DockerInitializer:
152
165
  f' {shlex.quote(cmd)} ')
153
166
 
154
167
  logger.debug(f'+ {cmd}')
155
- cnt = 0
156
- retry = 3
168
+ start = time.time()
157
169
  while True:
158
- rc, stdout, stderr = self.runner.run(cmd,
159
- require_outputs=True,
160
- stream_logs=False,
161
- log_path=self.log_path)
162
- if (not wait_for_docker_daemon or
163
- DOCKER_PERMISSION_DENIED_STR not in stdout + stderr):
164
- break
165
-
166
- cnt += 1
167
- if cnt > retry:
168
- break
169
- logger.info(
170
- 'Failed to run docker command, retrying in 10 seconds... '
171
- f'({cnt}/{retry})')
172
- time.sleep(10)
170
+ rc, stdout, stderr = self.runner.run(
171
+ cmd,
172
+ require_outputs=True,
173
+ stream_logs=False,
174
+ separate_stderr=separate_stderr,
175
+ log_path=self.log_path)
176
+ if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr and
177
+ wait_for_docker_daemon):
178
+ if time.time() - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
179
+ if rc == 0:
180
+ # Set returncode to 1 if failed to connect to docker
181
+ # daemon after timeout.
182
+ rc = 1
183
+ break
184
+ # Close the cached connection to make the permission update of
185
+ # ssh user take effect, e.g. usermod -aG docker $USER, called
186
+ # by cloud-init of Azure.
187
+ self.runner.close_cached_connection()
188
+ logger.info('Failed to connect to docker daemon. It might be '
189
+ 'initializing, retrying in 5 seconds...')
190
+ time.sleep(5)
191
+ continue
192
+ break
173
193
  subprocess_utils.handle_returncode(
174
194
  rc,
175
195
  cmd,
176
- error_msg='Failed to run docker setup commands',
177
- stderr=stdout + stderr)
196
+ error_msg='Failed to run docker setup commands.',
197
+ stderr=stdout + stderr,
198
+ # Print out the error message if the command failed.
199
+ stream_logs=log_err_when_fail)
178
200
  return stdout.strip()
179
201
 
180
202
  def initialize(self) -> str:
@@ -205,9 +227,7 @@ class DockerInitializer:
205
227
  wait_for_docker_daemon=True)
206
228
  # We automatically add the server prefix to the image name if
207
229
  # the user did not add it.
208
- server_prefix = f'{docker_login_config.server}/'
209
- if not specific_image.startswith(server_prefix):
210
- specific_image = f'{server_prefix}{specific_image}'
230
+ specific_image = docker_login_config.format_image(specific_image)
211
231
 
212
232
  if self.docker_config.get('pull_before_run', True):
213
233
  assert specific_image, ('Image must be included in config if ' +
@@ -238,12 +258,13 @@ class DockerInitializer:
238
258
  # issue with nvidia container toolkit:
239
259
  # https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
240
260
  self._run(
241
- '[ -f /etc/docker/daemon.json ] || '
261
+ '{ which jq || sudo apt update && sudo apt install -y jq; } && '
262
+ '{ [ -f /etc/docker/daemon.json ] || '
242
263
  'echo "{}" | sudo tee /etc/docker/daemon.json;'
243
264
  'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '
244
265
  '/etc/docker/daemon.json > /tmp/daemon.json;'
245
266
  'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
246
- 'sudo systemctl restart docker')
267
+ 'sudo systemctl restart docker; } || true')
247
268
  user_docker_run_options = self.docker_config.get('run_options', [])
248
269
  start_command = docker_start_cmds(
249
270
  specific_image,
@@ -320,12 +341,22 @@ class DockerInitializer:
320
341
 
321
342
  def _check_docker_installed(self):
322
343
  no_exist = 'NoExist'
323
- cleaned_output = self._run(
324
- f'command -v {self.docker_cmd} || echo {no_exist!r}')
325
- if no_exist in cleaned_output or 'docker' not in cleaned_output:
326
- logger.error(
327
- f'{self.docker_cmd.capitalize()} not installed. Please use an '
328
- f'image with {self.docker_cmd.capitalize()} installed.')
344
+ # SkyPilot: Add the current user to the docker group first (if needed),
345
+ # before checking if docker is installed to avoid permission issues.
346
+ docker_cmd = ('id -nG $USER | grep -qw docker || '
347
+ 'sudo usermod -aG docker $USER > /dev/null 2>&1;'
348
+ f'command -v {self.docker_cmd} || echo {no_exist!r}')
349
+ cleaned_output = self._run(docker_cmd)
350
+ timeout = 60 * 10 # 10 minute timeout
351
+ start = time.time()
352
+ while no_exist in cleaned_output or 'docker' not in cleaned_output:
353
+ if time.time() - start > timeout:
354
+ logger.error(
355
+ f'{self.docker_cmd.capitalize()} not installed. Please use '
356
+ f'an image with {self.docker_cmd.capitalize()} installed.')
357
+ return
358
+ time.sleep(5)
359
+ cleaned_output = self._run(docker_cmd)
329
360
 
330
361
  def _check_container_status(self):
331
362
  if self.initialized:
@@ -340,9 +371,14 @@ class DockerInitializer:
340
371
  user_pos = string.find('~')
341
372
  if user_pos > -1:
342
373
  if self.home_dir is None:
343
- self.home_dir = (self._run(
344
- f'{self.docker_cmd} exec {self.container_name} '
345
- 'printenv HOME',))
374
+ cmd = (f'{self.docker_cmd} exec {self.container_name} '
375
+ 'printenv HOME')
376
+ self.home_dir = self._run(cmd, separate_stderr=True)
377
+ # Check for unexpected newline in home directory, which can be
378
+ # a common issue when the output is mixed with stderr.
379
+ assert '\n' not in self.home_dir, (
380
+ 'Unexpected newline in home directory '
381
+ f'({{self.home_dir}}) retrieved with {cmd}')
346
382
 
347
383
  if any_char:
348
384
  return string.replace('~/', self.home_dir + '/')
@@ -360,8 +396,8 @@ class DockerInitializer:
360
396
  'info -f "{{.Runtimes}}"'))
361
397
  if 'nvidia-container-runtime' in runtime_output:
362
398
  try:
363
- self._run('nvidia-smi')
364
- return run_options + ['--runtime=nvidia']
399
+ self._run('nvidia-smi', log_err_when_fail=False)
400
+ return run_options + ['--runtime=nvidia', '--gpus all']
365
401
  except Exception as e: # pylint: disable=broad-except
366
402
  logger.debug(
367
403
  'Nvidia Container Runtime is present in the docker image'
@@ -404,8 +440,8 @@ class DockerInitializer:
404
440
  def _check_container_exited(self) -> bool:
405
441
  if self.initialized:
406
442
  return True
407
- output = (self._run(check_docker_running_cmd(self.container_name,
408
- self.docker_cmd),
409
- wait_for_docker_daemon=True))
410
- return 'false' in output.lower(
411
- ) and 'no such object' not in output.lower()
443
+ output = self._run(check_docker_running_cmd(self.container_name,
444
+ self.docker_cmd),
445
+ wait_for_docker_daemon=True)
446
+ return ('false' in output.lower() and
447
+ 'no such object' not in output.lower())
@@ -1,21 +1,22 @@
1
1
  """FluidStack API client."""
2
2
 
3
- import functools
4
3
  import json
5
4
  import os
6
- from typing import Any, Dict, List, Optional
5
+ import time
6
+ from typing import Any, Dict, List
7
7
  import uuid
8
8
 
9
9
  import requests
10
10
 
11
+ from sky.utils import annotations
12
+
11
13
 
12
14
  def get_key_suffix():
13
15
  return str(uuid.uuid4()).replace('-', '')[:8]
14
16
 
15
17
 
16
- ENDPOINT = 'https://api.fluidstack.io/v1/'
18
+ ENDPOINT = 'https://platform.fluidstack.io/'
17
19
  FLUIDSTACK_API_KEY_PATH = '~/.fluidstack/api_key'
18
- FLUIDSTACK_API_TOKEN_PATH = '~/.fluidstack/api_token'
19
20
 
20
21
 
21
22
  def read_contents(path: str) -> str:
@@ -30,7 +31,7 @@ class FluidstackAPIError(Exception):
30
31
  super().__init__(message)
31
32
 
32
33
 
33
- def raise_fluidstack_error(response: requests.Response) -> None:
34
+ def raise_fluidstack_error(response: 'requests.Response') -> None:
34
35
  """Raise FluidstackAPIError if appropriate."""
35
36
  status_code = response.status_code
36
37
  if response.ok:
@@ -46,109 +47,76 @@ def raise_fluidstack_error(response: requests.Response) -> None:
46
47
  raise FluidstackAPIError(f'{message}', status_code)
47
48
 
48
49
 
49
- @functools.lru_cache()
50
- def with_nvidia_drivers(region: str):
51
- if region in ['norway_4_eu', 'generic_1_canada']:
52
- return False
53
- client = FluidstackClient()
54
- plans = client.get_plans()
55
- for plan in plans:
56
- if region in [r['id'] for r in plan['regions']]:
57
- if 'Ubuntu 20.04 LTS (Nvidia)' in plan['os_options']:
58
- return True
59
- return False
60
-
61
-
62
50
  class FluidstackClient:
63
51
  """FluidStack API Client"""
64
52
 
65
53
  def __init__(self):
66
54
  self.api_key = read_contents(
67
- os.path.expanduser(FLUIDSTACK_API_KEY_PATH))
68
- self.api_token = read_contents(
69
- os.path.expanduser(FLUIDSTACK_API_TOKEN_PATH))
55
+ os.path.expanduser(FLUIDSTACK_API_KEY_PATH)).strip()
70
56
 
71
57
  def get_plans(self):
72
- response = requests.get(ENDPOINT + 'plans')
58
+ response = requests.get(ENDPOINT + 'list_available_configurations',
59
+ headers={'api-key': self.api_key})
73
60
  raise_fluidstack_error(response)
74
61
  plans = response.json()
75
- plans = [
76
- plan for plan in plans
77
- if plan['minimum_commitment'] == 'hourly' and plan['type'] in
78
- ['preconfigured', 'custom'] and plan['gpu_type'] != 'NO GPU'
79
- ]
80
62
  return plans
81
63
 
82
- def list_instances(
83
- self,
84
- tag_filters: Optional[Dict[str,
85
- str]] = None) -> List[Dict[str, Any]]:
64
+ def list_instances(self) -> List[Dict[str, Any]]:
86
65
  response = requests.get(
87
- ENDPOINT + 'servers',
88
- auth=(self.api_key, self.api_token),
66
+ ENDPOINT + 'instances',
67
+ headers={'api-key': self.api_key},
89
68
  )
90
69
  raise_fluidstack_error(response)
91
70
  instances = response.json()
92
- filtered_instances = []
93
-
94
- for instance in instances:
95
- if isinstance(instance['tags'], str):
96
- instance['tags'] = json.loads(instance['tags'])
97
- if not instance['tags']:
98
- instance['tags'] = {}
99
- if tag_filters:
100
- for key in tag_filters:
101
- if instance['tags'].get(key, None) != tag_filters[key]:
102
- break
103
- else:
104
- filtered_instances.append(instance)
105
- else:
106
- filtered_instances.append(instance)
107
-
108
- return filtered_instances
71
+ return instances
109
72
 
110
73
  def create_instance(
111
74
  self,
112
75
  instance_type: str = '',
113
- hostname: str = '',
76
+ name: str = '',
114
77
  region: str = '',
115
78
  ssh_pub_key: str = '',
116
79
  count: int = 1,
117
80
  ) -> List[str]:
118
81
  """Launch new instances."""
119
82
 
120
- config: Dict[str, Any] = {}
121
83
  plans = self.get_plans()
122
84
  regions = self.list_regions()
85
+ gpu_type, gpu_count = instance_type.split('::')
86
+ gpu_count = int(gpu_count)
87
+
123
88
  plans = [
124
- plan for plan in plans if plan['plan_id'] == instance_type and
125
- region in [r['id'] for r in plan['regions']]
89
+ plan for plan in plans if plan['gpu_type'] == gpu_type and
90
+ gpu_count in plan['gpu_counts'] and region in plan['regions']
126
91
  ]
127
92
  if not plans:
128
93
  raise FluidstackAPIError(
129
94
  f'Plan {instance_type} out of stock in region {region}')
130
95
 
131
96
  ssh_key = self.get_or_add_ssh_key(ssh_pub_key)
132
- os_id = 'Ubuntu 20.04 LTS'
133
- body = dict(plan=None if config else instance_type,
134
- region=regions[region],
135
- os=os_id,
136
- hostname=hostname,
137
- ssh_keys=[ssh_key['id']],
138
- multiplicity=count,
139
- config=config)
140
-
141
- response = requests.post(ENDPOINT + 'server',
142
- auth=(self.api_key, self.api_token),
143
- json=body)
144
- raise_fluidstack_error(response)
145
- instance_ids = response.json().get('multiple')
146
- assert all(id is not None for id in instance_ids), instance_ids
97
+ default_operating_system = 'ubuntu_22_04_lts_nvidia'
98
+ instance_ids = []
99
+ for _ in range(count):
100
+ body = dict(gpu_type=gpu_type,
101
+ gpu_count=gpu_count,
102
+ region=regions[region],
103
+ operating_system_label=default_operating_system,
104
+ name=name,
105
+ ssh_key=ssh_key['name'])
106
+
107
+ response = requests.post(ENDPOINT + 'instances',
108
+ headers={'api-key': self.api_key},
109
+ json=body)
110
+ raise_fluidstack_error(response)
111
+ instance_id = response.json().get('id')
112
+ instance_ids.append(instance_id)
113
+ time.sleep(1)
114
+
147
115
  return instance_ids
148
116
 
149
117
  def list_ssh_keys(self):
150
- response = requests.get(ENDPOINT + 'ssh',
151
- auth=(self.api_key, self.api_token))
118
+ response = requests.get(ENDPOINT + 'ssh_keys',
119
+ headers={'api-key': self.api_key})
152
120
  raise_fluidstack_error(response)
153
121
  return response.json()
154
122
 
@@ -156,86 +124,50 @@ class FluidstackClient:
156
124
  """Add ssh key if not already added."""
157
125
  ssh_keys = self.list_ssh_keys()
158
126
  for key in ssh_keys:
159
- if key['public_key'].strip() == ssh_pub_key.strip():
160
- return {
161
- 'id': key['id'],
162
- 'name': key['name'],
163
- 'ssh_key': ssh_pub_key
164
- }
127
+ if key['public_key'].strip().split()[:2] == ssh_pub_key.strip(
128
+ ).split()[:2]:
129
+ return {'name': key['name'], 'ssh_key': ssh_pub_key}
165
130
  ssh_key_name = 'skypilot-' + get_key_suffix()
166
131
  response = requests.post(
167
- ENDPOINT + 'ssh',
168
- auth=(self.api_key, self.api_token),
132
+ ENDPOINT + 'ssh_keys',
133
+ headers={'api-key': self.api_key},
169
134
  json=dict(name=ssh_key_name, public_key=ssh_pub_key),
170
135
  )
171
136
  raise_fluidstack_error(response)
172
- key_id = response.json()['id']
173
- return {'id': key_id, 'name': ssh_key_name, 'ssh_key': ssh_pub_key}
137
+ return {'name': ssh_key_name, 'ssh_key': ssh_pub_key}
174
138
 
175
- @functools.lru_cache()
139
+ @annotations.lru_cache(scope='global')
176
140
  def list_regions(self):
177
- response = requests.get(ENDPOINT + 'plans')
178
- raise_fluidstack_error(response)
179
- plans = response.json()
180
- plans = [
181
- plan for plan in plans
182
- if plan['minimum_commitment'] == 'hourly' and plan['type'] in
183
- ['preconfigured', 'custom'] and plan['gpu_type'] != 'NO GPU'
184
- ]
141
+ plans = self.get_plans()
185
142
 
186
143
  def get_regions(plans: List) -> dict:
187
144
  """Return a list of regions where the plan is available."""
188
145
  regions = {}
189
146
  for plan in plans:
190
147
  for region in plan.get('regions', []):
191
- regions[region['id']] = region['id']
148
+ regions[region] = region
192
149
  return regions
193
150
 
194
151
  regions = get_regions(plans)
195
152
  return regions
196
153
 
197
154
  def delete(self, instance_id: str):
198
- response = requests.delete(ENDPOINT + 'server/' + instance_id,
199
- auth=(self.api_key, self.api_token))
155
+ response = requests.delete(ENDPOINT + 'instances/' + instance_id,
156
+ headers={'api-key': self.api_key})
200
157
  raise_fluidstack_error(response)
201
158
  return response.json()
202
159
 
203
160
  def stop(self, instance_id: str):
204
- response = requests.put(ENDPOINT + 'server/' + instance_id + '/stop',
205
- auth=(self.api_key, self.api_token))
206
- raise_fluidstack_error(response)
207
- return response.json()
208
-
209
- def restart(self, instance_id: str):
210
- response = requests.post(ENDPOINT + 'server/' + instance_id + '/reboot',
211
- auth=(self.api_key, self.api_token))
212
- raise_fluidstack_error(response)
213
- return response.json()
214
-
215
- def info(self, instance_id: str):
216
- response = requests.get(ENDPOINT + f'server/{instance_id}',
217
- auth=(self.api_key, self.api_token))
218
- raise_fluidstack_error(response)
219
- return response.json()
220
-
221
- def status(self, instance_id: str):
222
- response = self.info(instance_id)
223
- return response['status']
224
-
225
- def add_tags(self, instance_id: str, tags: Dict[str, str]) -> str:
226
- response = requests.patch(
227
- ENDPOINT + f'server/{instance_id}/tag',
228
- auth=(self.api_key, self.api_token),
229
- json=dict(tags=json.dumps(tags)),
230
- )
161
+ response = requests.put(ENDPOINT + 'instances/' + instance_id + '/stop',
162
+ headers={'api-key': self.api_key})
231
163
  raise_fluidstack_error(response)
232
164
  return response.json()
233
165
 
234
- def rename(self, instance_id: str, hostname: str) -> str:
235
- response = requests.patch(
236
- ENDPOINT + f'server/{instance_id}/rename',
237
- auth=(self.api_key, self.api_token),
238
- json=dict(name=hostname),
166
+ def rename(self, instance_id: str, name: str) -> str:
167
+ response = requests.put(
168
+ ENDPOINT + f'instances/{instance_id}/rename',
169
+ headers={'api-key': self.api_key},
170
+ json=dict(new_instance_name=name),
239
171
  )
240
172
  raise_fluidstack_error(response)
241
173
  return response.json()
@@ -1,15 +1,16 @@
1
1
  """FluidStack instance provisioning."""
2
+ import os
2
3
  import time
3
4
  from typing import Any, Dict, List, Optional
4
5
 
5
6
  from sky import authentication as auth
6
7
  from sky import exceptions
7
8
  from sky import sky_logging
8
- from sky import status_lib
9
9
  from sky.provision import common
10
10
  from sky.provision.fluidstack import fluidstack_utils as utils
11
11
  from sky.utils import command_runner
12
12
  from sky.utils import common_utils
13
+ from sky.utils import status_lib
13
14
  from sky.utils import subprocess_utils
14
15
  from sky.utils import ux_utils
15
16
 
@@ -25,10 +26,11 @@ logger = sky_logging.init_logger(__name__)
25
26
 
26
27
  def get_internal_ip(node_info: Dict[str, Any]) -> None:
27
28
  node_info['internal_ip'] = node_info['ip_address']
29
+ private_key_path, _ = auth.get_or_generate_keys()
28
30
  runner = command_runner.SSHCommandRunner(
29
- node_info['ip_address'],
30
- ssh_user=node_info['capabilities']['default_user_name'],
31
- ssh_private_key=auth.PRIVATE_SSH_KEY_PATH)
31
+ (node_info['ip_address'], 22),
32
+ ssh_user='ubuntu',
33
+ ssh_private_key=os.path.expanduser(private_key_path))
32
34
  result = runner.run(_GET_INTERNAL_IP_CMD,
33
35
  require_outputs=True,
34
36
  stream_logs=False)
@@ -61,7 +63,7 @@ def _filter_instances(
61
63
  if (include_instances is not None and
62
64
  instance['id'] not in include_instances):
63
65
  continue
64
- if instance.get('hostname') in possible_names:
66
+ if instance.get('name') in possible_names:
65
67
  filtered_instances[instance['id']] = instance
66
68
  return filtered_instances
67
69
 
@@ -69,7 +71,7 @@ def _filter_instances(
69
71
  def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
70
72
  head_instance_id = None
71
73
  for inst_id, inst in instances.items():
72
- if inst['hostname'].endswith('-head'):
74
+ if inst['name'].endswith('-head'):
73
75
  head_instance_id = inst_id
74
76
  break
75
77
  return head_instance_id
@@ -79,18 +81,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
79
81
  config: common.ProvisionConfig) -> common.ProvisionRecord:
80
82
  """Runs instances for the given cluster."""
81
83
 
82
- pending_status = [
83
- 'create',
84
- 'requesting',
85
- 'provisioning',
86
- 'customizing',
87
- 'starting',
88
- 'stopping',
89
- 'start',
90
- 'stop',
91
- 'reboot',
92
- 'rebooting',
93
- ]
84
+ pending_status = ['pending', 'provisioning']
94
85
  while True:
95
86
  instances = _filter_instances(cluster_name_on_cloud, pending_status)
96
87
  if len(instances) > config.count:
@@ -127,7 +118,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
127
118
  f'{instance_name}')
128
119
  rename(instance_id, instance_name)
129
120
  if (instance_id != head_instance_id and
130
- instance['hostname'].endswith('-head')):
121
+ instance['name'].endswith('-head')):
131
122
  # Multiple head instances exist.
132
123
  # This is a rare case when the instance name was manually modified
133
124
  # on the cloud or some unexpected behavior happened.
@@ -167,7 +158,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
167
158
  node_type = 'head' if head_instance_id is None else 'worker'
168
159
  try:
169
160
  instance_ids = utils.FluidstackClient().create_instance(
170
- hostname=f'{cluster_name_on_cloud}-{node_type}',
161
+ name=f'{cluster_name_on_cloud}-{node_type}',
171
162
  instance_type=config.node_config['InstanceType'],
172
163
  ssh_pub_key=config.node_config['AuthorizedKey'],
173
164
  region=region)
@@ -184,9 +175,6 @@ def run_instances(region: str, cluster_name_on_cloud: str,
184
175
  instances = _filter_instances(cluster_name_on_cloud,
185
176
  pending_status + ['running'])
186
177
  if len(instances) < config.count:
187
- # Some of pending instances have been convert to a state that will
188
- # not convert to `running` status. This can be due to resource
189
- # availability issue.
190
178
  all_instances = _filter_instances(
191
179
  cluster_name_on_cloud,
192
180
  status_filters=None,
@@ -253,15 +241,11 @@ def terminate_instances(
253
241
  instances = _filter_instances(cluster_name_on_cloud, None)
254
242
  for inst_id, inst in instances.items():
255
243
  logger.debug(f'Terminating instance {inst_id}: {inst}')
256
- if worker_only and inst['hostname'].endswith('-head'):
244
+ if worker_only and inst['name'].endswith('-head'):
257
245
  continue
258
246
  try:
259
247
  utils.FluidstackClient().delete(inst_id)
260
248
  except Exception as e: # pylint: disable=broad-except
261
- if (isinstance(e, utils.FluidstackAPIError) and
262
- 'Machine is already terminated' in str(e)):
263
- logger.debug(f'Instance {inst_id} is already terminated.')
264
- continue
265
249
  with ux_utils.print_exception_no_traceback():
266
250
  raise RuntimeError(
267
251
  f'Failed to terminate instance {inst_id}: '
@@ -291,7 +275,7 @@ def get_cluster_info(
291
275
  tags={},
292
276
  )
293
277
  ]
294
- if instance_info['hostname'].endswith('-head'):
278
+ if instance_info['name'].endswith('-head'):
295
279
  head_instance_id = instance_id
296
280
 
297
281
  return common.ClusterInfo(instances=instances,
@@ -311,22 +295,10 @@ def query_instances(
311
295
  instances = _filter_instances(cluster_name_on_cloud, None)
312
296
  instances = _filter_instances(cluster_name_on_cloud, None)
313
297
  status_map = {
314
- 'provisioning': status_lib.ClusterStatus.INIT,
315
- 'requesting': status_lib.ClusterStatus.INIT,
316
- 'create': status_lib.ClusterStatus.INIT,
317
- 'customizing': status_lib.ClusterStatus.INIT,
318
- 'stopping': status_lib.ClusterStatus.STOPPED,
319
- 'stop': status_lib.ClusterStatus.STOPPED,
320
- 'start': status_lib.ClusterStatus.INIT,
321
- 'reboot': status_lib.ClusterStatus.STOPPED,
322
- 'rebooting': status_lib.ClusterStatus.STOPPED,
298
+ 'pending': status_lib.ClusterStatus.INIT,
323
299
  'stopped': status_lib.ClusterStatus.STOPPED,
324
- 'starting': status_lib.ClusterStatus.INIT,
325
300
  'running': status_lib.ClusterStatus.UP,
326
- 'failed to create': status_lib.ClusterStatus.INIT,
327
- 'timeout error': status_lib.ClusterStatus.INIT,
328
- 'out of stock': status_lib.ClusterStatus.INIT,
329
- 'terminating': None,
301
+ 'unhealthy': status_lib.ClusterStatus.INIT,
330
302
  'terminated': None,
331
303
  }
332
304
  statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}