skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/provision/common.py CHANGED
@@ -6,6 +6,7 @@ import os
6
6
  from typing import Any, Dict, List, Optional, Tuple
7
7
 
8
8
  from sky import sky_logging
9
+ from sky.utils import env_options
9
10
  from sky.utils import resources_utils
10
11
 
11
12
  # NOTE: we can use pydantic instead of dataclasses or namedtuples, because
@@ -96,6 +97,8 @@ class InstanceInfo:
96
97
  external_ip: Optional[str]
97
98
  tags: Dict[str, str]
98
99
  ssh_port: int = 22
100
+ # The internal service address of the instance on Kubernetes.
101
+ internal_svc: Optional[str] = None
99
102
 
100
103
  def get_feasible_ip(self) -> str:
101
104
  """Get the most feasible IPs of the instance. This function returns
@@ -238,12 +241,21 @@ class Endpoint:
238
241
 
239
242
  @dataclasses.dataclass
240
243
  class SocketEndpoint(Endpoint):
241
- """Socket endpoint accesible via a host and a port."""
244
+ """Socket endpoint accessible via a host and a port."""
242
245
  port: Optional[int]
243
246
  host: str = ''
244
247
 
245
248
  def url(self, override_ip: Optional[str] = None) -> str:
246
249
  host = override_ip if override_ip else self.host
250
+ if env_options.Options.RUNNING_IN_BUILDKITE.get(
251
+ ) and 'localhost' in host:
252
+ # In Buildkite CI, we run a kind (Kubernetes in Docker) cluster.
253
+ # The controller pod runs inside this kind cluster, which itself
254
+ # runs in a container. When the pod tries to access 'localhost',
255
+ # it can't reach the host machine's localhost. Using
256
+ # 'host.docker.internal' allows the pod to properly communicate
257
+ # with services running on the host machine's localhost.
258
+ host = 'host.docker.internal'
247
259
  return f'{host}{":" + str(self.port) if self.port else ""}'
248
260
 
249
261
 
sky/provision/cudo/cudo_machine_type.py CHANGED
@@ -1,7 +1,7 @@
1
1
  """Cudo Compute VM spec helper for SkyPilot."""
2
2
  import csv
3
3
 
4
- from sky.clouds.service_catalog.common import get_catalog_path
4
+ from sky.catalog.common import get_catalog_path
5
5
 
6
6
  VMS_CSV = 'cudo/vms.csv'
7
7
 
sky/provision/cudo/cudo_utils.py CHANGED
@@ -1,22 +1,28 @@
1
1
  """Cudo catalog helper."""
2
2
 
3
3
  cudo_gpu_model = {
4
- 'NVIDIA V100': 'V100',
5
- 'NVIDIA A40': 'A40',
6
- 'RTX 3080': 'RTX3080',
7
- 'RTX A4000': 'RTXA4000',
8
- 'RTX A4500': 'RTXA4500',
4
+ 'H100 NVL': 'H100',
5
+ 'H100 SXM': 'H100-SXM',
6
+ 'L40S (compute mode)': 'L40S',
7
+ 'L40S (graphics mode)': 'L40S',
8
+ 'A40 (compute mode)': 'A40',
9
+ 'A40 (graphics mode)': 'A40',
9
10
  'RTX A5000': 'RTXA5000',
10
11
  'RTX A6000': 'RTXA6000',
12
+ 'A100 80GB PCIe': 'A100',
13
+ 'A800 PCIe': 'A800',
14
+ 'V100': 'V100',
11
15
  }
12
16
 
13
17
  cudo_gpu_mem = {
14
- 'RTX3080': 12,
18
+ 'H100': 94,
19
+ 'H100-SXM': 80,
20
+ 'L40S': 48,
15
21
  'A40': 48,
16
- 'RTXA4000': 16,
17
- 'RTXA4500': 20,
18
22
  'RTXA5000': 24,
19
23
  'RTXA6000': 48,
24
+ 'A100': 80,
25
+ 'A800': 80,
20
26
  'V100': 16,
21
27
  }
22
28
 
sky/provision/cudo/cudo_wrapper.py CHANGED
@@ -4,7 +4,7 @@ from typing import Dict
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.adaptors import cudo
7
- import sky.provision.cudo.cudo_utils as utils
7
+ from sky.provision.cudo import cudo_utils as utils
8
8
 
9
9
  logger = sky_logging.init_logger(__name__)
10
10
 
@@ -28,12 +28,10 @@ def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
28
28
  size_gib=disk_size),
29
29
  metadata=tags)
30
30
 
31
- try:
32
- api = cudo.cudo.cudo_api.virtual_machines()
33
- vm = api.create_vm(cudo.cudo.cudo_api.project_id_throwable(), request)
34
- return vm.to_dict()['id']
35
- except cudo.cudo.rest.ApiException as e:
36
- raise e
31
+ api = cudo.cudo.cudo_api.virtual_machines()
32
+ vm = api.create_vm(cudo.cudo.cudo_api.project_id_throwable(), request)
33
+
34
+ return vm.to_dict()['id']
37
35
 
38
36
 
39
37
  def remove(instance_id: str):
@@ -54,11 +52,8 @@ def remove(instance_id: str):
54
52
  state = 'unknown'
55
53
  project_id = cudo.cudo.cudo_api.project_id_throwable()
56
54
  while retry_count < max_retries:
57
- try:
58
- vm = api.get_vm(project_id, instance_id)
59
- state = vm.to_dict()['vm']['short_state']
60
- except cudo.cudo.rest.ApiException as e:
61
- raise e
55
+ vm = api.get_vm(project_id, instance_id)
56
+ state = vm.to_dict()['vm']['short_state']
62
57
 
63
58
  if state in terminate_ok:
64
59
  break
@@ -69,76 +64,82 @@ def remove(instance_id: str):
69
64
  'Timeout error, could not terminate due to VM state: {}'.format(
70
65
  state))
71
66
 
72
- try:
73
- api.terminate_vm(project_id, instance_id)
74
- except cudo.cudo.rest.ApiException as e:
75
- raise e
67
+ api.terminate_vm(project_id, instance_id)
76
68
 
77
69
 
78
70
  def set_tags(instance_id: str, tags: Dict):
79
71
  """Sets the tags for the given instance."""
80
- try:
81
- api = cudo.cudo.cudo_api.virtual_machines()
82
- api.update_vm_metadata(
83
- cudo.cudo.cudo_api.project_id(), instance_id,
84
- cudo.cudo.UpdateVMMetadataBody(
85
- metadata=tags,
86
- merge=True)) # TODO (skypilot team) merge or overwrite?
87
- except cudo.cudo.rest.ApiException as e:
88
- raise e
72
+ api = cudo.cudo.cudo_api.virtual_machines()
73
+ api.update_vm_metadata(
74
+ cudo.cudo.cudo_api.project_id(), instance_id,
75
+ cudo.cudo.UpdateVMMetadataBody(
76
+ metadata=tags,
77
+ merge=True)) # TODO (skypilot team) merge or overwrite?
89
78
 
90
79
 
91
80
  def get_instance(vm_id):
92
- try:
93
- api = cudo.cudo.cudo_api.virtual_machines()
94
- vm = api.get_vm(cudo.cudo.cudo_api.project_id_throwable(), vm_id)
95
- vm_dict = vm.to_dict()
96
- return vm_dict
97
- except cudo.cudo.rest.ApiException as e:
98
- raise e
81
+ api = cudo.cudo.cudo_api.virtual_machines()
82
+ vm = api.get_vm(cudo.cudo.cudo_api.project_id_throwable(), vm_id)
83
+ vm_dict = vm.to_dict()
84
+ return vm_dict
99
85
 
100
86
 
101
87
  def list_instances():
102
- try:
103
- api = cudo.cudo.cudo_api.virtual_machines()
104
- vms = api.list_vms(cudo.cudo.cudo_api.project_id_throwable())
105
- instances = {}
106
- for vm in vms.to_dict()['vms']:
107
- ex_ip = vm['external_ip_address']
108
- in_ip = vm['internal_ip_address']
109
- if not in_ip:
110
- in_ip = ex_ip
111
- instance = {
112
- # active_state, init_state, lcm_state, short_state
113
- 'status': vm['short_state'],
114
- 'tags': vm['metadata'],
115
- 'name': vm['id'],
116
- 'ip': ex_ip,
117
- 'external_ip': ex_ip,
118
- 'internal_ip': in_ip
119
- }
120
- instances[vm['id']] = instance
121
- return instances
122
- except cudo.cudo.rest.ApiException as e:
123
- raise e
88
+ api = cudo.cudo.cudo_api.virtual_machines()
89
+ vms = api.list_vms(cudo.cudo.cudo_api.project_id_throwable())
90
+ instances = {}
91
+ for vm in vms.to_dict()['vms']:
92
+ ex_ip = vm['external_ip_address']
93
+ in_ip = vm['internal_ip_address']
94
+ if not in_ip:
95
+ in_ip = ex_ip
96
+ instance = {
97
+ # active_state, init_state, lcm_state, short_state
98
+ 'status': vm['short_state'],
99
+ 'tags': vm['metadata'],
100
+ 'name': vm['id'],
101
+ 'ip': ex_ip,
102
+ 'external_ip': ex_ip,
103
+ 'internal_ip': in_ip
104
+ }
105
+ instances[vm['id']] = instance
106
+ return instances
124
107
 
125
108
 
126
109
  def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem,
127
110
  cpus):
128
- try:
129
- gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
130
- api = cudo.cudo.cudo_api.virtual_machines()
131
- types = api.list_vm_machine_types(mem,
132
- cpus,
133
- gpu=gpu_count,
134
- gpu_model=gpu_model,
135
- data_center_id=data_center_id)
136
- types_dict = types.to_dict()
137
- hc = types_dict['host_configs']
138
- total_count = sum(item['count_vm_available'] for item in hc)
139
- if total_count < to_start_count:
140
- raise Exception(
141
- 'Too many VMs requested, try another gpu type or region')
142
- return total_count
143
- except cudo.cudo.rest.ApiException as e:
144
- raise e
111
+ gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
112
+ api = cudo.cudo.cudo_api.virtual_machines()
113
+ types = api.list_vm_machine_types2()
114
+ types_dict = types.to_dict()
115
+ machine_types = types_dict['machine_types']
116
+
117
+ # Filter machine types based on requirements
118
+ matching_types = []
119
+ for machine_type in machine_types:
120
+ # Check if this machine type matches our requirements
121
+ if (machine_type['data_center_id'] == data_center_id and
122
+ machine_type['gpu_model'] == gpu_model and
123
+ machine_type['min_vcpu'] <= cpus <= machine_type.get(
124
+ 'max_vcpu_free', float('inf')) and
125
+ machine_type['min_memory_gib'] <= mem <= machine_type.get(
126
+ 'max_memory_gib_free', float('inf'))):
127
+
128
+ # Calculate available VMs based on resource constraints
129
+ max_vms_by_vcpu = machine_type[
130
+ 'total_vcpu_free'] // cpus if cpus > 0 else float('inf')
131
+ max_vms_by_memory = machine_type[
132
+ 'total_memory_gib_free'] // mem if mem > 0 else float('inf')
133
+ max_vms_by_gpu = machine_type[
134
+ 'total_gpu_free'] // gpu_count if gpu_count > 0 else float(
135
+ 'inf')
136
+
137
+ available_vms = min(max_vms_by_vcpu, max_vms_by_memory,
138
+ max_vms_by_gpu)
139
+ matching_types.append(available_vms)
140
+
141
+ total_count = sum(matching_types)
142
+ if total_count < to_start_count:
143
+ raise Exception(
144
+ 'Too many VMs requested, try another gpu type or region')
145
+ return total_count
@@ -1,7 +1,7 @@
1
1
  """Cudo Compute instance provisioning."""
2
2
 
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky.provision import common
@@ -40,10 +40,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
40
40
  return head_instance_id
41
41
 
42
42
 
43
- def run_instances(region: str, cluster_name_on_cloud: str,
43
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
44
44
  config: common.ProvisionConfig) -> common.ProvisionRecord:
45
45
  """Runs instances for the given cluster."""
46
-
46
+ del cluster_name # unused
47
47
  pending_status = ['pend', 'init', 'prol', 'boot']
48
48
 
49
49
  while True:
@@ -191,11 +191,14 @@ def get_cluster_info(
191
191
 
192
192
 
193
193
  def query_instances(
194
+ cluster_name: str,
194
195
  cluster_name_on_cloud: str,
195
196
  provider_config: Optional[Dict[str, Any]] = None,
196
197
  non_terminated_only: bool = True,
197
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
198
+ retry_if_missing: bool = False,
199
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
198
200
  """See sky/provision/__init__.py"""
201
+ del cluster_name, retry_if_missing # unused
199
202
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
200
203
  instances = _filter_instances(cluster_name_on_cloud, None)
201
204
 
@@ -210,12 +213,13 @@ def query_instances(
210
213
  'done': status_lib.ClusterStatus.STOPPED,
211
214
  'poff': status_lib.ClusterStatus.STOPPED,
212
215
  }
213
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
216
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
217
+ Optional[str]]] = {}
214
218
  for inst_id, inst in instances.items():
215
219
  status = status_map[inst['status']]
216
220
  if non_terminated_only and status is None:
217
221
  continue
218
- statuses[inst_id] = status
222
+ statuses[inst_id] = (status, None)
219
223
  return statuses
220
224
 
221
225
 
@@ -1,7 +1,7 @@
1
1
  """DigitalOcean instance provisioning."""
2
2
 
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
  import uuid
6
6
 
7
7
  from sky import sky_logging
@@ -26,10 +26,10 @@ def _get_head_instance(
26
26
  return None
27
27
 
28
28
 
29
- def run_instances(region: str, cluster_name_on_cloud: str,
29
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
30
30
  config: common.ProvisionConfig) -> common.ProvisionRecord:
31
31
  """Runs instances for the given cluster."""
32
-
32
+ del cluster_name # unused
33
33
  pending_status = ['new']
34
34
  newly_started_instances = utils.filter_instances(cluster_name_on_cloud,
35
35
  pending_status + ['off'])
@@ -242,11 +242,14 @@ def get_cluster_info(
242
242
 
243
243
 
244
244
  def query_instances(
245
+ cluster_name: str,
245
246
  cluster_name_on_cloud: str,
246
247
  provider_config: Optional[Dict[str, Any]] = None,
247
248
  non_terminated_only: bool = True,
248
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
249
+ retry_if_missing: bool = False,
250
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
249
251
  """See sky/provision/__init__.py"""
252
+ del cluster_name, retry_if_missing # unused
250
253
  # terminated instances are not retrieved by the
251
254
  # API making `non_terminated_only` argument moot.
252
255
  del non_terminated_only
@@ -260,10 +263,11 @@ def query_instances(
260
263
  'active': status_lib.ClusterStatus.UP,
261
264
  'off': status_lib.ClusterStatus.STOPPED,
262
265
  }
263
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
266
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
267
+ Optional[str]]] = {}
264
268
  for instance_meta in instances.values():
265
269
  status = status_map[instance_meta['status']]
266
- statuses[instance_meta['name']] = status
270
+ statuses[instance_meta['name']] = (status, None)
267
271
  return statuses
268
272
 
269
273
 
sky/provision/do/utils.py CHANGED
@@ -17,6 +17,7 @@ from sky.provision import constants as provision_constants
17
17
  from sky.provision.do import constants
18
18
  from sky.utils import annotations
19
19
  from sky.utils import common_utils
20
+ from sky.utils import yaml_utils
20
21
 
21
22
  logger = sky_logging.init_logger(__name__)
22
23
 
@@ -30,7 +31,7 @@ POSSIBLE_CREDENTIALS_PATHS = [
30
31
  INITIAL_BACKOFF_SECONDS = 10
31
32
  MAX_BACKOFF_FACTOR = 10
32
33
  MAX_ATTEMPTS = 6
33
- SSH_KEY_NAME_ON_DO = f'sky-key-{common_utils.get_user_hash()}'
34
+ SSH_KEY_NAME_ON_DO_PREFIX = 'sky-key-'
34
35
 
35
36
  _client = None
36
37
  _ssh_key_id = None
@@ -61,7 +62,7 @@ def _init_client():
61
62
  if get_credentials_path() is None:
62
63
  raise DigitalOceanError(
63
64
  'No credentials found, please run `doctl auth init`')
64
- credentials = common_utils.read_yaml(get_credentials_path())
65
+ credentials = yaml_utils.read_yaml(get_credentials_path())
65
66
  default_token = credentials.get('access-token', None)
66
67
  if default_token is not None:
67
68
  try:
@@ -125,7 +126,7 @@ def ssh_key_id(public_key: str):
125
126
 
126
127
  request = {
127
128
  'public_key': public_key,
128
- 'name': SSH_KEY_NAME_ON_DO,
129
+ 'name': SSH_KEY_NAME_ON_DO_PREFIX + common_utils.get_user_hash(),
129
130
  }
130
131
  _ssh_key_id = client().ssh_keys.create(body=request)['ssh_key']
131
132
  return _ssh_key_id
@@ -3,7 +3,7 @@
3
3
  import dataclasses
4
4
  import shlex
5
5
  import time
6
- from typing import Any, Dict, List
6
+ from typing import Any, Dict, List, Optional
7
7
 
8
8
  from sky import sky_logging
9
9
  from sky.skylet import constants
@@ -15,10 +15,14 @@ logger = sky_logging.init_logger(__name__)
15
15
  # Configure environment variables. A docker image can have environment variables
16
16
  # set in the Dockerfile with `ENV`. We need to export these variables to the
17
17
  # shell environment, so that our ssh session can access them.
18
+ # Filter out RAY_RUNTIME_ENV_HOOK to prevent Ray version conflicts.
19
+ # Docker images with Ray 2.48.0+ set this for UV package manager support,
20
+ # but it causes FAILED_DRIVER errors with SkyPilot's Ray 2.9.3.
21
+ # See: https://github.com/skypilot-org/skypilot/pull/7181
18
22
  SETUP_ENV_VARS_CMD = (
19
23
  'prefix_cmd() '
20
24
  '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
21
- 'export -p > ~/container_env_var.sh && '
25
+ 'export -p | grep -v RAY_RUNTIME_ENV_HOOK > ~/container_env_var.sh && '
22
26
  '$(prefix_cmd) '
23
27
  'mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;')
24
28
 
@@ -32,6 +36,30 @@ DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')
32
36
 
33
37
  _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30
34
38
 
39
# Install AWS CLI v2 (not v1 from pip) as it's required for ECR authentication.
# AWS CLI v2 is installed as a standalone binary, not a Python package. See:
# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
# The installer archive is architecture specific. AWS publishes
# `awscli-exe-linux-x86_64.zip` and `awscli-exe-linux-aarch64.zip`, whose
# suffixes match the output of `uname -m` on those hosts, so the archive name
# is resolved on the host instead of being hard-coded to x86_64.
# The command is a no-op when `aws` is already on PATH; `unzip` is installed
# through apt-get only when it is missing.
INSTALL_AWS_CLI_CMD = (
    'which aws || ((command -v unzip >/dev/null 2>&1 || '
    '(sudo apt-get update && sudo apt-get install -y unzip)) && '
    'curl -fsSL '
    '"https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" '
    '-o "/tmp/awscliv2.zip" && '
    'unzip -q /tmp/awscliv2.zip -d /tmp && sudo /tmp/aws/install '
    '&& rm -rf /tmp/awscliv2.zip /tmp/aws)')
49
+
50
+
51
def _extract_region_from_ecr_server(server: str) -> str:
    """Extract AWS region from ECR server URL.

    ECR server format: <account-id>.dkr.ecr.<region>.amazonaws.com

    Args:
        server: ECR registry hostname.

    Returns:
        The region label of the hostname, e.g. ``us-east-1``.

    Raises:
        ValueError: If the hostname does not follow the ECR format, or if
            the region label is empty or contains anything other than
            lowercase ASCII letters, digits and hyphens.
    """
    region_alphabet = set('abcdefghijklmnopqrstuvwxyz0123456789-')
    # Split: ['<account-id>', 'dkr', 'ecr', '<region>', 'amazonaws', 'com']
    parts = server.split('.')
    if len(parts) >= 6 and parts[1] == 'dkr' and parts[2] == 'ecr':
        region = parts[3]
        # The ECR login path places the region on a shell command line, so
        # only a plain region label may be returned.
        if region and set(region) <= region_alphabet:
            return region
    raise ValueError(f'Invalid ECR server format: {server}')
62
+
35
63
 
36
64
  @dataclasses.dataclass
37
65
  class DockerLoginConfig:
@@ -83,6 +111,21 @@ def check_docker_image(cname, docker_cmd):
83
111
  return _check_helper(cname, '.Config.Image', docker_cmd)
84
112
 
85
113
 
114
def maybe_remove_container_cmds(container_name, docker_cmd):
    """Return a shell command that removes the container if it exists.

    The command force-removes ``container_name``, discards stderr and always
    succeeds, so it is a no-op when the container does not exist.

    Args:
        container_name: Name of the container to remove. It is shell-quoted,
            which leaves plain names unchanged.
        docker_cmd: Container CLI to invoke; inserted as-is.

    Returns:
        A single shell command string.
    """
    return (f'{docker_cmd} rm -f {shlex.quote(container_name)} '
            '2>/dev/null || true')
127
+
128
+
86
129
  def docker_start_cmds(
87
130
  image,
88
131
  container_name,
@@ -149,12 +192,16 @@ class DockerInitializer:
149
192
  self.docker_cmd = 'podman' if use_podman else 'docker'
150
193
  self.log_path = log_path
151
194
 
152
- def _run(self,
153
- cmd,
154
- run_env='host',
155
- wait_for_docker_daemon: bool = False,
156
- separate_stderr: bool = False,
157
- log_err_when_fail: bool = True) -> str:
195
+ def _run(
196
+ self,
197
+ cmd,
198
+ run_env='host',
199
+ wait_for_docker_daemon: bool = False,
200
+ separate_stderr: bool = False,
201
+ log_err_when_fail: bool = True,
202
+ flock_name: Optional[str] = None,
203
+ flock_args: Optional[str] = None,
204
+ ) -> str:
158
205
 
159
206
  if run_env == 'docker':
160
207
  cmd = self._docker_expand_user(cmd, any_char=True)
@@ -163,8 +210,13 @@ class DockerInitializer:
163
210
  # an error: `the input device is not a TTY`, and it works without
164
211
  # `-it` flag.
165
212
  # TODO(zhwu): ray use the `-it` flag, we need to check why.
166
- cmd = (f'{self.docker_cmd} exec {self.container_name} /bin/bash -c'
167
- f' {shlex.quote(cmd)} ')
213
+ cmd = (f'{self.docker_cmd} exec -u 0 {self.container_name}'
214
+ f' /bin/bash -c {shlex.quote(cmd)} ')
215
+
216
+ if flock_name is not None:
217
+ flock_args = flock_args or ''
218
+ cmd = (f'flock {flock_args} /tmp/{flock_name} '
219
+ f'-c {shlex.quote(cmd)}')
168
220
 
169
221
  logger.debug(f'+ {cmd}')
170
222
  start = time.time()
@@ -216,14 +268,17 @@ class DockerInitializer:
216
268
  if self._check_container_exited():
217
269
  self.initialized = True
218
270
  self._run(f'{self.docker_cmd} start {self.container_name}')
219
- self._run('sudo service ssh start', run_env='docker')
271
+ self._run('sudo service ssh start',
272
+ run_env='docker',
273
+ flock_name=f'{self.container_name}.sky.lifecycle.lock',
274
+ flock_args='-s -w 1')
220
275
  return self._run('whoami', run_env='docker')
221
276
 
222
277
  # SkyPilot: Docker login if user specified a private docker registry.
223
278
  if 'docker_login_config' in self.docker_config:
224
- # TODO(tian): Maybe support a command to get the login password?
225
279
  docker_login_config = DockerLoginConfig(
226
280
  **self.docker_config['docker_login_config'])
281
+
227
282
  if docker_login_config.password:
228
283
  # Password is allowed to be empty, in that case, we will not run
229
284
  # the login command, and assume that the image pulling is
@@ -234,6 +289,25 @@ class DockerInitializer:
234
289
  f'--password {shlex.quote(docker_login_config.password)} '
235
290
  f'{shlex.quote(docker_login_config.server)}',
236
291
  wait_for_docker_daemon=True)
292
+ elif (docker_login_config.server.endswith('.amazonaws.com') and
293
+ '.dkr.ecr.' in docker_login_config.server):
294
+ # AWS ECR: Use aws ecr get-login-password for authentication
295
+ # ECR format: <account-id>.dkr.ecr.<region>.amazonaws.com
296
+ # This command uses the IAM credentials from the EC2 instance
297
+ # Ref: https://docs.aws.amazon.com/AmazonECR/latest/userguide/registry_auth.html # pylint: disable=line-too-long
298
+ region = _extract_region_from_ecr_server(
299
+ docker_login_config.server)
300
+
301
+ # AWS CLI is not pre-installed on AWS instances, unlike gcloud
302
+ # on GCP instances, so we need to install it first
303
+ self._run(INSTALL_AWS_CLI_CMD, wait_for_docker_daemon=False)
304
+
305
+ self._run(
306
+ f'aws ecr get-login-password --region {region} | '
307
+ f'{self.docker_cmd} login --username AWS '
308
+ f'--password-stdin '
309
+ f'{shlex.quote(docker_login_config.server)}',
310
+ wait_for_docker_daemon=True)
237
311
  elif docker_login_config.server.endswith('-docker.pkg.dev'):
238
312
  # Docker image server is on GCR, we need to do additional setup
239
313
  # to pull the image.
@@ -285,6 +359,10 @@ class DockerInitializer:
285
359
  'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
286
360
  'sudo systemctl restart docker; } || true')
287
361
  user_docker_run_options = self.docker_config.get('run_options', [])
362
+ remove_container_cmd = maybe_remove_container_cmds(
363
+ self.container_name,
364
+ self.docker_cmd,
365
+ )
288
366
  start_command = docker_start_cmds(
289
367
  specific_image,
290
368
  self.container_name,
@@ -292,7 +370,9 @@ class DockerInitializer:
292
370
  self._auto_configure_shm(user_docker_run_options)),
293
371
  self.docker_cmd,
294
372
  )
295
- self._run(start_command)
373
+ self._run(f'{remove_container_cmd} && {start_command}',
374
+ flock_name=f'{self.container_name}.sky.lifecycle.lock',
375
+ flock_args='-x -w 10')
296
376
 
297
377
  # SkyPilot: Setup Commands.
298
378
  # TODO(zhwu): the following setups should be aligned with the kubernetes
@@ -310,14 +390,18 @@ class DockerInitializer:
310
390
  'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;',
311
391
  run_env='docker')
312
392
  # Install dependencies.
313
- self._run(
314
- 'sudo apt-get update; '
393
+ cmd = (
394
+ 'bash -lc \''
395
+ 'exec 200>/var/tmp/sky_apt.lock; '
396
+ 'flock -x -w 120 200 || exit 1; '
397
+ 'export DEBIAN_FRONTEND=noninteractive; '
398
+ 'apt-get -yq update && '
315
399
  # Our mount script will install gcsfuse without fuse package.
316
400
  # We need to install fuse package first to enable storage mount.
317
401
  # The dpkg option is to suppress the prompt for fuse installation.
318
- 'sudo apt-get -o DPkg::Options::="--force-confnew" install -y '
319
- 'rsync curl wget patch openssh-server python3-pip fuse;',
320
- run_env='docker')
402
+ 'apt-get -o DPkg::Options::=--force-confnew install -y '
403
+ 'rsync curl wget patch openssh-server python3-pip fuse\'')
404
+ self._run(cmd, run_env='docker')
321
405
 
322
406
  # Copy local authorized_keys to docker container.
323
407
  # Stop and disable jupyter service. This is to avoid port conflict on
@@ -343,13 +427,16 @@ class DockerInitializer:
343
427
  # `mesg: ttyname failed: inappropriate ioctl for device`.
344
428
  # see https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
345
429
  port = constants.DEFAULT_DOCKER_PORT
430
+ # In case the port is already configured in the sshd_config file
431
+ # in some images, we delete it first and then append the new one.
346
432
  # pylint: disable=anomalous-backslash-in-string
347
433
  self._run(
348
- f'sudo sed -i "s/#Port 22/Port {port}/" /etc/ssh/sshd_config;'
434
+ 'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
435
+ f'echo "Port {port}" | sudo tee -a /etc/ssh/sshd_config > /dev/null;'
349
436
  'mkdir -p ~/.ssh;'
350
437
  'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
351
438
  'sudo service ssh start;'
352
- 'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;'
439
+ 'sudo sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;'
353
440
  f'{SETUP_ENV_VARS_CMD}',
354
441
  run_env='docker')
355
442
 
@@ -390,9 +477,13 @@ class DockerInitializer:
390
477
  user_pos = string.find('~')
391
478
  if user_pos > -1:
392
479
  if self.home_dir is None:
393
- cmd = (f'{self.docker_cmd} exec {self.container_name} '
394
- 'printenv HOME')
395
- self.home_dir = self._run(cmd, separate_stderr=True)
480
+ cmd = (f'{self.docker_cmd} exec {self.container_name}'
481
+ ' printenv HOME')
482
+ self.home_dir = self._run(
483
+ cmd,
484
+ separate_stderr=True,
485
+ flock_name=f'{self.container_name}.sky.lifecycle.lock',
486
+ flock_args='-s -w 1')
396
487
  # Check for unexpected newline in home directory, which can be
397
488
  # a common issue when the output is mixed with stderr.
398
489
  assert '\n' not in self.home_dir, (