skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -305,7 +305,10 @@ def _get_route_tables(ec2: 'mypy_boto3_ec2.ServiceResource',
305
305
  Returns:
306
306
  A list of route tables associated with the options VPC and region
307
307
  """
308
- filters = [{'Name': 'association.main', 'Values': [str(main).lower()]}]
308
+ filters: List['ec2_type_defs.FilterTypeDef'] = [{
309
+ 'Name': 'association.main',
310
+ 'Values': [str(main).lower()],
311
+ }]
309
312
  if vpc_id is not None:
310
313
  filters.append({'Name': 'vpc-id', 'Values': [vpc_id]})
311
314
  logger.debug(
@@ -406,10 +409,26 @@ def _usable_subnets(
406
409
  s for s in candidate_subnets if s.vpc_id == vpc_id_of_sg
407
410
  ]
408
411
 
412
+ if not candidate_subnets:
413
+ _skypilot_log_error_and_exit_for_failover(
414
+ 'No candidate subnets found in specified VPC '
415
+ f'{vpc_id_of_sg}.')
416
+
409
417
  available_subnets = [
410
418
  s for s in candidate_subnets if s.state == 'available'
411
419
  ]
412
420
 
421
+ if not available_subnets:
422
+ _skypilot_log_error_and_exit_for_failover(
423
+ 'All candidate subnets are pending in specified VPC '
424
+ f'{vpc_id_of_sg}.')
425
+
426
+ if len(candidate_subnets) > len(available_subnets):
427
+ num_pruned = len(candidate_subnets) - len(available_subnets)
428
+ logger.debug(
429
+ f'{num_pruned} candidate subnets pruned since they are not '
430
+ 'available.')
431
+
413
432
  if use_internal_ips:
414
433
  # Get private subnets.
415
434
  #
@@ -421,6 +440,10 @@ def _usable_subnets(
421
440
  if not _is_subnet_public(ec2, s.subnet_id, vpc_id_of_sg) and
422
441
  not s.map_public_ip_on_launch
423
442
  ]
443
+ if not subnets:
444
+ _skypilot_log_error_and_exit_for_failover(
445
+ 'The use_internal_ips option is set to True, but all '
446
+ 'candidate subnets are public.')
424
447
  else:
425
448
  # Get public subnets.
426
449
  #
@@ -436,6 +459,10 @@ def _usable_subnets(
436
459
  s for s in available_subnets
437
460
  if _is_subnet_public(ec2, s.subnet_id, vpc_id_of_sg)
438
461
  ]
462
+ if not subnets:
463
+ _skypilot_log_error_and_exit_for_failover(
464
+ 'All candidate subnets are private, did you mean to '
465
+ 'set use_internal_ips to True?')
439
466
 
440
467
  subnets = sorted(
441
468
  subnets,
@@ -449,18 +476,7 @@ def _usable_subnets(
449
476
  'Failed to fetch available subnets from AWS.')
450
477
  raise exc
451
478
 
452
- if not subnets:
453
- vpc_msg = (f'Does a default VPC exist in region '
454
- f'{ec2.meta.client.meta.region_name}? ') if (
455
- vpc_id_of_sg is None) else ''
456
- _skypilot_log_error_and_exit_for_failover(
457
- f'No usable subnets found. {vpc_msg}'
458
- 'Try manually creating an instance in your specified region to '
459
- 'populate the list of subnets and try again. '
460
- 'Note that the subnet must map public IPs '
461
- 'on instance launch unless you set `use_internal_ips: true` in '
462
- 'the `provider` config.')
463
- elif _are_user_subnets_pruned(subnets):
479
+ if _are_user_subnets_pruned(subnets):
464
480
  _skypilot_log_error_and_exit_for_failover(
465
481
  f'The specified subnets are not '
466
482
  f'usable: {_get_pruned_subnets(subnets)}')
@@ -579,6 +595,11 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
579
595
  # not want SkyPilot to use.
580
596
  if vpc_id_of_sg is None:
581
597
  all_subnets = [s for s in all_subnets if s.vpc.is_default]
598
+ if not all_subnets:
599
+ _skypilot_log_error_and_exit_for_failover(
600
+ f'The default VPC in {region} either does not exist or '
601
+ 'has no subnets.')
602
+
582
603
  subnets, vpc_id = _usable_subnets(
583
604
  ec2,
584
605
  user_specified_subnets=None,
@@ -311,9 +311,10 @@ def _get_head_instance_id(instances: List) -> Optional[str]:
311
311
  return head_instance_id
312
312
 
313
313
 
314
- def run_instances(region: str, cluster_name_on_cloud: str,
314
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
315
315
  config: common.ProvisionConfig) -> common.ProvisionRecord:
316
316
  """See sky/provision/__init__.py"""
317
+ del cluster_name # unused
317
318
  ec2 = _default_ec2_resource(region)
318
319
  # NOTE: We set max_attempts=0 for fast failing when the resource is not
319
320
  # available (although the doc says it will only retry for network
@@ -629,9 +630,10 @@ def query_instances(
629
630
  cluster_name_on_cloud: str,
630
631
  provider_config: Optional[Dict[str, Any]] = None,
631
632
  non_terminated_only: bool = True,
633
+ retry_if_missing: bool = False,
632
634
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
633
635
  """See sky/provision/__init__.py"""
634
- del cluster_name # unused
636
+ del cluster_name, retry_if_missing # unused
635
637
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
636
638
  region = provider_config['region']
637
639
  ec2 = _default_ec2_resource(region)
@@ -743,6 +745,7 @@ def terminate_instances(
743
745
 
744
746
  # Make this multithreaded: modify all instances' SGs in parallel.
745
747
  def modify_instance_sg(instance):
748
+ assert default_sg is not None # Type narrowing for mypy
746
749
  instance.modify_attribute(Groups=[default_sg.id])
747
750
  logger.debug(f'Instance {instance.id} modified to use default SG:'
748
751
  f'{default_sg.id} for quick deletion.')
@@ -214,7 +214,7 @@ def _create_network_interface(
214
214
  location=provider_config['location'],
215
215
  public_ip_allocation_method='Static',
216
216
  public_ip_address_version='IPv4',
217
- sku=network.PublicIPAddressSku(name='Basic', tier='Regional'))
217
+ sku=network.PublicIPAddressSku(name='Standard', tier='Regional'))
218
218
  ip_poller = network_client.public_ip_addresses.begin_create_or_update(
219
219
  resource_group_name=provider_config['resource_group'],
220
220
  public_ip_address_name=f'{vm_name}-ip',
@@ -362,9 +362,10 @@ def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
362
362
  return instances
363
363
 
364
364
 
365
- def run_instances(region: str, cluster_name_on_cloud: str,
365
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
366
366
  config: common.ProvisionConfig) -> common.ProvisionRecord:
367
367
  """See sky/provision/__init__.py"""
368
+ del cluster_name # unused
368
369
  # TODO(zhwu): This function is too long. We should refactor it.
369
370
  provider_config = config.provider_config
370
371
  resource_group = provider_config['resource_group']
@@ -956,9 +957,10 @@ def query_instances(
956
957
  cluster_name_on_cloud: str,
957
958
  provider_config: Optional[Dict[str, Any]] = None,
958
959
  non_terminated_only: bool = True,
960
+ retry_if_missing: bool = False,
959
961
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
960
962
  """See sky/provision/__init__.py"""
961
- del cluster_name # unused
963
+ del cluster_name, retry_if_missing # unused
962
964
  assert provider_config is not None, cluster_name_on_cloud
963
965
 
964
966
  subscription_id = provider_config['subscription_id']
sky/provision/common.py CHANGED
@@ -6,6 +6,7 @@ import os
6
6
  from typing import Any, Dict, List, Optional, Tuple
7
7
 
8
8
  from sky import sky_logging
9
+ from sky.utils import config_utils
9
10
  from sky.utils import env_options
10
11
  from sky.utils import resources_utils
11
12
 
@@ -36,6 +37,13 @@ class StopFailoverError(Exception):
36
37
  """
37
38
 
38
39
 
40
+ # These fields are sensitive and should be redacted from the config for logging
41
+ # purposes.
42
+ SENSITIVE_FIELDS = [
43
+ ('docker_config', 'docker_login_config', 'password'),
44
+ ]
45
+
46
+
39
47
  @dataclasses.dataclass
40
48
  class ProvisionConfig:
41
49
  """Configuration for provisioning."""
@@ -56,6 +64,18 @@ class ProvisionConfig:
56
64
  # Optional ports to open on launch of the cluster.
57
65
  ports_to_open_on_launch: Optional[List[int]]
58
66
 
67
+ def get_redacted_config(self) -> Dict[str, Any]:
68
+ """Get the redacted config."""
69
+ config = dataclasses.asdict(self)
70
+
71
+ config_copy = config_utils.Config(config)
72
+
73
+ for field_list in SENSITIVE_FIELDS:
74
+ val = config_copy.get_nested(field_list, default_value=None)
75
+ if val is not None:
76
+ config_copy.set_nested(field_list, '<redacted>')
77
+ return dict(**config_copy)
78
+
59
79
 
60
80
  # -------------------- output data model -------------------- #
61
81
 
@@ -97,6 +117,8 @@ class InstanceInfo:
97
117
  external_ip: Optional[str]
98
118
  tags: Dict[str, str]
99
119
  ssh_port: int = 22
120
+ # The internal service address of the instance on Kubernetes.
121
+ internal_svc: Optional[str] = None
100
122
 
101
123
  def get_feasible_ip(self) -> str:
102
124
  """Get the most feasible IPs of the instance. This function returns
@@ -40,10 +40,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
40
40
  return head_instance_id
41
41
 
42
42
 
43
- def run_instances(region: str, cluster_name_on_cloud: str,
43
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
44
44
  config: common.ProvisionConfig) -> common.ProvisionRecord:
45
45
  """Runs instances for the given cluster."""
46
-
46
+ del cluster_name # unused
47
47
  pending_status = ['pend', 'init', 'prol', 'boot']
48
48
 
49
49
  while True:
@@ -195,9 +195,10 @@ def query_instances(
195
195
  cluster_name_on_cloud: str,
196
196
  provider_config: Optional[Dict[str, Any]] = None,
197
197
  non_terminated_only: bool = True,
198
+ retry_if_missing: bool = False,
198
199
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
199
200
  """See sky/provision/__init__.py"""
200
- del cluster_name # unused
201
+ del cluster_name, retry_if_missing # unused
201
202
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
202
203
  instances = _filter_instances(cluster_name_on_cloud, None)
203
204
 
@@ -26,10 +26,10 @@ def _get_head_instance(
26
26
  return None
27
27
 
28
28
 
29
- def run_instances(region: str, cluster_name_on_cloud: str,
29
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
30
30
  config: common.ProvisionConfig) -> common.ProvisionRecord:
31
31
  """Runs instances for the given cluster."""
32
-
32
+ del cluster_name # unused
33
33
  pending_status = ['new']
34
34
  newly_started_instances = utils.filter_instances(cluster_name_on_cloud,
35
35
  pending_status + ['off'])
@@ -246,9 +246,10 @@ def query_instances(
246
246
  cluster_name_on_cloud: str,
247
247
  provider_config: Optional[Dict[str, Any]] = None,
248
248
  non_terminated_only: bool = True,
249
+ retry_if_missing: bool = False,
249
250
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
250
251
  """See sky/provision/__init__.py"""
251
- del cluster_name # unused
252
+ del cluster_name, retry_if_missing # unused
252
253
  # terminated instances are not retrieved by the
253
254
  # API making `non_terminated_only` argument moot.
254
255
  del non_terminated_only
@@ -3,7 +3,7 @@
3
3
  import dataclasses
4
4
  import shlex
5
5
  import time
6
- from typing import Any, Dict, List
6
+ from typing import Any, Dict, List, Optional
7
7
 
8
8
  from sky import sky_logging
9
9
  from sky.skylet import constants
@@ -15,23 +15,52 @@ logger = sky_logging.init_logger(__name__)
15
15
  # Configure environment variables. A docker image can have environment variables
16
16
  # set in the Dockerfile with `ENV`. We need to export these variables to the
17
17
  # shell environment, so that our ssh session can access them.
18
+ # Filter out RAY_RUNTIME_ENV_HOOK to prevent Ray version conflicts.
19
+ # Docker images with Ray 2.48.0+ set this for UV package manager support,
20
+ # but it causes FAILED_DRIVER errors with SkyPilot's Ray 2.9.3.
21
+ # See: https://github.com/skypilot-org/skypilot/pull/7181
18
22
  SETUP_ENV_VARS_CMD = (
19
23
  'prefix_cmd() '
20
24
  '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
21
- 'export -p > ~/container_env_var.sh && '
25
+ 'export -p | grep -v RAY_RUNTIME_ENV_HOOK > ~/container_env_var.sh && '
22
26
  '$(prefix_cmd) '
23
27
  'mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;')
24
28
 
25
29
  # Docker daemon may not be ready when the machine is first started. The error
26
30
  # message starts with the following string. We should wait for a while and retry
27
31
  # the command.
28
- DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
29
- 'the Docker daemon socket')
32
+ DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to ')
30
33
 
31
34
  DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')
35
+ DOCKER_SOCKET_NOT_READY_STR_2 = (
36
+ 'check if the path is correct and if the daemon is running')
32
37
 
33
38
  _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30
34
39
 
40
+ # Install AWS CLI v2 (not v1 from pip) as it's required for ECR authentication.
41
+ # AWS CLI v2 is installed as a standalone binary, not a Python package. See:
42
+ # https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
43
+ INSTALL_AWS_CLI_CMD = (
44
+ 'which aws || ((command -v unzip >/dev/null 2>&1 || '
45
+ '(sudo apt-get update && sudo apt-get install -y unzip)) && '
46
+ 'curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" '
47
+ '-o "/tmp/awscliv2.zip" && '
48
+ 'unzip -q /tmp/awscliv2.zip -d /tmp && sudo /tmp/aws/install '
49
+ '&& rm -rf /tmp/awscliv2.zip /tmp/aws)')
50
+
51
+
52
+ def _extract_region_from_ecr_server(server: str) -> str:
53
+ """Extract AWS region from ECR server URL.
54
+
55
+ ECR server format: <account-id>.dkr.ecr.<region>.amazonaws.com
56
+ Returns the region part from the URL.
57
+ """
58
+ # Split: ['<account-id>', 'dkr', 'ecr', '<region>', 'amazonaws', 'com']
59
+ parts = server.split('.')
60
+ if len(parts) >= 6 and parts[1] == 'dkr' and parts[2] == 'ecr':
61
+ return parts[3]
62
+ raise ValueError(f'Invalid ECR server format: {server}')
63
+
35
64
 
36
65
  @dataclasses.dataclass
37
66
  class DockerLoginConfig:
@@ -147,6 +176,17 @@ def _with_interactive(cmd):
147
176
  return ['bash', '--login', '-c', '-i', shlex.quote(force_interactive)]
148
177
 
149
178
 
179
+ def _redact_docker_password(cmd: str) -> str:
180
+ parts = shlex.split(cmd)
181
+ for i, part in enumerate(parts):
182
+ if part.startswith('--password'):
183
+ if part.startswith('--password='):
184
+ parts[i] = '--password=<redacted>'
185
+ elif i + 1 < len(parts):
186
+ parts[i + 1] = '<redacted>'
187
+ return ' '.join(parts)
188
+
189
+
150
190
  # SkyPilot: New class to initialize docker containers on a remote node.
151
191
  # Adopted from ray.autoscaler._private.command_runner.DockerCommandRunner.
152
192
  class DockerInitializer:
@@ -157,19 +197,23 @@ class DockerInitializer:
157
197
  self.docker_config = docker_config
158
198
  self.container_name = docker_config['container_name']
159
199
  self.runner = runner
160
- self.home_dir = None
200
+ self.home_dir: Optional[str] = None
161
201
  self.initialized = False
162
202
  # podman is not fully tested yet.
163
203
  use_podman = docker_config.get('use_podman', False)
164
204
  self.docker_cmd = 'podman' if use_podman else 'docker'
165
205
  self.log_path = log_path
166
206
 
167
- def _run(self,
168
- cmd,
169
- run_env='host',
170
- wait_for_docker_daemon: bool = False,
171
- separate_stderr: bool = False,
172
- log_err_when_fail: bool = True) -> str:
207
+ def _run(
208
+ self,
209
+ cmd,
210
+ run_env='host',
211
+ wait_for_docker_daemon: bool = False,
212
+ separate_stderr: bool = False,
213
+ log_err_when_fail: bool = True,
214
+ flock_name: Optional[str] = None,
215
+ flock_args: Optional[str] = None,
216
+ ) -> str:
173
217
 
174
218
  if run_env == 'docker':
175
219
  cmd = self._docker_expand_user(cmd, any_char=True)
@@ -178,10 +222,17 @@ class DockerInitializer:
178
222
  # an error: `the input device is not a TTY`, and it works without
179
223
  # `-it` flag.
180
224
  # TODO(zhwu): ray use the `-it` flag, we need to check why.
181
- cmd = (f'{self.docker_cmd} exec {self.container_name} /bin/bash -c'
182
- f' {shlex.quote(cmd)} ')
225
+ cmd = (f'{self.docker_cmd} exec -u 0 {self.container_name}'
226
+ f' /bin/bash -c {shlex.quote(cmd)} ')
227
+
228
+ if flock_name is not None:
229
+ flock_args = flock_args or ''
230
+ cmd = (f'flock {flock_args} /tmp/{flock_name} '
231
+ f'-c {shlex.quote(cmd)}')
183
232
 
184
- logger.debug(f'+ {cmd}')
233
+ # Redact the password in the login command.
234
+ redacted_cmd = _redact_docker_password(cmd)
235
+ logger.debug(f'+ {redacted_cmd}')
185
236
  start = time.time()
186
237
  while True:
187
238
  rc, stdout, stderr = self.runner.run(
@@ -191,7 +242,8 @@ class DockerInitializer:
191
242
  separate_stderr=separate_stderr,
192
243
  log_path=self.log_path)
193
244
  if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr or
194
- DOCKER_SOCKET_NOT_READY_STR in stdout + stderr):
245
+ DOCKER_SOCKET_NOT_READY_STR in stdout + stderr or
246
+ DOCKER_SOCKET_NOT_READY_STR_2 in stdout + stderr):
195
247
  if wait_for_docker_daemon:
196
248
  if time.time(
197
249
  ) - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
@@ -212,7 +264,7 @@ class DockerInitializer:
212
264
  break
213
265
  subprocess_utils.handle_returncode(
214
266
  rc,
215
- cmd,
267
+ redacted_cmd,
216
268
  error_msg='Failed to run docker setup commands.',
217
269
  stderr=stdout + stderr,
218
270
  # Print out the error message if the command failed.
@@ -231,14 +283,17 @@ class DockerInitializer:
231
283
  if self._check_container_exited():
232
284
  self.initialized = True
233
285
  self._run(f'{self.docker_cmd} start {self.container_name}')
234
- self._run('sudo service ssh start', run_env='docker')
286
+ self._run('sudo service ssh start',
287
+ run_env='docker',
288
+ flock_name=f'{self.container_name}.sky.lifecycle.lock',
289
+ flock_args='-s -w 1')
235
290
  return self._run('whoami', run_env='docker')
236
291
 
237
292
  # SkyPilot: Docker login if user specified a private docker registry.
238
293
  if 'docker_login_config' in self.docker_config:
239
- # TODO(tian): Maybe support a command to get the login password?
240
294
  docker_login_config = DockerLoginConfig(
241
295
  **self.docker_config['docker_login_config'])
296
+
242
297
  if docker_login_config.password:
243
298
  # Password is allowed to be empty, in that case, we will not run
244
299
  # the login command, and assume that the image pulling is
@@ -249,6 +304,25 @@ class DockerInitializer:
249
304
  f'--password {shlex.quote(docker_login_config.password)} '
250
305
  f'{shlex.quote(docker_login_config.server)}',
251
306
  wait_for_docker_daemon=True)
307
+ elif (docker_login_config.server.endswith('.amazonaws.com') and
308
+ '.dkr.ecr.' in docker_login_config.server):
309
+ # AWS ECR: Use aws ecr get-login-password for authentication
310
+ # ECR format: <account-id>.dkr.ecr.<region>.amazonaws.com
311
+ # This command uses the IAM credentials from the EC2 instance
312
+ # Ref: https://docs.aws.amazon.com/AmazonECR/latest/userguide/registry_auth.html # pylint: disable=line-too-long
313
+ region = _extract_region_from_ecr_server(
314
+ docker_login_config.server)
315
+
316
+ # AWS CLI is not pre-installed on AWS instances, unlike gcloud
317
+ # on GCP instances, so we need to install it first
318
+ self._run(INSTALL_AWS_CLI_CMD, wait_for_docker_daemon=False)
319
+
320
+ self._run(
321
+ f'aws ecr get-login-password --region {region} | '
322
+ f'{self.docker_cmd} login --username AWS '
323
+ f'--password-stdin '
324
+ f'{shlex.quote(docker_login_config.server)}',
325
+ wait_for_docker_daemon=True)
252
326
  elif docker_login_config.server.endswith('-docker.pkg.dev'):
253
327
  # Docker image server is on GCR, we need to do additional setup
254
328
  # to pull the image.
@@ -311,7 +385,9 @@ class DockerInitializer:
311
385
  self._auto_configure_shm(user_docker_run_options)),
312
386
  self.docker_cmd,
313
387
  )
314
- self._run(f'{remove_container_cmd}; {start_command}')
388
+ self._run(f'{remove_container_cmd} && {start_command}',
389
+ flock_name=f'{self.container_name}.sky.lifecycle.lock',
390
+ flock_args='-x -w 10')
315
391
 
316
392
  # SkyPilot: Setup Commands.
317
393
  # TODO(zhwu): the following setups should be aligned with the kubernetes
@@ -329,14 +405,18 @@ class DockerInitializer:
329
405
  'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;',
330
406
  run_env='docker')
331
407
  # Install dependencies.
332
- self._run(
333
- 'sudo apt-get update; '
408
+ cmd = (
409
+ 'bash -lc \''
410
+ 'exec 200>/var/tmp/sky_apt.lock; '
411
+ 'flock -x -w 120 200 || exit 1; '
412
+ 'export DEBIAN_FRONTEND=noninteractive; '
413
+ 'apt-get -yq update && '
334
414
  # Our mount script will install gcsfuse without fuse package.
335
415
  # We need to install fuse package first to enable storage mount.
336
416
  # The dpkg option is to suppress the prompt for fuse installation.
337
- 'sudo apt-get -o DPkg::Options::="--force-confnew" install -y '
338
- 'rsync curl wget patch openssh-server python3-pip fuse;',
339
- run_env='docker')
417
+ 'apt-get -o DPkg::Options::=--force-confnew install -y '
418
+ 'rsync curl wget patch openssh-server python3-pip fuse\'')
419
+ self._run(cmd, run_env='docker')
340
420
 
341
421
  # Copy local authorized_keys to docker container.
342
422
  # Stop and disable jupyter service. This is to avoid port conflict on
@@ -367,7 +447,7 @@ class DockerInitializer:
367
447
  # pylint: disable=anomalous-backslash-in-string
368
448
  self._run(
369
449
  'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
370
- f'sudo echo "Port {port}" >> /etc/ssh/sshd_config;'
450
+ f'echo "Port {port}" | sudo tee -a /etc/ssh/sshd_config > /dev/null;'
371
451
  'mkdir -p ~/.ssh;'
372
452
  'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
373
453
  'sudo service ssh start;'
@@ -412,9 +492,13 @@ class DockerInitializer:
412
492
  user_pos = string.find('~')
413
493
  if user_pos > -1:
414
494
  if self.home_dir is None:
415
- cmd = (f'{self.docker_cmd} exec {self.container_name} '
416
- 'printenv HOME')
417
- self.home_dir = self._run(cmd, separate_stderr=True)
495
+ cmd = (f'{self.docker_cmd} exec {self.container_name}'
496
+ ' printenv HOME')
497
+ self.home_dir = self._run(
498
+ cmd,
499
+ separate_stderr=True,
500
+ flock_name=f'{self.container_name}.sky.lifecycle.lock',
501
+ flock_args='-s -w 1')
418
502
  # Check for unexpected newline in home directory, which can be
419
503
  # a common issue when the output is mixed with stderr.
420
504
  assert '\n' not in self.home_dir, (
@@ -3,11 +3,11 @@ import os
3
3
  import time
4
4
  from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
- from sky import authentication as auth
7
6
  from sky import exceptions
8
7
  from sky import sky_logging
9
8
  from sky.provision import common
10
9
  from sky.provision.fluidstack import fluidstack_utils as utils
10
+ from sky.utils import auth_utils
11
11
  from sky.utils import command_runner
12
12
  from sky.utils import common_utils
13
13
  from sky.utils import status_lib
@@ -27,7 +27,7 @@ logger = sky_logging.init_logger(__name__)
27
27
  def get_internal_ip(node_info: Dict[str, Any]) -> None:
28
28
  node_info['internal_ip'] = node_info['ip_address']
29
29
 
30
- private_key_path, _ = auth.get_or_generate_keys()
30
+ private_key_path, _ = auth_utils.get_or_generate_keys()
31
31
  runner = command_runner.SSHCommandRunner(
32
32
  (node_info['ip_address'], 22),
33
33
  ssh_user='ubuntu',
@@ -78,10 +78,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
78
78
  return head_instance_id
79
79
 
80
80
 
81
- def run_instances(region: str, cluster_name_on_cloud: str,
81
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
82
82
  config: common.ProvisionConfig) -> common.ProvisionRecord:
83
83
  """Runs instances for the given cluster."""
84
-
84
+ del cluster_name # unused
85
85
  pending_status = ['pending', 'provisioning']
86
86
  while True:
87
87
  instances = _filter_instances(cluster_name_on_cloud, pending_status)
@@ -291,9 +291,10 @@ def query_instances(
291
291
  cluster_name_on_cloud: str,
292
292
  provider_config: Optional[Dict[str, Any]] = None,
293
293
  non_terminated_only: bool = True,
294
+ retry_if_missing: bool = False,
294
295
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
295
296
  """See sky/provision/__init__.py"""
296
- del cluster_name # unused
297
+ del cluster_name, retry_if_missing # unused
297
298
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
298
299
  instances = _filter_instances(cluster_name_on_cloud, None)
299
300
  instances = _filter_instances(cluster_name_on_cloud, None)
@@ -5,6 +5,8 @@ import time
5
5
  import typing
6
6
  from typing import Any, Dict, List, Set, Tuple
7
7
 
8
+ from typing_extensions import TypedDict
9
+
8
10
  from sky.adaptors import gcp
9
11
  from sky.clouds.utils import gcp_utils
10
12
  from sky.provision import common
@@ -415,6 +417,9 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
415
417
  return iam_role
416
418
 
417
419
 
420
+ AllowedList = TypedDict('AllowedList', {'IPProtocol': str, 'ports': List[str]})
421
+
422
+
418
423
  def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
419
424
  compute):
420
425
  """Check if the firewall rules in the VPC are sufficient."""
@@ -466,7 +471,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
466
471
  }
467
472
  """
468
473
  source2rules: Dict[Tuple[str, str], Dict[str, Set[int]]] = {}
469
- source2allowed_list: Dict[Tuple[str, str], List[Dict[str, str]]] = {}
474
+ source2allowed_list: Dict[Tuple[str, str], List[AllowedList]] = {}
470
475
  for rule in rules:
471
476
  # Rules applied to specific VM (targetTags) may not work for the
472
477
  # current VM, so should be skipped.
@@ -62,9 +62,10 @@ def query_instances(
62
62
  cluster_name_on_cloud: str,
63
63
  provider_config: Optional[Dict[str, Any]] = None,
64
64
  non_terminated_only: bool = True,
65
+ retry_if_missing: bool = False,
65
66
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
66
67
  """See sky/provision/__init__.py"""
67
- del cluster_name # unused
68
+ del cluster_name, retry_if_missing # unused
68
69
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
69
70
  zone = provider_config['availability_zone']
70
71
  project_id = provider_config['project_id']
@@ -360,9 +361,10 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
360
361
  created_instance_ids=created_instance_ids)
361
362
 
362
363
 
363
- def run_instances(region: str, cluster_name_on_cloud: str,
364
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
364
365
  config: common.ProvisionConfig) -> common.ProvisionRecord:
365
366
  """See sky/provision/__init__.py"""
367
+ del cluster_name # unused
366
368
  try:
367
369
  return _run_instances(region, cluster_name_on_cloud, config)
368
370
  except gcp.http_error_exception() as e:
@@ -64,8 +64,9 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
64
64
  return next(iter(instances.keys()))
65
65
 
66
66
 
67
- def run_instances(region: str, cluster_name_on_cloud: str,
67
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
68
68
  config: common.ProvisionConfig) -> common.ProvisionRecord:
69
+ del cluster_name # unused
69
70
  logger.info(f'Starting run_instances with region={region}, '
70
71
  f'cluster={cluster_name_on_cloud}')
71
72
  logger.debug(f'Config: {config}')
@@ -308,9 +309,10 @@ def query_instances(
308
309
  cluster_name_on_cloud: str,
309
310
  provider_config: Optional[dict] = None,
310
311
  non_terminated_only: bool = True,
312
+ retry_if_missing: bool = False,
311
313
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
312
314
  """Returns the status of the specified instances for Hyperbolic."""
313
- del cluster_name, provider_config # unused
315
+ del cluster_name, provider_config, retry_if_missing # unused
314
316
  # Fetch all instances for this cluster
315
317
  instances = utils.list_instances(
316
318
  metadata={'skypilot': {