skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/utils/locks.py CHANGED
@@ -11,15 +11,19 @@ import time
 from typing import Any, Optional
 
 import filelock
+import psycopg2
 import sqlalchemy
 
 from sky import global_user_state
-from sky.skylet import constants
+from sky.skylet import runtime_utils
 from sky.utils import common_utils
 from sky.utils.db import db_utils
 
 logger = logging.getLogger(__name__)
 
+# The directory for file locks.
+SKY_LOCKS_DIR = runtime_utils.get_runtime_dir_path('.sky/locks')
+
 
 class LockTimeout(RuntimeError):
     """Raised when a lock acquisition times out."""
@@ -126,9 +130,8 @@ class FileLock(DistributedLock):
             poll_interval: Interval in seconds to poll for lock acquisition.
         """
         super().__init__(lock_id, timeout, poll_interval)
-        os.makedirs(constants.SKY_LOCKS_DIR, exist_ok=True)
-        self.lock_path = os.path.join(constants.SKY_LOCKS_DIR,
-                                      f'.{lock_id}.lock')
+        os.makedirs(SKY_LOCKS_DIR, exist_ok=True)
+        self.lock_path = os.path.join(SKY_LOCKS_DIR, f'.{lock_id}.lock')
         if timeout is None:
             timeout = -1
         self._filelock: filelock.FileLock = filelock.FileLock(self.lock_path,
@@ -154,7 +157,7 @@ class FileLock(DistributedLock):
         common_utils.remove_file_if_exists(self.lock_path)
 
     def is_locked(self) -> bool:
-        return self._filelock.is_locked()
+        return self._filelock.is_locked
 
 
 class PostgresLock(DistributedLock):
@@ -162,15 +165,20 @@ class PostgresLock(DistributedLock):
 
     Uses PostgreSQL advisory locks to implement distributed locking
     that works across multiple machines sharing the same database.
-    Reference:
-    https://www.postgresql.org/docs/current/explicit-locking.html
-    #ADVISORY-LOCKS
+    Supports both exclusive and shared lock modes.
+
+    References:
+    # pylint: disable=line-too-long
+    - https://www.postgresql.org/docs/current/explicit-locking.html#ADVISORY-LOCKS
+    - https://www.postgresql.org/docs/current/functions-admin.html#FUNCTIONS-ADVISORY-LOCKS
+    # TODO(cooperc): re-enable pylint line-too-long
     """
 
    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
-                 poll_interval: float = 1):
+                 poll_interval: float = 1,
+                 shared_lock: bool = False):
        """Initialize the postgres lock.
 
        Args:
@@ -178,10 +186,13 @@ class PostgresLock(DistributedLock):
            timeout: Maximum time to wait for lock acquisition.
            poll_interval: Interval in seconds to poll for lock acquisition,
                default to 1 second to avoid storming the database.
+            shared_lock: Whether to use shared advisory lock or exclusive
+                advisory lock (default).
        """
        super().__init__(lock_id, timeout, poll_interval)
        # Convert string lock_id to integer for postgres advisory locks
        self._lock_key = self._string_to_lock_key(lock_id)
+        self._shared_lock = shared_lock
        self._acquired = False
        self._connection: Optional[sqlalchemy.pool.PoolProxiedConnection] = None
 
@@ -197,6 +208,7 @@ class PostgresLock(DistributedLock):
        if engine.dialect.name != db_utils.SQLAlchemyDialect.POSTGRESQL.value:
            raise ValueError('PostgresLock requires PostgreSQL database. '
                             f'Current dialect: {engine.dialect.name}')
+        # Borrow a dedicated connection from the pool.
        return engine.raw_connection()
 
    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
@@ -209,33 +221,37 @@ class PostgresLock(DistributedLock):
 
        start_time = time.time()
 
+        if self._shared_lock:
+            lock_func = 'pg_try_advisory_lock_shared'
+        else:
+            lock_func = 'pg_try_advisory_lock'
+
        try:
            while True:
-                cursor.execute('SELECT pg_try_advisory_lock(%s)',
-                               (self._lock_key,))
+                cursor.execute(f'SELECT {lock_func}(%s)', (self._lock_key,))
                result = cursor.fetchone()[0]
 
                if result:
                    self._acquired = True
                    return AcquireReturnProxy(self)
 
+                mode_str = ('shared' if self._shared_lock else 'exclusive')
                if not blocking:
                    raise LockTimeout(
-                        f'Failed to immediately acquire postgres lock '
-                        f'{self.lock_id}')
+                        f'Failed to immediately acquire {mode_str} '
+                        f'postgres lock {self.lock_id}')
 
                if (self.timeout is not None and
                        time.time() - start_time > self.timeout):
                    raise LockTimeout(
-                        f'Failed to acquire postgres lock {self.lock_id} '
-                        f'within {self.timeout} seconds')
+                        f'Failed to acquire {mode_str} postgres lock '
+                        f'{self.lock_id} within {self.timeout} '
+                        f'seconds')
 
                time.sleep(self.poll_interval)
 
        except Exception:
-            if self._connection:
-                self._connection.close()
-                self._connection = None
+            self._close_connection()
            raise
 
    def release(self) -> None:
@@ -243,32 +259,94 @@ class PostgresLock(DistributedLock):
        if not self._acquired or not self._connection:
            return
 
+        connection_lost = False
        try:
            cursor = self._connection.cursor()
-            cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
+            if self._shared_lock:
+                unlock_func = 'pg_advisory_unlock_shared'
+            else:
+                unlock_func = 'pg_advisory_unlock'
+            cursor.execute(f'SELECT {unlock_func}(%s)', (self._lock_key,))
            self._connection.commit()
            self._acquired = False
+        except psycopg2.OperationalError as e:
+            # Lost connection to the database, likely the lock is force
+            # unlocked by other routines.
+            logger.debug(f'Failed to release postgres lock {self.lock_id}: {e}')
+            connection_lost = True
        finally:
-            if self._connection:
-                self._connection.close()
-                self._connection = None
+            # Invalidate if connection was lost to prevent SQLAlchemy from
+            # trying to reset a dead connection
+            self._close_connection(invalidate=connection_lost)
 
    def force_unlock(self) -> None:
        """Force unlock the postgres advisory lock."""
        try:
-            if not self._connection:
+            # The lock is held by current routine, gracefully unlock it
+            if self._acquired:
+                self.release()
+                return
+
+            # The lock is held by another routine, force unlock it.
+            if self._connection is None:
                self._connection = self._get_connection()
            cursor = self._connection.cursor()
-            cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
-            self._connection.commit()
+            if self._shared_lock:
+                unlock_func = 'pg_advisory_unlock_shared'
+            else:
+                unlock_func = 'pg_advisory_unlock'
+
+            cursor.execute(f'SELECT {unlock_func}(%s)', (self._lock_key,))
+            result = cursor.fetchone()[0]
+            if result:
+                # The lock is held by current routine and unlock succeed
+                self._connection.commit()
+                self._acquired = False
+                return
+            cursor.execute(
+                ('SELECT pid FROM pg_locks WHERE locktype = \'advisory\' '
+                 'AND ((classid::bigint << 32) | objid::bigint) = %s'),
+                (self._lock_key,))
+            rows = cursor.fetchall()
+            if rows:
+                # There can be multiple PIDs holding the lock, it is not enough
+                # to only kill some of them. For example, if pid 1 is holding a
+                # shared lock, and pid 2 is waiting to grab an exclusive lock,
+                # killing pid 1 will transfer the lock to pid 2, so the lock
+                # will still not be released.
+                for row in rows:
+                    cursor.execute('SELECT pg_terminate_backend(%s)',
+                                   (row[0],))
+                self._connection.commit()
+            return
        except Exception as e:
            raise RuntimeError(
                f'Failed to force unlock postgres lock {self.lock_id}: {e}'
            ) from e
        finally:
-            if self._connection:
-                self._connection.close()
-                self._connection = None
+            self._close_connection()
+
+    def _close_connection(self, invalidate: bool = False) -> None:
+        """Close the postgres connection.
+
+        Args:
+            invalidate: If True, invalidate connection instead of closing it.
+                Use this when the connection might be broken (e.g., after
+                pg_terminate_backend) to prevent SQLAlchemy from trying to
+                reset it (which would result in an error being logged).
+        """
+        if self._connection:
+            try:
+                if invalidate:
+                    self._connection.invalidate()
+                else:
+                    self._connection.close()
+            except Exception as e:  # pylint: disable=broad-except
+                if invalidate:
+                    logger.debug(
+                        f'Failed to invalidate postgres connection: {e}')
+                else:
+                    logger.debug(f'Failed to close postgres connection: {e}')
+        self._connection = None
 
    def is_locked(self) -> bool:
        """Check if the postgres advisory lock is acquired."""
@@ -278,7 +356,8 @@ class PostgresLock(DistributedLock):
 def get_lock(lock_id: str,
              timeout: Optional[float] = None,
              lock_type: Optional[str] = None,
-             poll_interval: Optional[float] = None) -> DistributedLock:
+             poll_interval: Optional[float] = None,
+             shared_lock: bool = False) -> DistributedLock:
     """Create a distributed lock instance.
 
     Args:
@@ -287,6 +366,9 @@ def get_lock(lock_id: str,
             None means wait indefinitely.
         lock_type: Type of lock to create ('filelock' or 'postgres').
             If None, auto-detect based on database configuration.
+        poll_interval: Interval in seconds to poll for lock acquisition.
+        shared_lock: Whether to use shared lock or exclusive lock (default).
+            NOTE: Only applicable for PostgresLock.
 
     Returns:
         DistributedLock instance.
@@ -296,9 +378,24 @@ def get_lock(lock_id: str,
 
     if lock_type == 'postgres':
         if poll_interval is None:
-            return PostgresLock(lock_id, timeout)
-        return PostgresLock(lock_id, timeout, poll_interval)
+            return PostgresLock(lock_id, timeout, shared_lock=shared_lock)
+        return PostgresLock(lock_id,
+                            timeout,
+                            poll_interval,
+                            shared_lock=shared_lock)
     elif lock_type == 'filelock':
+        # The filelock library we use does not support shared locks.
+        # It explicitly uses fcntl.LOCK_EX on Unix systems,
+        # whereas fcntl.LOCK_SH is needed for shared locks.
+
+        # This should be fine as it should not introduce correctness issues,
+        # just that concurrency is reduced and so is performance, because
+        # read-only operations can't run at the same time, each of them need
+        # to wait to exclusively hold the lock.
+
+        # But given that we recommend users to use Postgres in production,
+        # the impact of this should be limited to local API server mostly.
+        del shared_lock
         if poll_interval is None:
             return FileLock(lock_id, timeout)
         return FileLock(lock_id, timeout, poll_interval)
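Taken together, the new surface here is the shared_lock flag on get_lock(). A usage sketch based only on the signatures visible in this diff; the lock ID and timeout are made-up values, and note that with the filelock backend the flag is dropped, so the lock silently degrades to exclusive:

```python
# Hypothetical caller of the new shared_lock flag; 'cluster-status'
# and the timeout are illustrative values, not package constants.
from sky.utils import locks

# Several readers may hold the shared postgres advisory lock at once.
reader = locks.get_lock('cluster-status', timeout=10, shared_lock=True)
reader.acquire()
try:
    pass  # read-only critical section
finally:
    reader.release()

# A writer takes the default exclusive lock and waits until every
# shared holder has released.
writer = locks.get_lock('cluster-status', timeout=10)
writer.acquire()
try:
    pass  # mutating critical section
finally:
    writer.release()
```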
sky/utils/log_utils.py CHANGED
@@ -198,325 +198,6 @@ class SkyLocalUpLineProcessor(LineProcessor):
         self.status_display.stop()
 
 
-class SkyRemoteUpLineProcessor(LineProcessor):
-    """A processor for deploy_remote_cluster.py log lines."""
-
-    def __init__(self, log_path: str, is_local: bool):
-        self.log_path = log_path
-        self.is_local = is_local
-
-    def __enter__(self) -> None:
-        # TODO(romilb): Use ux_utils.INDENT_SYMBOL to be consistent with other
-        # messages.
-        status = rich_utils.safe_status(
-            ux_utils.spinner_message('Creating remote cluster',
-                                     log_path=self.log_path,
-                                     is_local=self.is_local))
-        self.status_display = status
-        self.status_display.start()
-
-    def process_line(self, log_line: str) -> None:
-        # Pre-flight checks
-        if 'SSH connection successful' in log_line:
-            logger.info(f'{colorama.Fore.GREEN}SSH connection established.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        # Kubernetes installation steps
-        if 'Deploying Kubernetes on head node' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message(
-                    'Creating remote cluster - '
-                    'deploying Kubernetes on head node',
-                    log_path=self.log_path,
-                    is_local=self.is_local))
-        if 'K3s deployed on head node.' in log_line:
-            logger.info(f'{colorama.Fore.GREEN}'
-                        '✔ K3s successfully deployed on head node.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        # Worker nodes
-        if 'Deploying Kubernetes on worker node' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message(
-                    'Creating remote cluster - '
-                    'deploying Kubernetes on worker nodes',
-                    log_path=self.log_path,
-                    is_local=self.is_local))
-        if 'Kubernetes deployed on worker node' in log_line:
-            logger.info(f'{colorama.Fore.GREEN}'
-                        '✔ K3s successfully deployed on worker node.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        # Cluster configuration
-        if 'Configuring local kubectl to connect to the cluster...' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message(
-                    'Creating remote cluster - '
-                    'configuring local kubectl',
-                    log_path=self.log_path,
-                    is_local=self.is_local))
-        if 'kubectl configured to connect to the cluster.' in log_line:
-            logger.info(f'{colorama.Fore.GREEN}'
-                        '✔ kubectl configured for the remote cluster.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        # GPU operator installation
-        if 'Installing Nvidia GPU Operator...' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message(
-                    'Creating remote cluster - '
-                    'installing Nvidia GPU Operator',
-                    log_path=self.log_path,
-                    is_local=self.is_local))
-        if 'GPU Operator installed.' in log_line:
-            logger.info(f'{colorama.Fore.GREEN}'
-                        '✔ Nvidia GPU Operator installed successfully.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        # Cleanup steps
-        if 'Cleaning up head node' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message('Cleaning up head node',
-                                         log_path=self.log_path,
-                                         is_local=self.is_local))
-        if 'Cleaning up node' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message('Cleaning up worker node',
-                                         log_path=self.log_path,
-                                         is_local=self.is_local))
-        if 'cleaned up successfully' in log_line:
-            logger.info(f'{colorama.Fore.GREEN}'
-                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
-
-        # Final status
-        if 'Cluster deployment completed.' in log_line:
-            logger.info(f'{colorama.Fore.GREEN}✔ Remote k3s is running.'
-                        f'{colorama.Style.RESET_ALL}')
-
-    def __exit__(self, except_type: Optional[Type[BaseException]],
-                 except_value: Optional[BaseException],
-                 traceback: Optional[types.TracebackType]) -> None:
-        del except_type, except_value, traceback  # unused
-        self.status_display.stop()
-
-
-class SkySSHUpLineProcessor(LineProcessor):
-    """A processor for deploy_remote_cluster.py log lines for SSH clusters"""
-
-    def __init__(self, log_path: str, is_local: bool):
-        self.log_path = log_path
-        self.is_local = is_local
-        self.current_cluster: Optional[str] = None
-        self.is_cleanup_mode = False
-
-    def __enter__(self) -> None:
-        status = rich_utils.safe_status(
-            ux_utils.spinner_message('Preparing to set up SSH Node Pools',
-                                     log_path=self.log_path,
-                                     is_local=self.is_local))
-        self.status_display = status
-        self.status_display.start()
-
-    def process_line(self, log_line: str) -> None:
-        # Detect cleanup mode
-        if 'SKYPILOT_CLEANUP_MODE:' in log_line:
-            self.is_cleanup_mode = True
-            if self.current_cluster:
-                self.status_display.update(
-                    ux_utils.spinner_message(
-                        f'Cleaning up Node Pool: \\[{self.current_cluster}]',
-                        log_path=self.log_path,
-                        is_local=self.is_local))
-
-        # Cluster detection message
-        if 'SKYPILOT_CLUSTER_INFO:' in log_line:
-            clusters_part = log_line.split('SKYPILOT_CLUSTER_INFO:',
-                                           1)[1].strip()
-            if clusters_part.startswith('Found'):
-                logger.info(f'{colorama.Style.RESET_ALL}'
-                            f'{colorama.Fore.CYAN}{clusters_part}'
-                            f'{colorama.Style.RESET_ALL}')
-
-        # Current cluster being operated on
-        if 'SKYPILOT_CURRENT_CLUSTER:' in log_line:
-            self.current_cluster = log_line.split('SKYPILOT_CURRENT_CLUSTER:',
-                                                  1)[1].strip()
-
-            if self.is_cleanup_mode:
-                self.status_display.update(
-                    ux_utils.spinner_message(
-                        f'Cleaning up Node Pool: {self.current_cluster}',
-                        log_path=self.log_path,
-                        is_local=self.is_local))
-                logger.info(f'{colorama.Fore.CYAN}\nCleaning up Node Pool: '
-                            f'{self.current_cluster}{colorama.Style.RESET_ALL}')
-            else:
-                self.status_display.update(
-                    ux_utils.spinner_message(
-                        f'Deploying SkyPilot \\[{self.current_cluster}]',
-                        log_path=self.log_path,
-                        is_local=self.is_local))
-                logger.info(f'{colorama.Style.RESET_ALL}'
-                            f'{colorama.Fore.CYAN}\nSetting up Node Pool: '
-                            f'{self.current_cluster}{colorama.Style.RESET_ALL}')
-
-        # Handle cluster completion marker
-        if 'SKYPILOT_CLUSTER_COMPLETED:' in log_line:
-            if self.is_cleanup_mode:
-                logger.info(
-                    f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
-                    f'✔ Node Pool {self.current_cluster} cleaned up '
-                    f'successfully.{colorama.Style.RESET_ALL}')
-            else:
-                logger.info(
-                    f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
-                    f'✔ Node Pool {self.current_cluster} deployed successfully.'
-                    f'{colorama.Style.RESET_ALL}')
-
-        # Pre-flight checks
-        if 'Checking SSH connection to head node' in log_line:
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
-                        'Checking SSH connection to head node...'
-                        f'{colorama.Style.RESET_ALL}')
-
-        if log_line.startswith('SSH connection successful'):
-            node_name = log_line.split('(')[-1].split(')')[0]
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
-                        '✔ SSH connection established to head node '
-                        f'{node_name}.{colorama.Style.RESET_ALL}')
-
-        # Kubernetes installation steps
-        if 'Deploying Kubernetes on head node' in log_line:
-            current_cluster_str = f' \\[{self.current_cluster}]' if (
-                self.current_cluster) else ''
-            self.status_display.update(
-                ux_utils.spinner_message(
-                    'Deploying SkyPilot runtime on head node'
-                    f'{current_cluster_str}',
-                    log_path=self.log_path,
-                    is_local=self.is_local))
-
-        if 'K3s deployed on head node' in log_line:
-            node_name = log_line.split('(')[-1].split(')')[0]
-            logger.info(
-                f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
-                f'✔ SkyPilot runtime successfully deployed on head node '
-                f'{node_name}.{colorama.Style.RESET_ALL}')
-
-        # Worker nodes
-        if 'Deploying Kubernetes on worker node' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message(
-                    'Deploying SkyPilot runtime on worker nodes' +
-                    (f' \\[{self.current_cluster}]'
-                     if self.current_cluster else ''),
-                    log_path=self.log_path,
-                    is_local=self.is_local))
-
-        if 'Kubernetes deployed on worker node' in log_line:
-            node_name = log_line.split('(')[-1].split(')')[0]
-            logger.info(
-                f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
-                '✔ SkyPilot runtime successfully deployed on worker node '
-                f'{node_name}.{colorama.Style.RESET_ALL}')
-
-        if 'Failed to deploy K3s on worker node' in log_line:
-            node_name = log_line.split('(')[-1].split(')')[0]
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.RED}'
-                        '✗ Failed to deploy K3s on worker node '
-                        f'{node_name}.{colorama.Style.RESET_ALL}')
-
-        # Cluster configuration
-        if 'Configuring local kubectl to connect to the cluster...' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message('Setting up SkyPilot configuration' +
-                                         (f' \\[{self.current_cluster}]'
-                                          if self.current_cluster else ''),
-                                         log_path=self.log_path,
-                                         is_local=self.is_local))
-
-        if 'kubectl configured to connect to the cluster.' in log_line:
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
-                        '✔ SkyPilot configuration complete.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        # GPU operator installation
-        if 'Installing Nvidia GPU Operator...' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message('Configuring Nvidia GPUs' +
-                                         (f' \\[{self.current_cluster}]'
-                                          if self.current_cluster else ''),
-                                         log_path=self.log_path,
-                                         is_local=self.is_local))
-
-        if 'GPU Operator installed.' in log_line:
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
-                        '✔ Nvidia GPUs configured successfully.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        # Cleanup steps
-        if 'Cleaning up head node' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message('Cleaning up head node' +
-                                         (f' \\[{self.current_cluster}]'
-                                          if self.current_cluster else ''),
-                                         log_path=self.log_path,
-                                         is_local=self.is_local))
-
-        if 'Cleaning up worker node' in log_line:
-            self.status_display.update(
-                ux_utils.spinner_message('Cleaning up worker nodes' +
-                                         (f' \\[{self.current_cluster}]'
-                                          if self.current_cluster else ''),
-                                         log_path=self.log_path,
-                                         is_local=self.is_local))
-
-        # Handle node cleanup success messages
-        if 'Node' in log_line and 'cleaned up successfully' in log_line:
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
-                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
-
-        if 'Node' in log_line and 'Failed to clean up' in log_line:
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.RED}'
-                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
-
-        if 'Failed to clean up worker node' in log_line:
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.RED}'
-                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
-
-        # Final status for the cluster deployment
-        if 'Cluster deployment completed.' in log_line:
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
-                        '✔ SkyPilot runtime is up.'
-                        f'{colorama.Style.RESET_ALL}')
-
-        if 'Failed to deploy Kubernetes on the following nodes:' in log_line:
-            logger.info(log_line.strip())
-
-        if 'already exists in history. ' in log_line:
-            node_name = log_line.split('(')[-1].split(')')[0]
-            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.YELLOW}'
-                        '✔ SkyPilot runtime already deployed on worker node '
-                        f'{node_name}. Skipping.{colorama.Style.RESET_ALL}')
-
-        if 'Failed to setup TCP forwarding on head node' in log_line:
-            node_name = log_line.split('(')[-1].split(')')[0]
-            logger.info(
-                f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.RED}'
-                f'✗ Failed to setup TCP forwarding on head node {node_name}.'
-                f'{colorama.Style.RESET_ALL}')
-
-        if 'Error in deploying SSH Target' in log_line:
-            logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.RED}'
-                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
-
-    def __exit__(self, except_type: Optional[Type[BaseException]],
-                 except_value: Optional[BaseException],
-                 traceback: Optional[types.TracebackType]) -> None:
-        del except_type, except_value, traceback  # unused
-        self.status_display.stop()
-
-
 def create_table(field_names: List[str], **kwargs) -> prettytable.PrettyTable:
     """Creates table with default style."""
     border = kwargs.pop('border', False)
sky/utils/resource_checker.py CHANGED
@@ -140,7 +140,7 @@ def _check_active_resources(resource_operations: List[Tuple[str, str]],
 
 def check_users_workspaces_active_resources(
         user_ids: List[str],
-        workspace_names: List[str]) -> Tuple[str, List[str]]:
+        workspace_names: List[str]) -> Tuple[str, List[str], Dict[str, str]]:
     """Check if all the active clusters or managed jobs in workspaces
     belong to the user_ids. If not, return the error message.
 
@@ -151,6 +151,7 @@ def check_users_workspaces_active_resources(
     Returns:
         resource_error_summary: str
         missed_users_names: List[str]
+        missed_user_dict: Dict[str, str]
     """
     all_clusters, all_managed_jobs = _get_active_resources_for_workspaces(
         workspace_names)
@@ -187,14 +188,14 @@ check_users_workspaces_active_resources(
     if resource_errors:
         resource_error_summary = ' and '.join(resource_errors)
     missed_users_names = []
+    missed_user_dict = {}
     if missed_users:
         all_users = global_user_state.get_all_users()
-        missed_users_names = [
-            user.name if user.name else user.id
-            for user in all_users
-            if user.id in missed_users
-        ]
-    return resource_error_summary, missed_users_names
+        for user in all_users:
+            if user.id in missed_users:
+                missed_users_names.append(user.name if user.name else user.id)
+                missed_user_dict[user.id] = user.name if user.name else user.id
+    return resource_error_summary, missed_users_names, missed_user_dict
 
 
 def _get_active_resources_for_workspaces(
@@ -276,9 +277,11 @@ def _get_active_resources(
     # pylint: disable=import-outside-toplevel
     from sky.jobs.server import core as managed_jobs_core
     try:
-        filtered_jobs, _, _, _ = managed_jobs_core.queue(refresh=False,
-                                                         skip_finished=True,
-                                                         all_users=True)
+        filtered_jobs, _, _, _ = managed_jobs_core.queue_v2(
+            refresh=False,
+            skip_finished=True,
+            all_users=True,
+            fields=['job_id', 'user_hash', 'workspace'])
         return filtered_jobs
     except exceptions.ClusterNotUpError:
         logger.warning('All jobs should be finished.')
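For orientation, a sketch of how a caller might consume the widened return value of check_users_workspaces_active_resources(); the user IDs, workspace names, and error handling are illustrative assumptions, not the package's actual call site:

```python
# Hypothetical caller sketch for the new three-element return value.
from sky.utils import resource_checker

error_summary, missed_user_names, missed_user_dict = (
    resource_checker.check_users_workspaces_active_resources(
        user_ids=['user-a'], workspace_names=['default']))
if error_summary:
    # missed_user_dict maps each offending user ID to a display name,
    # falling back to the ID when the user record has no name.
    offenders = ', '.join(
        f'{name} ({uid})' for uid, name in missed_user_dict.items())
    raise RuntimeError(f'Blocked: {error_summary}; '
                       f'resources owned by {offenders}')
```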