skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -2,14 +2,15 @@
  import copy
  import dataclasses
  import enum
- import inspect
  import json
  import math
  import os
  import pathlib
+ import random
  import re
  import shlex
  import signal
+ import socket
  import subprocess
  import sys
  import tempfile
@@ -17,8 +18,8 @@ import textwrap
  import threading
  import time
  import typing
- from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
- Union)
+ from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
+ Set, Tuple, Union)

  import colorama
  import psutil
@@ -39,6 +40,7 @@ from sky import skypilot_config
  from sky import task as task_lib
  from sky.adaptors import common as adaptors_common
  from sky.backends import backend_utils
+ from sky.backends import task_codegen
  from sky.backends import wheel_utils
  from sky.clouds import cloud as sky_cloud
  from sky.clouds.utils import gcp_utils
@@ -48,14 +50,15 @@ from sky.provision import common as provision_common
  from sky.provision import instance_setup
  from sky.provision import metadata_utils
  from sky.provision import provisioner
+ from sky.provision.kubernetes import config as config_lib
  from sky.provision.kubernetes import utils as kubernetes_utils
+ from sky.serve import constants as serve_constants
  from sky.server.requests import requests as requests_lib
  from sky.skylet import autostop_lib
  from sky.skylet import constants
  from sky.skylet import job_lib
  from sky.skylet import log_lib
  from sky.usage import usage_lib
- from sky.utils import accelerator_registry
  from sky.utils import annotations
  from sky.utils import cluster_utils
  from sky.utils import command_runner
@@ -85,13 +88,34 @@ if typing.TYPE_CHECKING:
  from sky import dag
  from sky.schemas.generated import autostopv1_pb2
  from sky.schemas.generated import autostopv1_pb2_grpc
+ from sky.schemas.generated import jobsv1_pb2
+ from sky.schemas.generated import jobsv1_pb2_grpc
+ from sky.schemas.generated import managed_jobsv1_pb2
+ from sky.schemas.generated import managed_jobsv1_pb2_grpc
+ from sky.schemas.generated import servev1_pb2
+ from sky.schemas.generated import servev1_pb2_grpc
  else:
  # To avoid requiring grpcio to be installed on the client side.
- grpc = adaptors_common.LazyImport('grpc')
+ grpc = adaptors_common.LazyImport(
+ 'grpc',
+ # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
+ set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'})
+ if not env_options.Options.SHOW_DEBUG_INFO.get() else None)
  autostopv1_pb2 = adaptors_common.LazyImport(
  'sky.schemas.generated.autostopv1_pb2')
  autostopv1_pb2_grpc = adaptors_common.LazyImport(
  'sky.schemas.generated.autostopv1_pb2_grpc')
+ jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
+ jobsv1_pb2_grpc = adaptors_common.LazyImport(
+ 'sky.schemas.generated.jobsv1_pb2_grpc')
+ servev1_pb2 = adaptors_common.LazyImport(
+ 'sky.schemas.generated.servev1_pb2')
+ servev1_pb2_grpc = adaptors_common.LazyImport(
+ 'sky.schemas.generated.servev1_pb2_grpc')
+ managed_jobsv1_pb2 = adaptors_common.LazyImport(
+ 'sky.schemas.generated.managed_jobsv1_pb2')
+ managed_jobsv1_pb2_grpc = adaptors_common.LazyImport(
+ 'sky.schemas.generated.managed_jobsv1_pb2_grpc')

  Path = str

@@ -113,6 +137,7 @@ _NODES_LAUNCHING_PROGRESS_TIMEOUT = {
  clouds.OCI: 300,
  clouds.Paperspace: 600,
  clouds.Kubernetes: 300,
+ clouds.Shadeform: 300,
  clouds.Vsphere: 240,
  }

@@ -167,18 +192,12 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
  pathlib.Path(directory_utils.get_sky_dir()) / 'backends' /
  'monkey_patches' / 'monkey_patch_ray_up.py')

- # The maximum size of a command line arguments is 128 KB, i.e. the command
- # executed with /bin/sh should be less than 128KB.
- # https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h
- #
- # If a user have very long run or setup commands, the generated command may
- # exceed the limit, as we directly include scripts in job submission commands.
- # If the command is too long, we instead write it to a file, rsync and execute
- # it.
- #
- # We use 100KB as a threshold to be safe for other arguments that
- # might be added during ssh.
- _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
+ _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
+ ('too long', 255),
+ ('request-uri too large', 1),
+ ('request header fields too large', 1),
+ ('400 bad request', 1), # CloudFlare 400 error
+ ]

  _RESOURCES_UNAVAILABLE_LOG = (
  'Reasons for provision failures (for details, please check the log above):')
@@ -187,16 +206,59 @@ _RESOURCES_UNAVAILABLE_LOG = (
  _CLUSTER_LOCK_TIMEOUT = 5.0


- def _is_command_length_over_limit(command: str) -> bool:
- """Check if the length of the command exceeds the limit.
+ def _is_message_too_long(returncode: int,
+ output: Optional[str] = None,
+ file_path: Optional[str] = None) -> bool:
+ """Check if the message sent to the remote is too long.

- We calculate the length of the command after quoting the command twice as
- when it is executed by the CommandRunner, the command will be quoted twice
- to ensure the correctness, which will add significant length to the command.
+ We use inline script to run the setup or run command, i.e. the script will
+ be part of the message sent to the remote cluster. There is a chance that
+ the command is too long, when people has very long run or setup commands, or
+ there is a cloudflare proxy in front of the remote blocking the long
+ message. Several common causes are:
+ - SSH returning: `too long` in the error message.
+ - Cloudflare proxy returning: `414 Request-URI Too Large` or
+ `431 Request Header Fields Too Large` error.
+
+ We use a general length limit check before but it could be inaccurate on
+ some systems, e.g. cloudflare proxy, so this is necessary.
+
+ Args:
+ returncode: The return code of the setup command.
+ output: The output of the setup command.
+ file_path: The path to the setup log file.
  """
+ assert (output is None) != (file_path is None), (
+ 'Either output or file_path must be provided.', output, file_path)
+ to_check = []
+ for (match_str,
+ desired_rc) in _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT:
+ if desired_rc == returncode:
+ to_check.append(match_str)
+ if not to_check:
+ return False
+
+ def _check_output_for_match_str(output: str) -> bool:
+ for match_str in to_check:
+ if match_str.lower() in output.lower():
+ return True
+ return False

- quoted_length = len(shlex.quote(shlex.quote(command)))
- return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
+ if file_path is not None:
+ try:
+ with open(os.path.expanduser(file_path), 'r',
+ encoding='utf-8') as f:
+ content = f.read()
+ return _check_output_for_match_str(content)
+ except Exception as e: # pylint: disable=broad-except
+ # We don't crash the setup if we cannot read the log file.
+ # Instead, we should retry the setup with dumping the script
+ # to a file to be safe.
+ logger.debug(f'Failed to read setup log file {file_path}: {e}')
+ return True
+ else:
+ assert output is not None, (output, file_path)
+ return _check_output_for_match_str(output)


  def _get_cluster_config_template(cloud):
@@ -208,17 +270,21 @@ def _get_cluster_config_template(cloud):
  clouds.Lambda: 'lambda-ray.yml.j2',
  clouds.IBM: 'ibm-ray.yml.j2',
  clouds.SCP: 'scp-ray.yml.j2',
+ clouds.Slurm: 'slurm-ray.yml.j2',
  clouds.OCI: 'oci-ray.yml.j2',
  clouds.Paperspace: 'paperspace-ray.yml.j2',
+ clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
  clouds.DO: 'do-ray.yml.j2',
  clouds.RunPod: 'runpod-ray.yml.j2',
  clouds.Kubernetes: 'kubernetes-ray.yml.j2',
  clouds.SSH: 'kubernetes-ray.yml.j2',
+ clouds.Shadeform: 'shadeform-ray.yml.j2',
  clouds.Vsphere: 'vsphere-ray.yml.j2',
  clouds.Vast: 'vast-ray.yml.j2',
  clouds.Fluidstack: 'fluidstack-ray.yml.j2',
  clouds.Nebius: 'nebius-ray.yml.j2',
- clouds.Hyperbolic: 'hyperbolic-ray.yml.j2'
+ clouds.Hyperbolic: 'hyperbolic-ray.yml.j2',
+ clouds.Seeweb: 'seeweb-ray.yml.j2'
  }
  return cloud_to_template[type(cloud)]

@@ -248,511 +314,6 @@ def write_ray_up_script_with_patched_launch_hash_fn(
  return f.name


- class RayCodeGen:
- """Code generator of a Ray program that executes a sky.Task.
-
- Usage:
-
- >> codegen = RayCodegen()
- >> codegen.add_prologue()
-
- >> codegen.add_ray_task(...)
- >> codegen.add_ray_task(...)
-
- >> codegen.add_epilogue()
- >> code = codegen.build()
- """
-
- def __init__(self):
- # Code generated so far, to be joined via '\n'.
- self._code = []
- # Guard method calling order.
- self._has_prologue = False
- self._has_epilogue = False
-
- # For n nodes gang scheduling.
- self._has_gang_scheduling = False
- self._num_nodes = 0
-
- self._has_register_run_fn = False
-
- # job_id
- # Job ID is used to identify the job (also this generated code).
- # It is a int automatically generated by the DB on the cluster
- # and monotonically increasing starting from 1.
- # To generate the job ID, we use the following logic:
- # code = job_lib.JobLibCodeGen.add_job(username,
- # run_timestamp)
- # job_id = get_output(run_on_cluster(code))
- self.job_id = None
-
- def add_prologue(self, job_id: int) -> None:
- assert not self._has_prologue, 'add_prologue() called twice?'
- self._has_prologue = True
- self.job_id = job_id
- # Should use 'auto' or 'ray://<internal_head_ip>:10001' rather than
- # 'ray://localhost:10001', or 'ray://127.0.0.1:10001', for public cloud.
- # Otherwise, ray will fail to get the placement group because of a bug
- # in ray job.
- ray_address = 'auto'
- self._code = [
- textwrap.dedent(f"""\
- import functools
- import getpass
- import hashlib
- import io
- import os
- import pathlib
- import selectors
- import shlex
- import subprocess
- import sys
- import tempfile
- import textwrap
- import time
- from typing import Dict, List, Optional, Tuple, Union
-
- # Set the environment variables to avoid deduplicating logs and
- # scheduler events. This should be set in driver code, since we are
- # not using `ray job submit` anymore, and the environment variables
- # from the ray cluster is not inherited.
- os.environ['RAY_DEDUP_LOGS'] = '0'
- os.environ['RAY_SCHEDULER_EVENTS'] = '0'
-
- import ray
- import ray.util as ray_util
-
- from sky.skylet import autostop_lib
- from sky.skylet import constants
- from sky.skylet import job_lib
- from sky.utils import log_utils
- from sky.utils import subprocess_utils
-
- SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
-
- kwargs = dict()
- # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
- # the directory exists for backward compatibility for the VM
- # launched before #1790.
- if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}):
- kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r}
- ray.init(
- address={ray_address!r},
- namespace='__sky__{job_id}__',
- log_to_driver=True,
- **kwargs
- )
- def get_or_fail(futures, pg) -> List[int]:
- \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
- if not futures:
- return []
- returncodes = [1] * len(futures)
- # Wait for 1 task to be ready.
- ready = []
- # Keep invoking ray.wait if ready is empty. This is because
- # ray.wait with timeout=None will only wait for 10**6 seconds,
- # which will cause tasks running for more than 12 days to return
- # before becoming ready.
- # (Such tasks are common in serving jobs.)
- # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
- while not ready:
- ready, unready = ray.wait(futures)
- idx = futures.index(ready[0])
- returncodes[idx] = ray.get(ready[0])
- while unready:
- if returncodes[idx] != 0:
- for task in unready:
- # ray.cancel without force fails to kill tasks.
- # We use force=True to kill unready tasks.
- ray.cancel(task, force=True)
- # Use SIGKILL=128+9 to indicate the task is forcely
- # killed.
- idx = futures.index(task)
- returncodes[idx] = 137
- break
- ready, unready = ray.wait(unready)
- idx = futures.index(ready[0])
- returncodes[idx] = ray.get(ready[0])
- # Remove the placement group after all tasks are done, so that
- # the next job can be scheduled on the released resources
- # immediately.
- ray_util.remove_placement_group(pg)
- sys.stdout.flush()
- return returncodes
-
- run_fn = None
- futures = []
- """),
- # FIXME: This is a hack to make sure that the functions can be found
- # by ray.remote. This should be removed once we have a better way to
- # specify dependencies for ray.
- inspect.getsource(log_lib._ProcessingArgs), # pylint: disable=protected-access
- inspect.getsource(log_lib._get_context), # pylint: disable=protected-access
- inspect.getsource(log_lib._handle_io_stream), # pylint: disable=protected-access
- inspect.getsource(log_lib.process_subprocess_stream),
- inspect.getsource(log_lib.run_with_log),
- inspect.getsource(log_lib.make_task_bash_script),
- inspect.getsource(log_lib.add_ray_env_vars),
- inspect.getsource(log_lib.run_bash_command_with_log),
- 'run_bash_command_with_log = ray.remote(run_bash_command_with_log)',
- ]
- # Currently, the codegen program is/can only be submitted to the head
- # node, due to using job_lib for updating job statuses, and using
- # autostop_lib here.
- self._code.append(
- # Use hasattr to handle backward compatibility.
- # TODO(zongheng): remove in ~1-2 minor releases (currently 0.2.x).
- textwrap.dedent("""\
- if hasattr(autostop_lib, 'set_last_active_time_to_now'):
- autostop_lib.set_last_active_time_to_now()
- """))
- self._code += [
- f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
- ]
-
- def add_gang_scheduling_placement_group_and_setup(
- self,
- num_nodes: int,
- resources_dict: Dict[str, float],
- stable_cluster_internal_ips: List[str],
- env_vars: Dict[str, str],
- setup_cmd: Optional[str] = None,
- setup_log_path: Optional[str] = None,
- ) -> None:
- """Create the gang scheduling placement group for a Task.
-
- cluster_ips_sorted is used to ensure that the SKY_NODE_RANK environment
- variable is assigned in a deterministic order whenever a new task is
- added.
- """
- assert self._has_prologue, (
- 'Call add_prologue() before '
- 'add_gang_scheduling_placement_group_and_setup().')
- self._has_gang_scheduling = True
- self._num_nodes = num_nodes
-
- bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
- # Set CPU to avoid ray hanging the resources allocation
- # for remote functions, since the task will request 1 CPU
- # by default.
- task_cpu_demand = resources_dict.pop('CPU')
-
- if resources_dict:
- assert len(resources_dict) == 1, (
- 'There can only be one type of accelerator per instance. '
- f'Found: {resources_dict}.')
- acc_name, acc_count = list(resources_dict.items())[0]
- gpu_dict = {'GPU': acc_count}
- # gpu_dict should be empty when the accelerator is not GPU.
- # TODO(zongheng,zhanghao): an alternative is to start the remote
- # cluster with custom resource 'GPU': <n> even if the accelerator(s)
- # are not GPU. We opt for the current solution for now.
- if accelerator_registry.is_schedulable_non_gpu_accelerator(
- acc_name):
- gpu_dict = {}
- for bundle in bundles:
- bundle.update({
- # Set the GPU to avoid ray hanging the resources allocation
- **gpu_dict,
- })
-
- streaming_message = (
- f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
- f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
- f'be killed){colorama.Style.RESET_ALL}')
- self._code += [
- textwrap.dedent(f"""\
- pg = ray_util.placement_group({json.dumps(bundles)}, 'STRICT_SPREAD')
- plural = 's' if {num_nodes} > 1 else ''
- node_str = f'{num_nodes} node{{plural}}'
- message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
- 'Waiting for task resources on '
- f'{{node_str}}.{colorama.Style.RESET_ALL}')
- print(message, flush=True)
- # FIXME: This will print the error message from autoscaler if
- # it is waiting for other task to finish. We should hide the
- # error message.
- ray.get(pg.ready())
- print({streaming_message!r}, flush=True)
- """)
- ]
-
- job_id = self.job_id
- if setup_cmd is not None:
- setup_envs = env_vars.copy()
- setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
- self._code += [
- textwrap.dedent(f"""\
- setup_cmd = {setup_cmd!r}
- _SETUP_CPUS = 0.0001
- # The setup command will be run as a ray task with num_cpus=_SETUP_CPUS as the
- # requirement; this means Ray will set CUDA_VISIBLE_DEVICES to an empty string.
- # We unset it so that user setup command may properly use this env var.
- setup_cmd = 'unset CUDA_VISIBLE_DEVICES; ' + setup_cmd
- job_lib.set_status({job_id!r}, job_lib.JobStatus.SETTING_UP)
-
- # The schedule_step should be called after the job status is set to non-PENDING,
- # otherwise, the scheduler will think the current job is not submitted yet, and
- # skip the scheduling step.
- job_lib.scheduler.schedule_step()
-
- total_num_nodes = len(ray.nodes())
- setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
- setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
- setup_workers = [run_bash_command_with_log \\
- .options(
- name='setup',
- num_cpus=_SETUP_CPUS,
- scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
- placement_group=setup_pg,
- placement_group_bundle_index=i)
- ) \\
- .remote(
- setup_cmd,
- os.path.expanduser({setup_log_path!r}),
- env_vars={setup_envs!r},
- stream_logs=True,
- with_ray=True,
- ) for i in range(total_num_nodes)]
- setup_returncodes = get_or_fail(setup_workers, setup_pg)
- if sum(setup_returncodes) != 0:
- job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
- # This waits for all streaming logs to finish.
- time.sleep(1)
- print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
- 'return code list:{colorama.Style.RESET_ALL}',
- setup_returncodes,
- flush=True)
- # Need this to set the job status in ray job to be FAILED.
- sys.exit(1)
- """)
- ]
-
- self._code.append(f'job_lib.set_job_started({self.job_id!r})')
- if setup_cmd is None:
- # Need to call schedule_step() to make sure the scheduler
- # schedule the next pending job.
- self._code.append('job_lib.scheduler.schedule_step()')
-
- # Export IP and node rank to the environment variables.
- self._code += [
- textwrap.dedent(f"""\
- @ray.remote
- def check_ip():
- return ray.util.get_node_ip_address()
- gang_scheduling_id_to_ip = ray.get([
- check_ip.options(
- num_cpus={task_cpu_demand},
- scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
- placement_group=pg,
- placement_group_bundle_index=i
- )).remote()
- for i in range(pg.bundle_count)
- ])
-
- cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
- job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
- job_ip_rank_map = {{ip: i for i, ip in enumerate(job_ip_rank_list)}}
- job_ip_list_str = '\\n'.join(job_ip_rank_list)
- """),
- ]
-
- def register_run_fn(self, run_fn: str, run_fn_name: str) -> None:
- """Register the run function to be run on the remote cluster.
-
- Args:
- run_fn: The run function to be run on the remote cluster.
- """
- assert self._has_gang_scheduling, (
- 'Call add_gang_scheduling_placement_group_and_setup() '
- 'before register_run_fn().')
- assert not self._has_register_run_fn, (
- 'register_run_fn() called twice?')
- self._has_register_run_fn = True
-
- self._code += [
- run_fn,
- f'run_fn = {run_fn_name}',
- ]
-
- def add_ray_task(self,
- bash_script: Optional[str],
- task_name: Optional[str],
- ray_resources_dict: Dict[str, float],
- log_dir: str,
- env_vars: Optional[Dict[str, str]] = None,
- gang_scheduling_id: int = 0) -> None:
- """Generates code for a ray remote task that runs a bash command."""
- assert self._has_gang_scheduling, (
- 'Call add_gang_scheduling_placement_group_and_setup() before '
- 'add_ray_task().')
- assert (not self._has_register_run_fn or
- bash_script is None), ('bash_script should '
- 'be None when run_fn is registered.')
- task_cpu_demand = ray_resources_dict.pop('CPU')
- # Build remote_task.options(...)
- # resources=...
- # num_gpus=...
- options = []
- options.append(f'num_cpus={task_cpu_demand}')
-
- num_gpus = 0.0
- if ray_resources_dict:
- assert len(ray_resources_dict) == 1, (
- 'There can only be one type of accelerator per instance. '
- f'Found: {ray_resources_dict}.')
- num_gpus = list(ray_resources_dict.values())[0]
- options.append(f'resources={json.dumps(ray_resources_dict)}')
-
- resources_key = list(ray_resources_dict.keys())[0]
- if not accelerator_registry.is_schedulable_non_gpu_accelerator(
- resources_key):
- # `num_gpus` should be empty when the accelerator is not GPU.
- # FIXME: use a set of GPU types, instead of 'tpu' in the key.
-
- # Passing this ensures that the Ray remote task gets
- # CUDA_VISIBLE_DEVICES set correctly. If not passed, that flag
- # would be force-set to empty by Ray.
- options.append(f'num_gpus={num_gpus}')
- options.append(
- 'scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(' # pylint: disable=line-too-long
- 'placement_group=pg, '
- f'placement_group_bundle_index={gang_scheduling_id})')
-
- sky_env_vars_dict_str = [
- textwrap.dedent(f"""\
- sky_env_vars_dict = {{}}
- sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str
- sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list)
- """)
- ]
-
- if env_vars is not None:
- sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
- for k, v in env_vars.items())
- sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
-
- options_str = ', '.join(options)
- logger.debug('Added Task with options: '
- f'{options_str}')
- # Script to block completion of a job until all storage mounted with
- # CACHED_MOUNT mode is uploaded to remote.
- rclone_flush_script = textwrap.dedent(f"""\
-
- # Only waits if cached mount is enabled (RCLONE_MOUNT_CACHED_LOG_DIR is not empty)
- # findmnt alone is not enough, as some clouds (e.g. AWS on ARM64) uses
- # rclone for normal mounts as well.
- if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
- [ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
- [ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
- flushed=0
- # extra second on top of --vfs-cache-poll-interval to
- # avoid race condition between rclone log line creation and this check.
- sleep 1
- while [ $flushed -eq 0 ]; do
- # sleep for the same interval as --vfs-cache-poll-interval
- sleep {constants.RCLONE_CACHE_REFRESH_INTERVAL}
- flushed=1
- for file in {constants.RCLONE_MOUNT_CACHED_LOG_DIR}/*; do
- exitcode=0
- tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
- if [ $exitcode -ne 0 ]; then
- echo "skypilot: cached mount is still uploading to remote"
- flushed=0
- break
- fi
- done
- done
- echo "skypilot: cached mount uploaded complete"
- fi""")
- self._code += [
- sky_env_vars_dict_str,
- textwrap.dedent(f"""\
- script = {bash_script!r}
- rclone_flush_script = {rclone_flush_script!r}
- if run_fn is not None:
- script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)
-
- if script is not None:
- script += rclone_flush_script
- sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
-
- ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}]
- rank = job_ip_rank_map[ip]
-
- if len(cluster_ips_to_node_id) == 1: # Single-node task on single-node cluter
- name_str = '{task_name},' if {task_name!r} != None else 'task,'
- log_path = os.path.expanduser(os.path.join({log_dir!r}, 'run.log'))
- else: # Single-node or multi-node task on multi-node cluster
- idx_in_cluster = cluster_ips_to_node_id[ip]
- if cluster_ips_to_node_id[ip] == 0:
- node_name = 'head'
- else:
- node_name = f'worker{{idx_in_cluster}}'
- name_str = f'{{node_name}}, rank={{rank}},'
- log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log'))
- sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank
-
- sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
-
- futures.append(run_bash_command_with_log \\
- .options(name=name_str, {options_str}) \\
- .remote(
- script,
- log_path,
- env_vars=sky_env_vars_dict,
- stream_logs=True,
- with_ray=True,
- ))""")
- ]
-
- def add_epilogue(self) -> None:
- """Generates code that waits for all tasks, then exits."""
- assert self._has_prologue, 'Call add_prologue() before add_epilogue().'
- assert not self._has_epilogue, 'add_epilogue() called twice?'
- self._has_epilogue = True
-
- self._code += [
- textwrap.dedent(f"""\
- returncodes = get_or_fail(futures, pg)
- if sum(returncodes) != 0:
- job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
- # Schedule the next pending job immediately to make the job
- # scheduling more efficient.
- job_lib.scheduler.schedule_step()
- # This waits for all streaming logs to finish.
- time.sleep(0.5)
- reason = ''
- # 139 is the return code of SIGSEGV, i.e. Segmentation Fault.
- if any(r == 139 for r in returncodes):
- reason = '(likely due to Segmentation Fault)'
- if any(r == 137 for r in returncodes):
- # Find the first non-137 return code
- non_137 = next(r for r in returncodes if r != 137)
- reason = f'(A Worker failed with return code {{non_137}}, SkyPilot cleaned up the processes on other nodes with return code 137)'
- print('ERROR: {colorama.Fore.RED}Job {self.job_id} failed with '
- 'return code list:{colorama.Style.RESET_ALL}',
- returncodes,
- reason,
- flush=True)
- # Need this to set the job status in ray job to be FAILED.
- sys.exit(1)
- else:
- job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SUCCEEDED)
- # Schedule the next pending job immediately to make the job
- # scheduling more efficient.
- job_lib.scheduler.schedule_step()
- # This waits for all streaming logs to finish.
- time.sleep(0.5)
- """)
- ]
-
- def build(self) -> str:
- """Returns the entire generated program."""
- assert self._has_epilogue, 'Call add_epilogue() before build().'
- return '\n'.join(self._code)
-
-
  class GangSchedulingStatus(enum.Enum):
  """Enum for gang scheduling status."""
  CLUSTER_READY = 0
@@ -1340,6 +901,34 @@ class RetryingVmProvisioner(object):
  zones = [clouds.Zone(name=to_provision.zone)]
  yield zones

+ def _insufficient_resources_msg(
+ self,
+ to_provision: resources_lib.Resources,
+ requested_resources: Set[resources_lib.Resources],
+ insufficient_resources: Optional[List[str]],
+ ) -> str:
+ insufficent_resource_msg = ('' if insufficient_resources is None else
+ f' ({", ".join(insufficient_resources)})')
+ message = f'Failed to acquire resources{insufficent_resource_msg} '
+ if to_provision.zone is not None:
+ message += (f'in {to_provision.zone} for {requested_resources}. ')
+ elif to_provision.region is not None and to_provision.cloud is not None:
+ # For public clouds, provision.region is always set.
+ if clouds.SSH().is_same_cloud(to_provision.cloud):
+ message += (
+ f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
+ f'for {requested_resources}. The SSH Node Pool may not '
+ 'have enough resources.')
+ elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
+ message += (f'in context {to_provision.region} for '
+ f'{requested_resources}. ')
+ else:
+ message += (f'in all zones in {to_provision.region} for '
+ f'{requested_resources}. ')
+ else:
+ message += (f'{to_provision.cloud} for {requested_resources}. ')
+ return message
+
  def _retry_zones(
  self,
  to_provision: resources_lib.Resources,
@@ -1418,6 +1007,7 @@ class RetryingVmProvisioner(object):
  f'To request quotas, check the instruction: '
  f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')

+ insufficient_resources = None
  for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
  prev_cluster_status,
  prev_cluster_ever_up):
@@ -1630,6 +1220,24 @@ class RetryingVmProvisioner(object):
  # No teardown happens for this error.
  with ux_utils.print_exception_no_traceback():
  raise
+ except config_lib.KubernetesError as e:
+ if e.insufficent_resources:
+ insufficient_resources = e.insufficent_resources
+ # NOTE: We try to cleanup the cluster even if the previous
+ # cluster does not exist. Also we are fast at
+ # cleaning up clusters now if there is no existing node.
+ CloudVmRayBackend().post_teardown_cleanup(
+ handle,
+ terminate=not prev_cluster_ever_up,
+ remove_from_db=False,
+ failover=True,
+ )
+ # TODO(suquark): other clouds may have different zone
+ # blocking strategy. See '_update_blocklist_on_error'
+ # for details.
+ FailoverCloudErrorHandlerV2.update_blocklist_on_error(
+ self._blocked_resources, to_provision, region, zones, e)
+ continue
  except Exception as e: # pylint: disable=broad-except
  # NOTE: We try to cleanup the cluster even if the previous
  # cluster does not exist. Also we are fast at
@@ -1760,26 +1368,9 @@ class RetryingVmProvisioner(object):
  terminate=terminate_or_stop,
  remove_from_db=False)

- if to_provision.zone is not None:
- message = (
- f'Failed to acquire resources in {to_provision.zone} for '
- f'{requested_resources}. ')
- elif to_provision.region is not None:
- # For public clouds, provision.region is always set.
- if clouds.SSH().is_same_cloud(to_provision.cloud):
- message = ('Failed to acquire resources in SSH Node Pool '
- f'({to_provision.region.lstrip("ssh-")}) for '
- f'{requested_resources}. The SSH Node Pool may not '
- 'have enough resources.')
- elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
- message = ('Failed to acquire resources in context '
- f'{to_provision.region} for {requested_resources}. ')
- else:
- message = ('Failed to acquire resources in all zones in '
- f'{to_provision.region} for {requested_resources}. ')
- else:
- message = (f'Failed to acquire resources in {to_provision.cloud} '
- f'for {requested_resources}. ')
+ message = self._insufficient_resources_msg(to_provision,
+ requested_resources,
+ insufficient_resources)
  # Do not failover to other locations if the cluster was ever up, since
  # the user can have some data on the cluster.
  raise exceptions.ResourcesUnavailableError(
@@ -2175,8 +1766,6 @@ class RetryingVmProvisioner(object):
  # terminated by _retry_zones().
  assert (prev_cluster_status == status_lib.ClusterStatus.INIT
  ), prev_cluster_status
- assert global_user_state.get_handle_from_cluster_name(
- cluster_name) is None, cluster_name
  logger.info(
  ux_utils.retry_message(
  f'Retrying provisioning with requested resources: '
@@ -2215,9 +1804,8 @@ class RetryingVmProvisioner(object):
2215
1804
  for (resource, exception) in resource_exceptions.items():
2216
1805
  table.add_row([
2217
1806
  resource.infra.formatted_str(),
2218
- resources_utils.format_resource(resource,
2219
- simplify=True),
2220
- exception
1807
+ resources_utils.format_resource(
1808
+ resource, simplified_only=True)[0], exception
2221
1809
  ])
2222
1810
  # Set the max width of REASON column to 80 to avoid the table
2223
1811
  # being wrapped in an unreadable way.
@@ -2239,6 +1827,18 @@ class SSHTunnelInfo:
2239
1827
  pid: int
2240
1828
 
2241
1829
 
1830
+ def _is_tunnel_healthy(tunnel: SSHTunnelInfo) -> bool:
1831
+ try:
1832
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
1833
+ s.settimeout(0.5)
1834
+ s.connect(('localhost', tunnel.port))
1835
+ return True
1836
+ except socket.error as e:
1837
+ logger.warning(f'Failed to connect to tunnel on port {tunnel.port}: '
1838
+ f'{common_utils.format_exception(e)}')
1839
+ return False
1840
+
1841
+
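The helper above treats one successful TCP connect to localhost as proof that the forwarded Skylet port is still alive. A minimal standalone sketch of the same probe using only the standard library (the function name and port below are illustrative, not part of the codebase):

import socket

def port_is_accepting(port: int, timeout: float = 0.5) -> bool:
    # Same idea as _is_tunnel_healthy above: if a TCP connection to
    # localhost:<port> succeeds within the timeout, the local listener
    # (here, the SSH tunnel endpoint) is considered healthy.
    try:
        with socket.create_connection(('localhost', port), timeout=timeout):
            return True
    except OSError:
        return False

# With nothing listening on the port, the probe fails fast and returns False.
print(port_is_accepting(54321))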
2242
1842
  class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2243
1843
  """A pickle-able handle to a cluster created by CloudVmRayBackend.
2244
1844
 
@@ -2261,8 +1861,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2261
1861
  - (optional) Skylet SSH tunnel info.
2262
1862
  """
2263
1863
  # Bump if any fields get added/removed/changed, and add backward
2264
- # compaitibility logic in __setstate__.
2265
- _VERSION = 11
1864
+ # compatibility logic in __setstate__ and/or __getstate__.
1865
+ _VERSION = 12
2266
1866
 
2267
1867
  def __init__(
2268
1868
  self,
@@ -2296,7 +1896,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2296
1896
  self.launched_resources = launched_resources
2297
1897
  self.docker_user: Optional[str] = None
2298
1898
  self.is_grpc_enabled = True
2299
- self.skylet_ssh_tunnel: Optional[SSHTunnelInfo] = None
2300
1899
 
2301
1900
  def __repr__(self):
2302
1901
  return (f'ResourceHandle('
@@ -2313,12 +1912,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2313
1912
  f'{self.launched_resources}, '
2314
1913
  f'\n\tdocker_user={self.docker_user},'
2315
1914
  f'\n\tssh_user={self.ssh_user},'
2316
- f'\n\tis_grpc_enabled={self.is_grpc_enabled},'
2317
- f'\n\tskylet_ssh_tunnel={self.skylet_ssh_tunnel}')
1915
+ f'\n\tis_grpc_enabled={self.is_grpc_enabled},')
2318
1916
 
2319
1917
  def get_cluster_name(self):
2320
1918
  return self.cluster_name
2321
1919
 
1920
+ def get_cluster_name_on_cloud(self):
1921
+ return self.cluster_name_on_cloud
1922
+
2322
1923
  def _use_internal_ips(self):
2323
1924
  """Returns whether to use internal IPs for SSH connections."""
2324
1925
  # Directly load the `use_internal_ips` flag from the cluster yaml
@@ -2345,7 +1946,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2345
1946
  def _update_cluster_info(self):
2346
1947
  # When a cluster is on a cloud that does not support the new
2347
1948
  # provisioner, we should skip updating cluster_info.
2348
- if (self.launched_resources.cloud.PROVISIONER_VERSION >=
1949
+ if (self.launched_resources.cloud is not None and
1950
+ self.launched_resources.cloud.PROVISIONER_VERSION >=
2349
1951
  clouds.ProvisionerVersion.SKYPILOT):
2350
1952
  provider_name = str(self.launched_resources.cloud).lower()
2351
1953
  config = {}
@@ -2643,64 +2245,199 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2643
2245
  cluster_config_file)
2644
2246
  self.docker_user = docker_user
2645
2247
 
2248
+ def _get_skylet_ssh_tunnel(self) -> Optional[SSHTunnelInfo]:
2249
+ metadata = global_user_state.get_cluster_skylet_ssh_tunnel_metadata(
2250
+ self.cluster_name)
2251
+ if metadata is None:
2252
+ return None
2253
+ return SSHTunnelInfo(port=metadata[0], pid=metadata[1])
2254
+
2255
+ def _set_skylet_ssh_tunnel(self, tunnel: Optional[SSHTunnelInfo]) -> None:
2256
+ global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
2257
+ self.cluster_name,
2258
+ (tunnel.port, tunnel.pid) if tunnel is not None else None)
2259
+
2260
+ def close_skylet_ssh_tunnel(self) -> None:
2261
+ """Terminate the SSH tunnel process and clear its metadata."""
2262
+ tunnel = self._get_skylet_ssh_tunnel()
2263
+ if tunnel is None:
2264
+ return
2265
+ logger.debug('Closing Skylet SSH tunnel for cluster %r on port %d',
2266
+ self.cluster_name, tunnel.port)
2267
+ try:
2268
+ self._terminate_ssh_tunnel_process(tunnel)
2269
+ finally:
2270
+ self._set_skylet_ssh_tunnel(None)
2271
+
2646
2272
  def get_grpc_channel(self) -> 'grpc.Channel':
2647
- if self.skylet_ssh_tunnel is None:
2648
- self.open_and_update_skylet_tunnel()
2649
- assert self.skylet_ssh_tunnel is not None
2650
- return grpc.insecure_channel(f'localhost:{self.skylet_ssh_tunnel.port}')
2273
+ grpc_options = [
2274
+ # The task YAMLs can be large, so the default
2275
+ # max_receive_message_length of 4MB might not be enough.
2276
+ ('grpc.max_receive_message_length', -1),
2277
+ ]
2278
+ # It's fine to not grab the lock here, as we're only reading,
2279
+ # and writes are very rare.
2280
+ # It's acceptable to read while another process is opening a tunnel,
2281
+ # because this will only happen for:
2282
+ # 1. A new cluster that has no tunnel yet, or
2283
+ # 2. A cluster with an unhealthy tunnel
2284
+ # For (2), a process that reads the "stale" tunnel will fail
2285
+ # and, on the next retry, call get_grpc_channel again
2286
+ # and get the new tunnel.
2287
+ tunnel = self._get_skylet_ssh_tunnel()
2288
+ if tunnel is not None:
2289
+ if _is_tunnel_healthy(tunnel):
2290
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2291
+ options=grpc_options)
2292
+ logger.debug('Failed to connect to SSH tunnel for cluster '
2293
+ f'{self.cluster_name!r} on port {tunnel.port}')
2294
+
2295
+ lock_id = backend_utils.cluster_tunnel_lock_id(self.cluster_name)
2296
+ remaining_timeout = backend_utils.CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS
2297
+ start_time = time.perf_counter()
2298
+ attempt = 1
2299
+
2300
+ def _get_remaining_timeout() -> float:
2301
+ return max(0.0,
2302
+ remaining_timeout - (time.perf_counter() - start_time))
2303
+
2304
+ while remaining_timeout > 0:
2305
+ logger.debug(
2306
+ 'Attempting to acquire exclusive lock for %s (attempt %d)',
2307
+ lock_id, attempt)
2308
+ exclusive_lock = locks.get_lock(lock_id, remaining_timeout)
2309
+ try:
2310
+ with exclusive_lock.acquire(blocking=False):
2311
+ wait_elapsed = time.perf_counter() - start_time
2312
+ logger.debug(f'Acquired exclusive lock for {lock_id} after '
2313
+ f'{wait_elapsed:.2f}s')
2314
+ try:
2315
+ tunnel = self._open_and_update_skylet_tunnel()
2316
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2317
+ options=grpc_options)
2318
+ except Exception as e: # pylint: disable=broad-except
2319
+ # Failed to open tunnel, release the lock and retry.
2320
+ logger.warning(f'Failed to open tunnel for cluster '
2321
+ f'{self.cluster_name!r}: '
2322
+ f'{common_utils.format_exception(e)}')
2323
+ remaining_timeout = _get_remaining_timeout()
2324
+ attempt += 1
2325
+ continue
2326
+ except locks.LockTimeout:
2327
+ pass
2651
2328
 
2652
- def _cleanup_ssh_tunnel(self, tunnel_info: SSHTunnelInfo) -> None:
2653
- """Clean up an SSH tunnel by terminating the process."""
2329
+ remaining_timeout = _get_remaining_timeout()
2330
+ logger.debug(f'Could not acquire exclusive lock for {lock_id}, '
2331
+ f'waiting on shared lock (attempt {attempt})')
2332
+ try:
2333
+ # Use shared lock so that concurrent readers can
2334
+ # proceed in parallel.
2335
+ shared_lock = locks.get_lock(lock_id,
2336
+ remaining_timeout,
2337
+ shared_lock=True)
2338
+ # Wait for the exclusive lock to be released.
2339
+ shared_lock.acquire(blocking=True)
2340
+ # We only need the lock for signalling that the new tunnel has
2341
+ # been opened, not for checking the tunnel health.
2342
+ # Same reasoning as why we don't need to grab the lock in
2343
+ # the fast path at the start of this function.
2344
+ shared_lock.release()
2345
+ wait_elapsed = time.perf_counter() - start_time
2346
+ logger.debug(f'Acquired shared lock for {lock_id} after '
2347
+ f'{wait_elapsed:.2f}s')
2348
+ except locks.LockTimeout as e:
2349
+ raise RuntimeError(
2350
+ f'Failed to get gRPC channel for cluster '
2351
+ f'{self.cluster_name!r} due to a timeout when waiting '
2352
+ 'for the SSH tunnel to be opened. Please try again or '
2353
+ f'manually remove the lock at {lock_id}. '
2354
+ f'{common_utils.format_exception(e)}') from e
2355
+
2356
+ # Add small jitter before probing to smoothen the effects
2357
+ # of many readers waking up simultaneously.
2358
+ jitter = random.uniform(0.01, 0.05)
2359
+ time.sleep(jitter)
2360
+
2361
+ # Re-read the tunnel metadata and verify it's healthy.
2362
+ tunnel = self._get_skylet_ssh_tunnel()
2363
+ if tunnel is not None:
2364
+ if _is_tunnel_healthy(tunnel):
2365
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2366
+ options=grpc_options)
2367
+ logger.debug('Failed to connect to SSH tunnel for cluster '
2368
+ f'{self.cluster_name!r} on port {tunnel.port}')
2369
+ # Tunnel is still unhealthy or missing, try again with updated
2370
+ # timeout. This could happen in the case where the thread that
2371
+ # held the exclusive lock to open the tunnel crashed.
2372
+ remaining_timeout = _get_remaining_timeout()
2373
+ attempt += 1
2374
+ raise RuntimeError('Timeout waiting for gRPC channel for cluster '
2375
+ f'{self.cluster_name!r} to be ready.')
2376
+
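get_grpc_channel above keeps one overall deadline across the lock-free fast path, the exclusive-lock attempt, and the shared-lock wait by recomputing the remaining budget after every blocking step. A minimal sketch of just that deadline bookkeeping, with an illustrative helper name and callback signature that are not part of the codebase:

import time

def retry_with_deadline(total_seconds: float, try_once):
    # try_once(remaining_seconds, attempt) returns a result, or None to retry.
    start = time.perf_counter()
    attempt = 1
    while True:
        remaining = total_seconds - (time.perf_counter() - start)
        if remaining <= 0:
            raise TimeoutError(f'Gave up after {attempt - 1} attempt(s)')
        result = try_once(remaining, attempt)
        if result is not None:
            return result
        attempt += 1

# Succeeds on the third attempt, well within the 2-second budget.
print(retry_with_deadline(2.0, lambda remaining, n: 'ok' if n >= 3 else None))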
2377
+ def _terminate_ssh_tunnel_process(self, tunnel_info: SSHTunnelInfo) -> None:
2378
+ """Terminate the SSH tunnel process."""
2654
2379
  try:
2655
2380
  proc = psutil.Process(tunnel_info.pid)
2656
2381
  if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
2657
2382
  logger.debug(
2658
2383
  f'Terminating SSH tunnel process {tunnel_info.pid}')
2659
- proc.terminate()
2660
- try:
2661
- proc.wait(timeout=3)
2662
- except psutil.TimeoutExpired:
2663
- proc.kill()
2664
- proc.wait(timeout=1)
2384
+ subprocess_utils.kill_children_processes(proc.pid)
2665
2385
  except psutil.NoSuchProcess:
2666
2386
  pass
2667
2387
  except Exception as e: # pylint: disable=broad-except
2668
2388
  logger.warning(
2669
2389
  f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
2670
2390
 
2671
- def open_and_update_skylet_tunnel(self) -> None:
2391
+ def _open_and_update_skylet_tunnel(self) -> SSHTunnelInfo:
2672
2392
  """Opens an SSH tunnel to the Skylet on the head node,
2673
2393
  updates the cluster handle, and persists it to the database."""
2674
- local_port = common_utils.find_free_port(10000)
2675
- runners = self.get_command_runners()
2676
- head_runner = runners[0]
2677
- if isinstance(head_runner, command_runner.SSHCommandRunner):
2678
- # Disabling ControlMaster makes things easier to reason about
2679
- # with respect to resource management/ownership,
2680
- # as killing the process will close the tunnel too.
2681
- head_runner.disable_control_master = True
2682
-
2683
- cmd = head_runner.port_forward_command([(local_port,
2684
- constants.SKYLET_GRPC_PORT)])
2685
- ssh_tunnel_proc = subprocess.Popen(cmd)
2686
- tunnel_info = SSHTunnelInfo(port=local_port, pid=ssh_tunnel_proc.pid)
2394
+ max_attempts = 3
2395
+ # There could be a race condition here, as multiple processes may
2396
+ # attempt to open the same port at the same time.
2397
+ for attempt in range(max_attempts):
2398
+ runners = self.get_command_runners()
2399
+ head_runner = runners[0]
2400
+ local_port = random.randint(10000, 65535)
2401
+ try:
2402
+ ssh_tunnel_proc = backend_utils.open_ssh_tunnel(
2403
+ head_runner, (local_port, constants.SKYLET_GRPC_PORT))
2404
+ except exceptions.CommandError as e:
2405
+ # Don't retry if the error is due to timeout,
2406
+ # connection refused, Kubernetes pods not found,
2407
+ # or an in-progress termination.
2408
+ if (e.detailed_reason is not None and
2409
+ (backend_utils.SSH_CONNECTION_ERROR_PATTERN.search(
2410
+ e.detailed_reason) or
2411
+ backend_utils.K8S_PODS_NOT_FOUND_PATTERN.search(
2412
+ e.detailed_reason) or attempt == max_attempts - 1)):
2413
+ raise e
2414
+ logger.warning(
2415
+ f'Failed to open SSH tunnel on port {local_port} '
2416
+ f'({attempt + 1}/{max_attempts}). '
2417
+ f'{e.error_msg}\n{e.detailed_reason}')
2418
+ continue
2419
+ tunnel_info = SSHTunnelInfo(port=local_port,
2420
+ pid=ssh_tunnel_proc.pid)
2421
+ break
2422
+
2687
2423
  try:
2688
2424
  grpc.channel_ready_future(
2689
2425
  grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
2690
2426
  timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
2691
2427
  # Clean up existing tunnel before setting up the new one.
2692
- if self.skylet_ssh_tunnel is not None:
2693
- self._cleanup_ssh_tunnel(self.skylet_ssh_tunnel)
2694
- self.skylet_ssh_tunnel = tunnel_info
2695
- global_user_state.update_cluster_handle(self.cluster_name, self)
2428
+ old_tunnel = self._get_skylet_ssh_tunnel()
2429
+ if old_tunnel is not None:
2430
+ self._terminate_ssh_tunnel_process(old_tunnel)
2431
+ self._set_skylet_ssh_tunnel(tunnel_info)
2432
+ return tunnel_info
2696
2433
  except grpc.FutureTimeoutError as e:
2697
- self._cleanup_ssh_tunnel(tunnel_info)
2434
+ self._terminate_ssh_tunnel_process(tunnel_info)
2698
2435
  logger.warning(
2699
2436
  f'Skylet gRPC channel for cluster {self.cluster_name} not '
2700
2437
  f'ready after {constants.SKYLET_GRPC_TIMEOUT_SECONDS}s')
2701
2438
  raise e
2702
2439
  except Exception as e:
2703
- self._cleanup_ssh_tunnel(tunnel_info)
2440
+ self._terminate_ssh_tunnel_process(tunnel_info)
2704
2441
  raise e
2705
2442
 
2706
2443
  @property
@@ -2713,6 +2450,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2713
2450
  def cluster_yaml(self, value: Optional[str]):
2714
2451
  self._cluster_yaml = value
2715
2452
 
2453
+ @property
2454
+ def instance_ids(self):
2455
+ if self.cached_cluster_info is not None:
2456
+ return self.cached_cluster_info.instance_ids()
2457
+ return None
2458
+
2716
2459
  @property
2717
2460
  def ssh_user(self):
2718
2461
  if self.cached_cluster_info is not None:
@@ -2750,7 +2493,16 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2750
2493
  @property
2751
2494
  def is_grpc_enabled_with_flag(self) -> bool:
2752
2495
  """Returns whether this handle has gRPC enabled and gRPC flag is set."""
2753
- return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
2496
+ return (env_options.Options.ENABLE_GRPC.get() and
2497
+ self.is_grpc_enabled and
2498
+ not isinstance(self.launched_resources.cloud, clouds.Slurm))
2499
+
2500
+ def __getstate__(self):
2501
+ state = self.__dict__.copy()
2502
+ # For backwards compatibility. Refer to
2503
+ # https://github.com/skypilot-org/skypilot/pull/7133
2504
+ state.setdefault('skylet_ssh_tunnel', None)
2505
+ return state
2754
2506
 
2755
2507
  def __setstate__(self, state):
2756
2508
  self._version = self._VERSION
@@ -2809,6 +2561,10 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2809
2561
  state['is_grpc_enabled'] = False
2810
2562
  state['skylet_ssh_tunnel'] = None
2811
2563
 
2564
+ if version >= 12:
2565
+ # DEPRECATED in favor of skylet_ssh_tunnel_metadata column in the DB
2566
+ state.pop('skylet_ssh_tunnel', None)
2567
+
2812
2568
  self.__dict__.update(state)
2813
2569
 
2814
2570
  # Because the update_cluster_ips and update_ssh_ports
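The _VERSION bump plus the __getstate__/__setstate__ hooks above keep old and new pickles mutually readable: new writers keep the deprecated skylet_ssh_tunnel field around for old readers, and readers of v12+ pickles drop it because the data now lives in the database. A self-contained toy version of the pattern (class and field names are illustrative, not the real handle):

import pickle

class VersionedHandle:
    _VERSION = 12

    def __init__(self):
        self._version = self._VERSION
        self.cluster_name = 'demo'

    def __getstate__(self):
        state = self.__dict__.copy()
        # Keep the deprecated field so readers older than v12 still load.
        state.setdefault('legacy_field', None)
        return state

    def __setstate__(self, state):
        version = state.get('_version', 0)
        if version >= 12:
            # v12+ readers know the field moved elsewhere; drop it.
            state.pop('legacy_field', None)
        self._version = self._VERSION
        self.__dict__.update(state)

restored = pickle.loads(pickle.dumps(VersionedHandle()))
print(restored.cluster_name, restored._version)  # -> demo 12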
@@ -2886,21 +2642,180 @@ class SkyletClient:
2886
2642
 
2887
2643
  def __init__(self, channel: 'grpc.Channel'):
2888
2644
  self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
2645
+ self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
2646
+ self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
2647
+ self._managed_jobs_stub = (
2648
+ managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel))
2889
2649
 
2890
2650
  def set_autostop(
2891
2651
  self,
2892
2652
  request: 'autostopv1_pb2.SetAutostopRequest',
2893
- timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2653
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2894
2654
  ) -> 'autostopv1_pb2.SetAutostopResponse':
2895
2655
  return self._autostop_stub.SetAutostop(request, timeout=timeout)
2896
2656
 
2897
2657
  def is_autostopping(
2898
2658
  self,
2899
2659
  request: 'autostopv1_pb2.IsAutostoppingRequest',
2900
- timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2660
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2901
2661
  ) -> 'autostopv1_pb2.IsAutostoppingResponse':
2902
2662
  return self._autostop_stub.IsAutostopping(request, timeout=timeout)
2903
2663
 
2664
+ def add_job(
2665
+ self,
2666
+ request: 'jobsv1_pb2.AddJobRequest',
2667
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2668
+ ) -> 'jobsv1_pb2.AddJobResponse':
2669
+ return self._jobs_stub.AddJob(request, timeout=timeout)
2670
+
2671
+ def queue_job(
2672
+ self,
2673
+ request: 'jobsv1_pb2.QueueJobRequest',
2674
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2675
+ ) -> 'jobsv1_pb2.QueueJobResponse':
2676
+ return self._jobs_stub.QueueJob(request, timeout=timeout)
2677
+
2678
+ def update_status(
2679
+ self,
2680
+ request: 'jobsv1_pb2.UpdateStatusRequest',
2681
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2682
+ ) -> 'jobsv1_pb2.UpdateStatusResponse':
2683
+ return self._jobs_stub.UpdateStatus(request, timeout=timeout)
2684
+
2685
+ def get_job_queue(
2686
+ self,
2687
+ request: 'jobsv1_pb2.GetJobQueueRequest',
2688
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2689
+ ) -> 'jobsv1_pb2.GetJobQueueResponse':
2690
+ return self._jobs_stub.GetJobQueue(request, timeout=timeout)
2691
+
2692
+ def cancel_jobs(
2693
+ self,
2694
+ request: 'jobsv1_pb2.CancelJobsRequest',
2695
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2696
+ ) -> 'jobsv1_pb2.CancelJobsResponse':
2697
+ return self._jobs_stub.CancelJobs(request, timeout=timeout)
2698
+
2699
+ def fail_all_in_progress_jobs(
2700
+ self,
2701
+ request: 'jobsv1_pb2.FailAllInProgressJobsRequest',
2702
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2703
+ ) -> 'jobsv1_pb2.FailAllInProgressJobsResponse':
2704
+ return self._jobs_stub.FailAllInProgressJobs(request, timeout=timeout)
2705
+
2706
+ def get_job_status(
2707
+ self,
2708
+ request: 'jobsv1_pb2.GetJobStatusRequest',
2709
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2710
+ ) -> 'jobsv1_pb2.GetJobStatusResponse':
2711
+ return self._jobs_stub.GetJobStatus(request, timeout=timeout)
2712
+
2713
+ def get_job_submitted_timestamp(
2714
+ self,
2715
+ request: 'jobsv1_pb2.GetJobSubmittedTimestampRequest',
2716
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2717
+ ) -> 'jobsv1_pb2.GetJobSubmittedTimestampResponse':
2718
+ return self._jobs_stub.GetJobSubmittedTimestamp(request,
2719
+ timeout=timeout)
2720
+
2721
+ def get_job_ended_timestamp(
2722
+ self,
2723
+ request: 'jobsv1_pb2.GetJobEndedTimestampRequest',
2724
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2725
+ ) -> 'jobsv1_pb2.GetJobEndedTimestampResponse':
2726
+ return self._jobs_stub.GetJobEndedTimestamp(request, timeout=timeout)
2727
+
2728
+ def get_log_dirs_for_jobs(
2729
+ self,
2730
+ request: 'jobsv1_pb2.GetLogDirsForJobsRequest',
2731
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2732
+ ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
2733
+ return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
2734
+
2735
+ def tail_logs(
2736
+ self,
2737
+ request: 'jobsv1_pb2.TailLogsRequest',
2738
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2739
+ ) -> Iterator['jobsv1_pb2.TailLogsResponse']:
2740
+ return self._jobs_stub.TailLogs(request, timeout=timeout)
2741
+
2742
+ def get_service_status(
2743
+ self,
2744
+ request: 'servev1_pb2.GetServiceStatusRequest',
2745
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2746
+ ) -> 'servev1_pb2.GetServiceStatusResponse':
2747
+ return self._serve_stub.GetServiceStatus(request, timeout=timeout)
2748
+
2749
+ def add_serve_version(
2750
+ self,
2751
+ request: 'servev1_pb2.AddVersionRequest',
2752
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2753
+ ) -> 'servev1_pb2.AddVersionResponse':
2754
+ return self._serve_stub.AddVersion(request, timeout=timeout)
2755
+
2756
+ def terminate_services(
2757
+ self,
2758
+ request: 'servev1_pb2.TerminateServicesRequest',
2759
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2760
+ ) -> 'servev1_pb2.TerminateServicesResponse':
2761
+ return self._serve_stub.TerminateServices(request, timeout=timeout)
2762
+
2763
+ def terminate_replica(
2764
+ self,
2765
+ request: 'servev1_pb2.TerminateReplicaRequest',
2766
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2767
+ ) -> 'servev1_pb2.TerminateReplicaResponse':
2768
+ return self._serve_stub.TerminateReplica(request, timeout=timeout)
2769
+
2770
+ def wait_service_registration(
2771
+ self,
2772
+ request: 'servev1_pb2.WaitServiceRegistrationRequest',
2773
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2774
+ ) -> 'servev1_pb2.WaitServiceRegistrationResponse':
2775
+ # Set the timeout to at least 10 seconds more than the service
2776
+ # registration constant to make sure that timeouts will not occur.
2777
+ if timeout is not None:
2778
+ timeout = max(timeout,
2779
+ serve_constants.SERVICE_REGISTER_TIMEOUT_SECONDS + 10)
2780
+ return self._serve_stub.WaitServiceRegistration(request,
2781
+ timeout=timeout)
2782
+
2783
+ def update_service(
2784
+ self,
2785
+ request: 'servev1_pb2.UpdateServiceRequest',
2786
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2787
+ ) -> 'servev1_pb2.UpdateServiceResponse':
2788
+ return self._serve_stub.UpdateService(request, timeout=timeout)
2789
+
2790
+ def get_managed_job_controller_version(
2791
+ self,
2792
+ request: 'managed_jobsv1_pb2.GetVersionRequest',
2793
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2794
+ ) -> 'managed_jobsv1_pb2.GetVersionResponse':
2795
+ return self._managed_jobs_stub.GetVersion(request, timeout=timeout)
2796
+
2797
+ def get_managed_job_table(
2798
+ self,
2799
+ request: 'managed_jobsv1_pb2.GetJobTableRequest',
2800
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2801
+ ) -> 'managed_jobsv1_pb2.GetJobTableResponse':
2802
+ return self._managed_jobs_stub.GetJobTable(request, timeout=timeout)
2803
+
2804
+ def get_all_managed_job_ids_by_name(
2805
+ self,
2806
+ request: 'managed_jobsv1_pb2.GetAllJobIdsByNameRequest',
2807
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2808
+ ) -> 'managed_jobsv1_pb2.GetAllJobIdsByNameResponse':
2809
+ return self._managed_jobs_stub.GetAllJobIdsByName(request,
2810
+ timeout=timeout)
2811
+
2812
+ def cancel_managed_jobs(
2813
+ self,
2814
+ request: 'managed_jobsv1_pb2.CancelJobsRequest',
2815
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2816
+ ) -> 'managed_jobsv1_pb2.CancelJobsResponse':
2817
+ return self._managed_jobs_stub.CancelJobs(request, timeout=timeout)
2818
+
2904
2819
 
2905
2820
  @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
2906
2821
  class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
@@ -2931,6 +2846,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2931
2846
  self._requested_features = set()
2932
2847
  self._dump_final_script = False
2933
2848
  self._is_managed = False
2849
+ # Optional planner (via register_info): used under the per-cluster lock
2850
+ # to produce a fresh concrete plan when neither a reusable snapshot nor
2851
+ # a caller plan is available.
2852
+ self._planner = None
2934
2853
 
2935
2854
  # Command for running the setup script. It is only set when the
2936
2855
  # setup needs to be run outside the self._setup() and as part of
@@ -2948,6 +2867,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2948
2867
  self._requested_features)
2949
2868
  self._dump_final_script = kwargs.pop('dump_final_script', False)
2950
2869
  self._is_managed = kwargs.pop('is_managed', False)
2870
+ # Optional planner callback for a fresh plan under lock when no
2871
+ # reusable snapshot/caller plan exists. Keeps optimizer in upper layer.
2872
+ self._planner = kwargs.pop('planner', self._planner)
2951
2873
  assert not kwargs, f'Unexpected kwargs: {kwargs}'
2952
2874
 
2953
2875
  def check_resources_fit_cluster(
@@ -2974,9 +2896,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2974
2896
  # Usage Collection:
2975
2897
  usage_lib.messages.usage.update_cluster_resources(
2976
2898
  handle.launched_nodes, launched_resources)
2977
- record = global_user_state.get_cluster_from_name(cluster_name)
2978
- if record is not None:
2979
- usage_lib.messages.usage.update_cluster_status(record['status'])
2899
+ status = global_user_state.get_status_from_cluster_name(cluster_name)
2900
+ if status is not None:
2901
+ usage_lib.messages.usage.update_cluster_status(status)
2980
2902
 
2981
2903
  assert launched_resources.region is not None, handle
2982
2904
 
@@ -3115,7 +3037,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3115
3037
  colorama.Style.RESET_ALL +
3116
3038
  colorama.Style.DIM +
3117
3039
  'Check concurrent requests: ' +
3118
- 'sky api status '))
3040
+ 'sky api status -v | grep '
3041
+ f'{cluster_name}'))
3119
3042
 
3120
3043
  def _locked_provision(
3121
3044
  self,
@@ -3172,8 +3095,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3172
3095
  try:
3173
3096
  retry_provisioner = RetryingVmProvisioner(
3174
3097
  self.log_dir,
3175
- self._dag,
3176
- self._optimize_target,
3098
+ self._dag, # type: ignore[arg-type]
3099
+ self._optimize_target, # type: ignore[arg-type]
3177
3100
  self._requested_features,
3178
3101
  local_wheel_path,
3179
3102
  wheel_hash,
@@ -3204,9 +3127,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3204
3127
  gap_seconds = _RETRY_UNTIL_UP_INIT_GAP_SECONDS
3205
3128
  retry_message = ux_utils.retry_message(
3206
3129
  f'Retry after {gap_seconds:.0f}s ')
3207
- hint_message = (f'\n{retry_message} '
3208
- f'{ux_utils.log_path_hint(log_path)}'
3209
- f'{colorama.Style.RESET_ALL}')
3130
+ hint_message = (
3131
+ f'\n{retry_message} '
3132
+ f'{ux_utils.provision_hint(cluster_name)}'
3133
+ f'{colorama.Style.RESET_ALL}')
3210
3134
 
3211
3135
  # Add cluster event for retry.
3212
3136
  global_user_state.add_cluster_event(
@@ -3235,7 +3159,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3235
3159
  logger.error(
3236
3160
  ux_utils.error_message(
3237
3161
  'Failed to provision resources. '
3238
- f'{ux_utils.log_path_hint(log_path)}'))
3162
+ f'{ux_utils.provision_hint(cluster_name)}'))
3239
3163
  error_message += (
3240
3164
  '\nTo keep retrying until the cluster is up, use '
3241
3165
  'the `--retry-until-up` flag.')
@@ -3244,8 +3168,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3244
3168
  error_message + '\n' + str(e),
3245
3169
  failover_history=e.failover_history) from None
3246
3170
  if dryrun:
3247
- record = global_user_state.get_cluster_from_name(cluster_name)
3248
- return record['handle'] if record is not None else None, False
3171
+ handle = global_user_state.get_handle_from_cluster_name(
3172
+ cluster_name)
3173
+ return handle if handle is not None else None, False
3249
3174
 
3250
3175
  if config_dict['provisioning_skipped']:
3251
3176
  # Skip further provisioning.
@@ -3253,10 +3178,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3253
3178
  # ('handle', 'provision_record', 'resources_vars')
3254
3179
  # We need to return the handle - but it should be the existing
3255
3180
  # handle for the cluster.
3256
- record = global_user_state.get_cluster_from_name(cluster_name)
3257
- assert record is not None and record['handle'] is not None, (
3258
- cluster_name, record)
3259
- return record['handle'], True
3181
+ handle = global_user_state.get_handle_from_cluster_name(
3182
+ cluster_name)
3183
+ assert handle is not None, (cluster_name, handle)
3184
+ return handle, True
3260
3185
 
3261
3186
  if 'provision_record' in config_dict:
3262
3187
  # New provisioner is used here.
@@ -3279,7 +3204,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3279
3204
  global_user_state.ClusterEventType.STATUS_CHANGE)
3280
3205
 
3281
3206
  cluster_info = provisioner.post_provision_runtime_setup(
3282
- repr(handle.launched_resources.cloud),
3207
+ handle.launched_resources,
3283
3208
  resources_utils.ClusterName(handle.cluster_name,
3284
3209
  handle.cluster_name_on_cloud),
3285
3210
  handle.cluster_yaml,
@@ -3293,6 +3218,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3293
3218
  # manually or by the cloud provider.
3294
3219
  # Optimize the case where the cluster's IPs can be retrieved
3295
3220
  # from cluster_info.
3221
+ handle.cached_cluster_info = cluster_info
3296
3222
  handle.docker_user = cluster_info.docker_user
3297
3223
  handle.update_cluster_ips(max_attempts=_FETCH_IP_MAX_ATTEMPTS,
3298
3224
  cluster_info=cluster_info)
@@ -3304,7 +3230,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3304
3230
 
3305
3231
  self._update_after_cluster_provisioned(
3306
3232
  handle, to_provision_config.prev_handle, task,
3307
- prev_cluster_status, lock_id, config_hash)
3233
+ prev_cluster_status, config_hash)
3308
3234
  return handle, False
3309
3235
 
3310
3236
  cluster_config_file = config_dict['ray']
@@ -3376,7 +3302,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3376
3302
 
3377
3303
  self._update_after_cluster_provisioned(
3378
3304
  handle, to_provision_config.prev_handle, task,
3379
- prev_cluster_status, lock_id, config_hash)
3305
+ prev_cluster_status, config_hash)
3380
3306
  return handle, False
3381
3307
 
3382
3308
  def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -3394,7 +3320,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3394
3320
  prev_handle: Optional[CloudVmRayResourceHandle],
3395
3321
  task: task_lib.Task,
3396
3322
  prev_cluster_status: Optional[status_lib.ClusterStatus],
3397
- lock_id: str, config_hash: str) -> None:
3323
+ config_hash: str) -> None:
3398
3324
  usage_lib.messages.usage.update_cluster_resources(
3399
3325
  handle.launched_nodes, handle.launched_resources)
3400
3326
  usage_lib.messages.usage.update_final_cluster_status(
@@ -3406,16 +3332,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3406
3332
  # update_status will query the ray job status for all INIT /
3407
3333
  # PENDING / RUNNING jobs for the real status, since we do not
3408
3334
  # know the actual previous status of the cluster.
3409
- cmd = job_lib.JobLibCodeGen.update_status()
3410
3335
  logger.debug('Update job queue on remote cluster.')
3411
3336
  with rich_utils.safe_status(
3412
3337
  ux_utils.spinner_message('Preparing SkyPilot runtime')):
3413
- returncode, _, stderr = self.run_on_head(handle,
3414
- cmd,
3415
- require_outputs=True)
3416
- subprocess_utils.handle_returncode(returncode, cmd,
3417
- 'Failed to update job status.',
3418
- stderr)
3338
+ use_legacy = not handle.is_grpc_enabled_with_flag
3339
+
3340
+ if not use_legacy:
3341
+ try:
3342
+ request = jobsv1_pb2.UpdateStatusRequest()
3343
+ backend_utils.invoke_skylet_with_retries(
3344
+ lambda: SkyletClient(handle.get_grpc_channel()
3345
+ ).update_status(request))
3346
+ except exceptions.SkyletMethodNotImplementedError:
3347
+ use_legacy = True
3348
+
3349
+ if use_legacy:
3350
+ cmd = job_lib.JobLibCodeGen.update_status()
3351
+ returncode, _, stderr = self.run_on_head(
3352
+ handle, cmd, require_outputs=True)
3353
+ subprocess_utils.handle_returncode(
3354
+ returncode, cmd, 'Failed to update job status.', stderr)
3419
3355
  if prev_cluster_status == status_lib.ClusterStatus.STOPPED:
3420
3356
  # Safely set all the previous jobs to FAILED since the cluster
3421
3357
  # is restarted
@@ -3423,14 +3359,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3423
3359
  # 1. A job finishes RUNNING, but right before it update itself
3424
3360
  # to SUCCEEDED, the cluster is STOPPED by `sky stop`.
3425
3361
  # 2. On next `sky start`, it gets reset to FAILED.
3426
- cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
3427
- returncode, stdout, stderr = self.run_on_head(handle,
3428
- cmd,
3429
- require_outputs=True)
3430
- subprocess_utils.handle_returncode(
3431
- returncode, cmd,
3432
- 'Failed to set previously in-progress jobs to FAILED',
3433
- stdout + stderr)
3362
+ use_legacy = not handle.is_grpc_enabled_with_flag
3363
+
3364
+ if not use_legacy:
3365
+ try:
3366
+ fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
3367
+ backend_utils.invoke_skylet_with_retries(
3368
+ lambda: SkyletClient(handle.get_grpc_channel(
3369
+ )).fail_all_in_progress_jobs(fail_request))
3370
+ except exceptions.SkyletMethodNotImplementedError:
3371
+ use_legacy = True
3372
+
3373
+ if use_legacy:
3374
+ cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
3375
+ returncode, stdout, stderr = self.run_on_head(
3376
+ handle, cmd, require_outputs=True)
3377
+ subprocess_utils.handle_returncode(
3378
+ returncode, cmd,
3379
+ 'Failed to set previously in-progress jobs to FAILED',
3380
+ stdout + stderr)
3434
3381
 
3435
3382
  prev_ports = None
3436
3383
  if prev_handle is not None:
@@ -3485,8 +3432,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3485
3432
  handle.cached_external_ssh_ports, handle.docker_user,
3486
3433
  handle.ssh_user)
3487
3434
 
3488
- locks.get_lock(lock_id).force_unlock()
3489
-
3490
3435
  def _sync_workdir(self, handle: CloudVmRayResourceHandle,
3491
3436
  workdir: Union[Path, Dict[str, Any]],
3492
3437
  envs_and_secrets: Dict[str, str]) -> None:
@@ -3618,8 +3563,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3618
3563
  self._set_storage_mounts_metadata(handle.cluster_name,
3619
3564
  storage_mounts)
3620
3565
 
3566
+ def _get_num_gpus(self, task: task_lib.Task) -> int:
3567
+ if task.resources is not None:
3568
+ for resource in task.resources:
3569
+ if (resource.accelerators is not None and
3570
+ isinstance(resource.accelerators, dict)):
3571
+ if len(resource.accelerators) > 0:
3572
+ return math.ceil(
3573
+ list(resource.accelerators.values())[0])
3574
+ return 0
3575
+
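A hedged note on _get_num_gpus above, assuming accelerators are given as the usual single-entry {name: count} mapping: the first count found is rounded up with math.ceil, so fractional GPU requests still report a whole device, and tasks without accelerators report zero.

import math

# Illustrative values only, not tied to a real task_lib.Task object.
assert math.ceil(list({'A100': 4}.values())[0]) == 4
assert math.ceil(list({'A100': 0.5}.values())[0]) == 1  # fractional -> whole GPU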
3621
3576
  def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
3622
3577
  detach_setup: bool) -> None:
3578
+
3623
3579
  start = time.time()
3624
3580
 
3625
3581
  if task.setup is None:
@@ -3630,13 +3586,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3630
3586
  remote_setup_file_name = f'/tmp/sky_setup_{self.run_timestamp}'
3631
3587
  # Need this `-i` option to make sure `source ~/.bashrc` work
3632
3588
  setup_cmd = f'/bin/bash -i {remote_setup_file_name} 2>&1'
3589
+ unset_ray_env_vars = ' && '.join(
3590
+ [f'unset {var}' for var in task_codegen.UNSET_RAY_ENV_VARS])
3591
+ setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
3633
3592
  runners = handle.get_command_runners(avoid_ssh_control=True)
3634
3593
 
3635
3594
  def _setup_node(node_id: int) -> None:
3636
- setup_envs = task.envs_and_secrets
3595
+ setup_envs = task_lib.get_plaintext_envs_and_secrets(
3596
+ task.envs_and_secrets)
3637
3597
  setup_envs.update(self._skypilot_predefined_env_vars(handle))
3638
3598
  setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
3639
3599
  setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
3600
+ setup_envs[constants.SKYPILOT_SETUP_NUM_GPUS_PER_NODE] = (str(
3601
+ self._get_num_gpus(task)))
3602
+
3640
3603
  runner = runners[node_id]
3641
3604
  setup_script = log_lib.make_task_bash_script(setup,
3642
3605
  env_vars=setup_envs)
@@ -3664,7 +3627,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3664
3627
  _dump_final_script(setup_script,
3665
3628
  constants.PERSISTENT_SETUP_SCRIPT_PATH)
3666
3629
 
3667
- if detach_setup or _is_command_length_over_limit(encoded_script):
3630
+ if (detach_setup or
3631
+ backend_utils.is_command_length_over_limit(encoded_script)):
3668
3632
  _dump_final_script(setup_script)
3669
3633
  create_script_code = 'true'
3670
3634
  else:
@@ -3693,29 +3657,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3693
3657
 
3694
3658
  returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
3695
3659
 
3696
- def _load_setup_log_and_match(match_str: str) -> bool:
3697
- try:
3698
- with open(os.path.expanduser(setup_log_path),
3699
- 'r',
3700
- encoding='utf-8') as f:
3701
- return match_str.lower() in f.read().lower()
3702
- except Exception as e: # pylint: disable=broad-except
3703
- # We don't crash the setup if we cannot read the log file.
3704
- # Instead, we should retry the setup with dumping the script
3705
- # to a file to be safe.
3706
- logger.debug(
3707
- f'Failed to read setup log file {setup_log_path}: {e}')
3708
- return True
3709
-
3710
- if ((returncode == 255 and _load_setup_log_and_match('too long')) or
3711
- (returncode == 1 and
3712
- _load_setup_log_and_match('request-uri too large'))):
3713
- # If the setup script is too long, we retry it with dumping
3714
- # the script to a file and running it with SSH. We use a
3715
- # general length limit check before but it could be
3716
- # inaccurate on some systems.
3717
- # When there is a cloudflare proxy in front of the remote, it
3718
- # could cause `414 Request-URI Too Large` error.
3660
+ if _is_message_too_long(returncode, file_path=setup_log_path):
3661
+ # If the setup script is too long, we need to retry by
3662
+ # dumping the script to a file and running that script
3663
+ # on the remote cluster instead.
3719
3664
  logger.debug('Failed to run setup command inline due to '
3720
3665
  'command length limit. Dumping setup script to '
3721
3666
  'file and running it with SSH.')
@@ -3779,119 +3724,180 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3779
3724
  logger.info(
3780
3725
  ux_utils.finishing_message('Setup completed.', setup_log_path))
3781
3726
 
3727
+ def _download_file(self, handle: CloudVmRayResourceHandle,
3728
+ local_file_path: str, remote_file_path: str) -> None:
3729
+ """Syncs file from remote to local."""
3730
+ runners = handle.get_command_runners()
3731
+ head_runner = runners[0]
3732
+ head_runner.rsync(
3733
+ source=local_file_path,
3734
+ target=remote_file_path,
3735
+ up=False,
3736
+ stream_logs=False,
3737
+ )
3738
+
3782
3739
  def _exec_code_on_head(
3783
3740
  self,
3784
3741
  handle: CloudVmRayResourceHandle,
3785
3742
  codegen: str,
3786
3743
  job_id: int,
3787
- detach_run: bool = False,
3788
3744
  managed_job_dag: Optional['dag.Dag'] = None,
3745
+ managed_job_user_id: Optional[str] = None,
3789
3746
  remote_log_dir: Optional[str] = None,
3790
3747
  ) -> None:
3791
3748
  """Executes generated code on the head node."""
3792
- script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
3749
+ use_legacy = not handle.is_grpc_enabled_with_flag
3750
+ file_name = f'sky_job_{job_id}'
3751
+ script_path = os.path.join(SKY_REMOTE_APP_DIR, file_name)
3793
3752
  if remote_log_dir is None:
3794
3753
  remote_log_dir = self.log_dir
3795
3754
  remote_log_path = os.path.join(remote_log_dir, 'run.log')
3796
3755
 
3797
- cd = f'cd {SKY_REMOTE_WORKDIR}'
3756
+ def _dump_code_to_file(codegen: str,
3757
+ target_dir: str = SKY_REMOTE_APP_DIR) -> None:
3758
+ runners = handle.get_command_runners()
3759
+ head_runner = runners[0]
3760
+ with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
3761
+ fp.write(codegen)
3762
+ fp.flush()
3763
+ script_path = os.path.join(target_dir, file_name)
3764
+ # We choose to sync code + exec, because the alternative of
3765
+ # 'ray submit' may not work as it may use system python
3766
+ # (python2) to execute the script. Happens for AWS.
3767
+ head_runner.rsync(source=fp.name,
3768
+ target=script_path,
3769
+ up=True,
3770
+ stream_logs=False)
3798
3771
 
3772
+ cd = f'cd {SKY_REMOTE_WORKDIR}'
3799
3773
  mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
3800
3774
  f'touch {remote_log_path}')
3801
3775
  encoded_script = shlex.quote(codegen)
3802
3776
  create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
3803
3777
  job_submit_cmd = (
3804
- # JOB_CMD_IDENTIFIER is used for identifying the process retrieved
3805
- # with pid is the same driver process.
3778
+ # JOB_CMD_IDENTIFIER is used to verify that the process
3779
+ # retrieved by pid is the same driver process.
3806
3780
  f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
3807
3781
  f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
3808
3782
  # Do not use &>, which is not POSIX and may not work.
3809
3783
  # Note that the order of ">filename 2>&1" matters.
3810
3784
  f'> {remote_log_path} 2>&1')
3811
-
3812
3785
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
3813
3786
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
3814
3787
 
3815
- def _dump_code_to_file(codegen: str,
3816
- target_dir: str = SKY_REMOTE_APP_DIR) -> None:
3817
- runners = handle.get_command_runners()
3818
- head_runner = runners[0]
3819
- with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
3820
- fp.write(codegen)
3821
- fp.flush()
3822
- script_path = os.path.join(target_dir, f'sky_job_{job_id}')
3823
- # We choose to sync code + exec, because the alternative of 'ray
3824
- # submit' may not work as it may use system python (python2) to
3825
- # execute the script. Happens for AWS.
3826
- head_runner.rsync(source=fp.name,
3827
- target=script_path,
3828
- up=True,
3829
- stream_logs=False)
3830
-
3831
- # Should also be ealier than _is_command_length_over_limit
3788
+ # Should also be earlier than is_command_length_over_limit
3832
3789
  # Same reason as in _setup
3833
3790
  if self._dump_final_script:
3834
3791
  _dump_code_to_file(job_submit_cmd,
3835
3792
  constants.PERSISTENT_RUN_SCRIPT_DIR)
3836
3793
 
3837
- if _is_command_length_over_limit(job_submit_cmd):
3838
- _dump_code_to_file(codegen)
3839
- job_submit_cmd = f'{mkdir_code} && {code}'
3840
-
3841
- def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
3842
- if managed_job_dag is not None:
3843
- # Add the managed job to job queue database.
3844
- managed_job_codegen = managed_jobs.ManagedJobCodeGen()
3845
- managed_job_code = managed_job_codegen.set_pending(
3846
- job_id,
3847
- managed_job_dag,
3848
- skypilot_config.get_active_workspace(
3849
- force_user_workspace=True),
3850
- entrypoint=common_utils.get_current_command())
3851
- # Set the managed job to PENDING state to make sure that this
3852
- # managed job appears in the `sky jobs queue`, even if it needs
3853
- # to wait to be submitted.
3854
- # We cannot set the managed job to PENDING state in the job
3855
- # template (jobs-controller.yaml.j2), as it may need to wait for
3856
- # the run commands to be scheduled on the job controller in
3857
- # high-load cases.
3858
- job_submit_cmd += ' && ' + managed_job_code
3859
- return job_submit_cmd
3860
-
3861
- job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3794
+ if not use_legacy:
3795
+ try:
3796
+ managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
3797
+ if managed_job_dag is not None:
3798
+ workspace = skypilot_config.get_active_workspace(
3799
+ force_user_workspace=True)
3800
+ entrypoint = common_utils.get_current_command()
3801
+
3802
+ managed_job_tasks: List[jobsv1_pb2.ManagedJobTask] = []
3803
+ for task_id, task in enumerate(managed_job_dag.tasks):
3804
+ resources_str = backend_utils.get_task_resources_str(
3805
+ task, is_managed_job=True)
3806
+ managed_job_tasks.append(
3807
+ jobsv1_pb2.ManagedJobTask(
3808
+ task_id=task_id,
3809
+ name=task.name,
3810
+ resources_str=resources_str,
3811
+ metadata_json=task.metadata_json))
3812
+
3813
+ managed_job_info = jobsv1_pb2.ManagedJobInfo(
3814
+ name=managed_job_dag.name,
3815
+ pool=managed_job_dag.pool,
3816
+ workspace=workspace,
3817
+ entrypoint=entrypoint,
3818
+ tasks=managed_job_tasks,
3819
+ user_id=managed_job_user_id)
3820
+
3821
+ if backend_utils.is_command_length_over_limit(codegen):
3822
+ _dump_code_to_file(codegen)
3823
+ queue_job_request = jobsv1_pb2.QueueJobRequest(
3824
+ job_id=job_id,
3825
+ # codegen not set - server assumes script uploaded
3826
+ remote_log_dir=remote_log_dir,
3827
+ managed_job=managed_job_info,
3828
+ script_path=script_path)
3829
+ else:
3830
+ queue_job_request = jobsv1_pb2.QueueJobRequest(
3831
+ job_id=job_id,
3832
+ codegen=codegen,
3833
+ remote_log_dir=remote_log_dir,
3834
+ managed_job=managed_job_info,
3835
+ script_path=script_path)
3836
+
3837
+ backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
3838
+ handle.get_grpc_channel()).queue_job(queue_job_request))
3839
+ except exceptions.SkyletMethodNotImplementedError:
3840
+ use_legacy = True
3841
+
3842
+ if use_legacy:
3843
+ if backend_utils.is_command_length_over_limit(job_submit_cmd):
3844
+ _dump_code_to_file(codegen)
3845
+ job_submit_cmd = f'{mkdir_code} && {code}'
3846
+
3847
+ def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
3848
+ if managed_job_dag is not None:
3849
+ # Add the managed job to job queue database.
3850
+ managed_job_codegen = managed_jobs.ManagedJobCodeGen()
3851
+ managed_job_code = managed_job_codegen.set_pending(
3852
+ job_id,
3853
+ managed_job_dag,
3854
+ skypilot_config.get_active_workspace(
3855
+ force_user_workspace=True),
3856
+ entrypoint=common_utils.get_current_command(),
3857
+ user_hash=managed_job_user_id)
3858
+ # Set the managed job to PENDING state to make sure that
3859
+ # this managed job appears in the `sky jobs queue`, even
3860
+ # if it needs to wait to be submitted.
3861
+ # We cannot set the managed job to PENDING state in the
3862
+ # job template (jobs-controller.yaml.j2), as it may need
3863
+ # to wait for the run commands to be scheduled on the job
3864
+ # controller in high-load cases.
3865
+ job_submit_cmd += ' && ' + managed_job_code
3866
+ return job_submit_cmd
3862
3867
 
3863
- returncode, stdout, stderr = self.run_on_head(handle,
3864
- job_submit_cmd,
3865
- stream_logs=False,
3866
- require_outputs=True)
3867
- # Happens when someone calls `sky exec` but remote is outdated for
3868
- # running a job. Necessitating calling `sky launch`.
3869
- backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3870
- handle.cluster_name)
3871
- output = stdout + stderr
3872
- if ((returncode == 255 and 'too long' in output.lower()) or
3873
- (returncode == 1 and 'request-uri too large' in output.lower())):
3874
- # If the generated script is too long, we retry it with dumping
3875
- # the script to a file and running it with SSH. We use a general
3876
- # length limit check before but it could be inaccurate on some
3877
- # systems.
3878
- # When there is a cloudflare proxy in front of the remote, it could
3879
- # cause `414 Request-URI Too Large` error.
3880
- logger.debug('Failed to submit job due to command length limit. '
3881
- 'Dumping job to file and running it with SSH. '
3882
- f'Output: {output}')
3883
- _dump_code_to_file(codegen)
3884
- job_submit_cmd = f'{mkdir_code} && {code}'
3885
3868
  job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3869
+
3886
3870
  returncode, stdout, stderr = self.run_on_head(handle,
3887
3871
  job_submit_cmd,
3888
3872
  stream_logs=False,
3889
3873
  require_outputs=True)
3874
+ # Happens when someone calls `sky exec` but remote is outdated for
3875
+ # running a job. Necessitating calling `sky launch`.
3876
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3877
+ handle.cluster_name)
3878
+ output = stdout + stderr
3879
+ if _is_message_too_long(returncode, output=output):
3880
+ # If the job submit script is too long, we need to retry by
3881
+ # dumping the script to a file and running that script
3882
+ # on the remote cluster instead.
3883
+ logger.debug(
3884
+ 'Failed to submit job due to command length limit. '
3885
+ 'Dumping job to file and running it with SSH. '
3886
+ f'Output: {output}')
3887
+ _dump_code_to_file(codegen)
3888
+ job_submit_cmd = f'{mkdir_code} && {code}'
3889
+ job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3890
+ returncode, stdout, stderr = self.run_on_head(
3891
+ handle,
3892
+ job_submit_cmd,
3893
+ stream_logs=False,
3894
+ require_outputs=True)
3890
3895
 
3891
- subprocess_utils.handle_returncode(returncode,
3892
- job_submit_cmd,
3893
- f'Failed to submit job {job_id}.',
3894
- stderr=stdout + stderr)
3896
+ subprocess_utils.handle_returncode(
3897
+ returncode,
3898
+ job_submit_cmd,
3899
+ f'Failed to submit job {job_id}.',
3900
+ stderr=stdout + stderr)
3895
3901
 
3896
3902
  controller = controller_utils.Controllers.from_name(handle.cluster_name)
3897
3903
  if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
@@ -3900,61 +3906,74 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3900
3906
  logger.info(
3901
3907
  ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
3902
3908
  rich_utils.stop_safe_status()
3903
- if not detach_run:
3904
- if (handle.cluster_name == controller_utils.Controllers.
3905
- JOBS_CONTROLLER.value.cluster_name):
3906
- self.tail_managed_job_logs(handle, job_id)
3907
- else:
3908
- # Sky logs. Not using subprocess.run since it will make the
3909
- # ssh keep connected after ctrl-c.
3910
- self.tail_logs(handle, job_id)
3911
3909
 
3912
3910
  def _add_job(self, handle: CloudVmRayResourceHandle,
3913
3911
  job_name: Optional[str], resources_str: str,
3914
3912
  metadata: str) -> Tuple[int, str]:
3915
- code = job_lib.JobLibCodeGen.add_job(
3916
- job_name=job_name,
3917
- username=common_utils.get_user_hash(),
3918
- run_timestamp=self.run_timestamp,
3919
- resources_str=resources_str,
3920
- metadata=metadata)
3921
- returncode, result_str, stderr = self.run_on_head(handle,
3922
- code,
3923
- stream_logs=False,
3924
- require_outputs=True,
3925
- separate_stderr=True)
3926
- # Happens when someone calls `sky exec` but remote is outdated for
3927
- # adding a job. Necessitating calling `sky launch`.
3928
- backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3929
- handle.cluster_name)
3930
- # TODO(zhwu): this sometimes will unexpectedly fail, we can add
3931
- # retry for this, after we figure out the reason.
3932
- subprocess_utils.handle_returncode(returncode, code,
3933
- 'Failed to fetch job id.', stderr)
3934
- try:
3935
- job_id_match = _JOB_ID_PATTERN.search(result_str)
3936
- if job_id_match is not None:
3937
- job_id = int(job_id_match.group(1))
3938
- else:
3939
- # For backward compatibility.
3940
- job_id = int(result_str)
3941
- log_dir_match = _LOG_DIR_PATTERN.search(result_str)
3942
- if log_dir_match is not None:
3943
- log_dir = log_dir_match.group(1).strip()
3944
- else:
3945
- # For backward compatibility, use the same log dir as local.
3946
- log_dir = self.log_dir
3947
- except ValueError as e:
3948
- logger.error(stderr)
3949
- raise ValueError(f'Failed to parse job id: {result_str}; '
3950
- f'Returncode: {returncode}') from e
3913
+ use_legacy = not handle.is_grpc_enabled_with_flag
3914
+
3915
+ if not use_legacy:
3916
+ try:
3917
+ request = jobsv1_pb2.AddJobRequest(
3918
+ job_name=job_name,
3919
+ username=common_utils.get_user_hash(),
3920
+ run_timestamp=self.run_timestamp,
3921
+ resources_str=resources_str,
3922
+ metadata=metadata)
3923
+ response = backend_utils.invoke_skylet_with_retries(
3924
+ lambda: SkyletClient(handle.get_grpc_channel()).add_job(
3925
+ request))
3926
+ job_id = response.job_id
3927
+ log_dir = response.log_dir
3928
+ return job_id, log_dir
3929
+ except exceptions.SkyletMethodNotImplementedError:
3930
+ use_legacy = True
3931
+
3932
+ if use_legacy:
3933
+ code = job_lib.JobLibCodeGen.add_job(
3934
+ job_name=job_name,
3935
+ username=common_utils.get_user_hash(),
3936
+ run_timestamp=self.run_timestamp,
3937
+ resources_str=resources_str,
3938
+ metadata=metadata)
3939
+ returncode, result_str, stderr = self.run_on_head(
3940
+ handle,
3941
+ code,
3942
+ stream_logs=False,
3943
+ require_outputs=True,
3944
+ separate_stderr=True)
3945
+ # Happens when someone calls `sky exec` but remote is outdated for
3946
+ # adding a job. Necessitating calling `sky launch`.
3947
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3948
+ handle.cluster_name)
3949
+ # TODO(zhwu): this sometimes will unexpectedly fail, we can add
3950
+ # retry for this, after we figure out the reason.
3951
+ subprocess_utils.handle_returncode(returncode, code,
3952
+ 'Failed to fetch job id.',
3953
+ stderr)
3954
+ try:
3955
+ job_id_match = _JOB_ID_PATTERN.search(result_str)
3956
+ if job_id_match is not None:
3957
+ job_id = int(job_id_match.group(1))
3958
+ else:
3959
+ # For backward compatibility.
3960
+ job_id = int(result_str)
3961
+ log_dir_match = _LOG_DIR_PATTERN.search(result_str)
3962
+ if log_dir_match is not None:
3963
+ log_dir = log_dir_match.group(1).strip()
3964
+ else:
3965
+ # For backward compatibility, use the same log dir as local.
3966
+ log_dir = self.log_dir
3967
+ except ValueError as e:
3968
+ logger.error(stderr)
3969
+ raise ValueError(f'Failed to parse job id: {result_str}; '
3970
+ f'Returncode: {returncode}') from e
3951
3971
  return job_id, log_dir
3952
3972
 
3953
3973
  def _execute(
3954
3974
  self,
3955
3975
  handle: CloudVmRayResourceHandle,
3956
3976
  task: task_lib.Task,
3957
- detach_run: bool,
3958
3977
  dryrun: bool = False,
3959
3978
  ) -> Optional[int]:
3960
3979
  """Executes the task on the cluster.
@@ -4006,12 +4025,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4006
4025
  num_actual_nodes = task.num_nodes * handle.num_ips_per_node
4007
4026
  # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
4008
4027
  if num_actual_nodes > 1:
4009
- self._execute_task_n_nodes(handle, task_copy, job_id, detach_run,
4010
- log_dir)
4028
+ self._execute_task_n_nodes(handle, task_copy, job_id, log_dir)
4011
4029
  else:
4012
4030
  # Case: task_lib.Task(run, num_nodes=1)
4013
- self._execute_task_one_node(handle, task_copy, job_id, detach_run,
4014
- log_dir)
4031
+ self._execute_task_one_node(handle, task_copy, job_id, log_dir)
4015
4032
 
4016
4033
  return job_id
4017
4034
 
@@ -4054,7 +4071,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4054
4071
  is_identity_mismatch_and_purge = False
4055
4072
  try:
4056
4073
  backend_utils.check_owner_identity(cluster_name)
4057
- except exceptions.ClusterOwnerIdentityMismatchError as e:
4074
+ except (exceptions.ClusterOwnerIdentityMismatchError,
4075
+ exceptions.CloudUserIdentityError) as e:
4058
4076
  if purge:
4059
4077
  logger.error(e)
4060
4078
  verbed = 'terminated' if terminate else 'stopped'
@@ -4068,15 +4086,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4068
4086
  else:
4069
4087
  raise
4070
4088
  lock_id = backend_utils.cluster_status_lock_id(cluster_name)
4071
- lock = locks.get_lock(lock_id)
4089
+ lock = locks.get_lock(lock_id, timeout=1)
4072
4090
  # Retry in case new cluster operation comes in and holds the lock
4073
4091
  # right after the lock is removed.
4074
4092
  n_attempts = 2
4075
4093
  while True:
4076
4094
  n_attempts -= 1
4077
- # In case other running cluster operations are still holding the
4078
- # lock.
4079
- lock.force_unlock()
4080
4095
  # We have to kill the cluster requests, because `down` and `stop`
4081
4096
  # should be higher priority than the cluster requests, and we should
4082
4097
  # release the lock from other requests.
@@ -4094,6 +4109,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4094
4109
  'Failed to kill other launch requests for the '
4095
4110
  f'cluster {handle.cluster_name}: '
4096
4111
  f'{common_utils.format_exception(e, use_bracket=True)}')
4112
+ # In case other running cluster operations are still holding the
4113
+ # lock.
4114
+ lock.force_unlock()
4097
4115
  try:
4098
4116
  with lock:
4099
4117
  self.teardown_no_lock(
@@ -4126,6 +4144,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4126
4144
  job_ids: Optional[List[int]] = None,
4127
4145
  stream_logs: bool = True
4128
4146
  ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]:
4147
+ if handle.is_grpc_enabled_with_flag:
4148
+ try:
4149
+ request = jobsv1_pb2.GetJobStatusRequest(job_ids=job_ids)
4150
+ response = backend_utils.invoke_skylet_with_retries(
4151
+ lambda: SkyletClient(handle.get_grpc_channel()
4152
+ ).get_job_status(request))
4153
+ statuses: Dict[Optional[int], Optional[job_lib.JobStatus]] = {
4154
+ job_id: job_lib.JobStatus.from_protobuf(proto_status)
4155
+ for job_id, proto_status in response.job_statuses.items()
4156
+ }
4157
+ return statuses
4158
+ except exceptions.SkyletMethodNotImplementedError:
4159
+ pass
4160
+
4129
4161
  code = job_lib.JobLibCodeGen.get_job_status(job_ids)
4130
4162
  returncode, stdout, stderr = self.run_on_head(handle,
4131
4163
  code,
@@ -4146,16 +4178,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4146
4178
 
4147
4179
  See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
4148
4180
  """
4149
- code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all, user_hash)
4150
- returncode, stdout, _ = self.run_on_head(handle,
4151
- code,
4152
- stream_logs=False,
4153
- require_outputs=True)
4154
- subprocess_utils.handle_returncode(
4155
- returncode, code,
4156
- f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout)
4157
-
4158
- cancelled_ids = message_utils.decode_payload(stdout)
4181
+ use_legacy = not handle.is_grpc_enabled_with_flag
4182
+
4183
+ if not use_legacy:
4184
+ try:
4185
+ request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
4186
+ cancel_all=cancel_all,
4187
+ user_hash=user_hash)
4188
+ response = backend_utils.invoke_skylet_with_retries(
4189
+ lambda: SkyletClient(handle.get_grpc_channel()).cancel_jobs(
4190
+ request))
4191
+ cancelled_ids = response.cancelled_job_ids
4192
+ except exceptions.SkyletMethodNotImplementedError:
4193
+ use_legacy = True
4194
+
4195
+ if use_legacy:
4196
+ code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all,
4197
+ user_hash)
4198
+ returncode, stdout, _ = self.run_on_head(handle,
4199
+ code,
4200
+ stream_logs=False,
4201
+ require_outputs=True)
4202
+ subprocess_utils.handle_returncode(
4203
+ returncode, code,
4204
+ f'Failed to cancel jobs on cluster {handle.cluster_name}.',
4205
+ stdout)
4206
+ cancelled_ids = message_utils.decode_payload(stdout)
4159
4207
  if cancelled_ids:
4160
4208
  logger.info(
4161
4209
  f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
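
Job cancellation follows the pattern this file uses for the gRPC migration: try the Skylet RPC, flip a use_legacy flag if the remote skylet predates the method, and only then run the code-gen-over-SSH path. A generic sketch of that control flow with stand-in callables (none of these names are SkyPilot APIs):

from typing import Callable, List


class MethodNotImplemented(Exception):
    """Stand-in for exceptions.SkyletMethodNotImplementedError."""


def cancel_jobs(grpc_cancel: Callable[[], List[int]],
                legacy_cancel: Callable[[], List[int]]) -> List[int]:
    """Prefer the new RPC; run the code-gen path only if the skylet is old."""
    use_legacy = False
    cancelled_ids: List[int] = []
    try:
        cancelled_ids = grpc_cancel()
    except MethodNotImplemented:
        use_legacy = True
    if use_legacy:
        cancelled_ids = legacy_cancel()
    return cancelled_ids


def _old_skylet() -> List[int]:
    raise MethodNotImplemented()


print(cancel_jobs(lambda: [3, 4], lambda: []))  # new skylet: RPC result is used
print(cancel_jobs(_old_skylet, lambda: [7]))    # old skylet: legacy fallback
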
@@ -4172,20 +4220,48 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4172
4220
  Returns:
4173
4221
  A dictionary mapping job_id to log path.
4174
4222
  """
4175
- code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
4176
- returncode, job_to_dir, stderr = self.run_on_head(handle,
4223
+ job_to_dir: Dict[str, str] = {}
4224
+ use_legacy = not handle.is_grpc_enabled_with_flag
4225
+
4226
+ if not use_legacy:
4227
+ try:
4228
+ int_job_ids = []
4229
+ if job_ids:
4230
+ for str_job_id in job_ids:
4231
+ if str_job_id.isdigit():
4232
+ int_job_ids.append(int(str_job_id))
4233
+ request = jobsv1_pb2.GetLogDirsForJobsRequest(
4234
+ job_ids=int_job_ids)
4235
+ response = backend_utils.invoke_skylet_with_retries(
4236
+ lambda: SkyletClient(handle.get_grpc_channel()
4237
+ ).get_log_dirs_for_jobs(request))
4238
+ job_log_dirs = response.job_log_dirs
4239
+ if not job_log_dirs:
4240
+ logger.info(f'{colorama.Fore.YELLOW}'
4241
+ 'No matching log directories found'
4242
+ f'{colorama.Style.RESET_ALL}')
4243
+ return {}
4244
+ for job_id, log_dir in job_log_dirs.items():
4245
+ # Convert to string for backwards compatibility
4246
+ job_to_dir[str(job_id)] = log_dir
4247
+ except exceptions.SkyletMethodNotImplementedError:
4248
+ use_legacy = True
4249
+
4250
+ if use_legacy:
4251
+ code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
4252
+ returncode, stdout, stderr = self.run_on_head(handle,
4177
4253
  code,
4178
4254
  stream_logs=False,
4179
4255
  require_outputs=True,
4180
4256
  separate_stderr=True)
4181
- subprocess_utils.handle_returncode(returncode, code,
4182
- 'Failed to sync logs.', stderr)
4183
- job_to_dir: Dict[str, str] = message_utils.decode_payload(job_to_dir)
4184
- if not job_to_dir:
4185
- logger.info(f'{colorama.Fore.YELLOW}'
4186
- 'No matching log directories found'
4187
- f'{colorama.Style.RESET_ALL}')
4188
- return {}
4257
+ subprocess_utils.handle_returncode(returncode, code,
4258
+ 'Failed to sync logs.', stderr)
4259
+ job_to_dir = message_utils.decode_payload(stdout)
4260
+ if not job_to_dir:
4261
+ logger.info(f'{colorama.Fore.YELLOW}'
4262
+ 'No matching log directories found'
4263
+ f'{colorama.Style.RESET_ALL}')
4264
+ return {}
4189
4265
 
4190
4266
  job_ids = list(job_to_dir.keys())
4191
4267
  dirs = list(job_to_dir.values())
@@ -4195,9 +4271,23 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4195
4271
  (dir if constants.SKY_LOGS_DIRECTORY in dir else os.path.join(
4196
4272
  constants.SKY_LOGS_DIRECTORY, dir)) for dir in dirs
4197
4273
  ]
4198
- local_log_dirs = [(dir.replace(constants.SKY_LOGS_DIRECTORY, local_dir)
4199
- if constants.SKY_LOGS_DIRECTORY in dir else
4200
- os.path.join(local_dir, dir)) for dir in dirs]
4274
+ # Include cluster name in local log directory path to avoid conflicts
4275
+ # when the same job_id exists on different clusters
4276
+ cluster_name = handle.cluster_name
4277
+ local_log_dirs = []
4278
+ for remote_log_dir in dirs:
4279
+ if constants.SKY_LOGS_DIRECTORY in remote_log_dir:
4280
+ # Extract the job-specific directory name from the full path
4281
+ # e.g., ~/sky_logs/1-job_name -> 1-job_name
4282
+ job_dir = remote_log_dir.replace(constants.SKY_LOGS_DIRECTORY,
4283
+ '').lstrip('/')
4284
+ local_log_dir = os.path.join(local_dir, cluster_name, job_dir)
4285
+ else:
4286
+ # remote_log_dir is already just the job directory name (e.g.,
4287
+ # "1-job_name")
4288
+ local_log_dir = os.path.join(local_dir, cluster_name,
4289
+ remote_log_dir)
4290
+ local_log_dirs.append(local_log_dir)
4201
4291
 
4202
4292
  runners = handle.get_command_runners()
4203
4293
 
@@ -4261,6 +4351,28 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4261
4351
  The exit code of the tail command. Returns code 100 if the job has
4262
4352
  failed. See exceptions.JobExitCode for possible return codes.
4263
4353
  """
4354
+ if handle.is_grpc_enabled_with_flag:
4355
+ last_exit_code = 0
4356
+ try:
4357
+ request = jobsv1_pb2.TailLogsRequest(
4358
+ job_id=job_id,
4359
+ managed_job_id=managed_job_id,
4360
+ follow=follow,
4361
+ tail=tail)
4362
+ for resp in backend_utils.invoke_skylet_streaming_with_retries(
4363
+ lambda: SkyletClient(handle.get_grpc_channel()
4364
+ ).tail_logs(request, timeout=None)):
4365
+ if resp.log_line:
4366
+ print(resp.log_line, end='', flush=True)
4367
+ last_exit_code = resp.exit_code
4368
+ return last_exit_code
4369
+ except exceptions.SkyletMethodNotImplementedError:
4370
+ pass
4371
+ except grpc.RpcError as e:
4372
+ if e.code() == grpc.StatusCode.CANCELLED:
4373
+ return last_exit_code
4374
+ raise e
4375
+
4264
4376
  code = job_lib.JobLibCodeGen.tail_logs(job_id,
4265
4377
  managed_job_id=managed_job_id,
4266
4378
  follow=follow,
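
Log tailing over gRPC (above) is a server-streaming call: each response carries an optional log line plus the latest exit code, and a CANCELLED status (for example when the user stops following) is treated as a normal exit with the last code seen. A standalone sketch of that consumer loop with a fake stream in place of a real channel; LogChunk and StreamCancelled are illustrative stand-ins:

import dataclasses
from typing import Iterable


@dataclasses.dataclass
class LogChunk:
    """Stand-in for a single streamed tail-logs response message."""
    log_line: str
    exit_code: int


class StreamCancelled(Exception):
    """Stand-in for a grpc.RpcError with StatusCode.CANCELLED."""


def consume_tail(stream: Iterable[LogChunk]) -> int:
    """Print streamed log lines and return the exit code of the last chunk."""
    last_exit_code = 0
    try:
        for chunk in stream:
            if chunk.log_line:
                print(chunk.log_line, end='', flush=True)
            last_exit_code = chunk.exit_code
    except StreamCancelled:
        # Follow mode interrupted by the user: not an error.
        pass
    return last_exit_code


def fake_stream() -> Iterable[LogChunk]:
    yield LogChunk('hello\n', 0)
    yield LogChunk('done\n', 100)  # 100: job failed, per the docstring above


print('exit code:', consume_tail(fake_stream()))
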
@@ -4298,6 +4410,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4298
4410
  tail: Optional[int] = None) -> int:
4299
4411
  # if job_name is not None, job_id should be None
4300
4412
  assert job_name is None or job_id is None, (job_name, job_id)
4413
+ # TODO(kevin): Migrate stream_logs to gRPC
4301
4414
  code = managed_jobs.ManagedJobCodeGen.stream_logs(
4302
4415
  job_name, job_id, follow, controller, tail)
4303
4416
 
@@ -4343,20 +4456,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4343
4456
  assert job_name is None or job_id is None, (job_name, job_id)
4344
4457
 
4345
4458
  if job_id is None:
4346
- # generate code to get the job_id
4459
+ # get the job_id
4347
4460
  # if job_name is None, get all job_ids
4348
4461
  # TODO: Only get the latest job_id, since that's the only one we use
4349
- code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
4350
- job_name=job_name)
4351
- returncode, job_ids, stderr = self.run_on_head(handle,
4352
- code,
4353
- stream_logs=False,
4354
- require_outputs=True,
4355
- separate_stderr=True)
4356
- subprocess_utils.handle_returncode(returncode, code,
4357
- 'Failed to sync down logs.',
4358
- stderr)
4359
- job_ids = message_utils.decode_payload(job_ids)
4462
+
4463
+ use_legacy = not handle.is_grpc_enabled_with_flag
4464
+ logger.info(f'handle.is_grpc_enabled_with_flag: '
4465
+ f'{handle.is_grpc_enabled_with_flag}')
4466
+ if not use_legacy:
4467
+ try:
4468
+ request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
4469
+ job_name=job_name)
4470
+ response = backend_utils.invoke_skylet_with_retries(
4471
+ lambda: SkyletClient(handle.get_grpc_channel(
4472
+ )).get_all_managed_job_ids_by_name(request))
4473
+ job_ids = list(response.job_ids)
4474
+ except exceptions.SkyletMethodNotImplementedError:
4475
+ use_legacy = True
4476
+
4477
+ if use_legacy:
4478
+ code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
4479
+ job_name=job_name)
4480
+ returncode, job_ids_payload, stderr = self.run_on_head(
4481
+ handle,
4482
+ code,
4483
+ stream_logs=False,
4484
+ require_outputs=True,
4485
+ separate_stderr=True)
4486
+ subprocess_utils.handle_returncode(returncode, code,
4487
+ 'Failed to sync down logs.',
4488
+ stderr)
4489
+ job_ids = message_utils.decode_payload(job_ids_payload)
4360
4490
  if not job_ids:
4361
4491
  logger.info(f'{colorama.Fore.YELLOW}'
4362
4492
  'No matching job found'
@@ -4384,18 +4514,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4384
4514
  else:
4385
4515
  # get the run_timestamp
4386
4516
  # the function takes in [job_id]
4387
- code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs([str(job_id)])
4388
- returncode, run_timestamps_payload, stderr = self.run_on_head(
4389
- handle,
4390
- code,
4391
- stream_logs=False,
4392
- require_outputs=True,
4393
- separate_stderr=True)
4394
- subprocess_utils.handle_returncode(returncode, code,
4395
- 'Failed to sync logs.', stderr)
4396
- # returns with a dict of {job_id: run_timestamp}
4397
- run_timestamps = message_utils.decode_payload(
4398
- run_timestamps_payload)
4517
+ use_legacy = not handle.is_grpc_enabled_with_flag
4518
+ if not use_legacy:
4519
+ try:
4520
+ log_dirs_request = jobsv1_pb2.GetLogDirsForJobsRequest(
4521
+ job_ids=[job_id])
4522
+ log_dirs_response = (
4523
+ backend_utils.invoke_skylet_with_retries(
4524
+ lambda: SkyletClient(handle.get_grpc_channel(
4525
+ )).get_log_dirs_for_jobs(log_dirs_request)))
4526
+ job_log_dirs = log_dirs_response.job_log_dirs
4527
+ # Convert back to the expected format
4528
+ # {job_id: run_timestamp}
4529
+ run_timestamps = {}
4530
+ for jid, log_dir in job_log_dirs.items():
4531
+ run_timestamps[int(jid)] = log_dir
4532
+ except exceptions.SkyletMethodNotImplementedError:
4533
+ use_legacy = True
4534
+
4535
+ if use_legacy:
4536
+ code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(
4537
+ [str(job_id)])
4538
+ returncode, run_timestamps_payload, stderr = self.run_on_head(
4539
+ handle,
4540
+ code,
4541
+ stream_logs=False,
4542
+ require_outputs=True,
4543
+ separate_stderr=True)
4544
+ subprocess_utils.handle_returncode(returncode, code,
4545
+ 'Failed to sync logs.',
4546
+ stderr)
4547
+ # returns with a dict of {job_id: run_timestamp}
4548
+ run_timestamps = message_utils.decode_payload(
4549
+ run_timestamps_payload)
4399
4550
  if not run_timestamps:
4400
4551
  logger.info(f'{colorama.Fore.YELLOW}'
4401
4552
  'No matching log directories found'
@@ -4462,11 +4613,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4462
4613
  exist_ok=True)
4463
4614
  log_file = os.path.join(local_log_dir, 'run.log')
4464
4615
 
4465
- code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
4466
- job_id=job_id,
4467
- follow=False,
4468
- controller=False)
4469
-
4616
+ # TODO(kevin): Migrate stream_logs to gRPC
4617
+ code = managed_jobs.ManagedJobCodeGen.stream_logs(
4618
+ job_name=None,
4619
+ job_id=int(job_id),
4620
+ follow=False,
4621
+ controller=False)
4470
4622
  # With the stdin=subprocess.DEVNULL, the ctrl-c will not
4471
4623
  # kill the process, so we need to handle it manually here.
4472
4624
  if threading.current_thread() is threading.main_thread():
@@ -4507,6 +4659,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4507
4659
  Raises:
4508
4660
  RuntimeError: If the cluster fails to be terminated/stopped.
4509
4661
  """
4662
+ try:
4663
+ handle.close_skylet_ssh_tunnel()
4664
+ except Exception as e: # pylint: disable=broad-except
4665
+ # Not critical to the cluster teardown, just log a warning.
4666
+ logger.warning(
4667
+ 'Failed to close Skylet SSH tunnel for cluster '
4668
+ f'{handle.cluster_name}: '
4669
+ f'{common_utils.format_exception(e, use_bracket=True)}')
4670
+
4510
4671
  exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
4511
4672
  # We have to kill the cluster requests again within the lock, because
4512
4673
  # any pending requests on the same cluster should be cancelled after
@@ -4543,7 +4704,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4543
4704
  # observed in AWS. See also
4544
4705
  # _LAUNCH_DOUBLE_CHECK_WINDOW in backend_utils.py.
4545
4706
  force_refresh_statuses={status_lib.ClusterStatus.INIT},
4546
- acquire_per_cluster_status_lock=False))
4707
+ cluster_lock_already_held=True,
4708
+ retry_if_missing=False))
4547
4709
  cluster_status_fetched = True
4548
4710
  except exceptions.ClusterStatusFetchingError:
4549
4711
  logger.warning(
@@ -4551,10 +4713,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4551
4713
  f'{handle.cluster_name!r}. Assuming the cluster is still '
4552
4714
  'up.')
4553
4715
  if not cluster_status_fetched:
4554
- record = global_user_state.get_cluster_from_name(
4716
+ status = global_user_state.get_status_from_cluster_name(
4555
4717
  handle.cluster_name)
4556
- prev_cluster_status = record[
4557
- 'status'] if record is not None else None
4718
+ prev_cluster_status = status if status is not None else None
4558
4719
  if prev_cluster_status is None:
4559
4720
  # When the cluster is not in the cluster table, we guarantee that
4560
4721
  # all related resources / cache / config are cleaned up, i.e. it
@@ -4786,7 +4947,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4786
4947
  config['provider'])
4787
4948
  ports_cleaned_up = True
4788
4949
  except exceptions.NotSupportedError:
4789
- pass
4950
+ ports_cleaned_up = True
4790
4951
  except exceptions.PortDoesNotExistError:
4791
4952
  logger.debug('Ports do not exist. Skipping cleanup.')
4792
4953
  except Exception as e: # pylint: disable=broad-except
@@ -4811,7 +4972,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4811
4972
  failover)
4812
4973
  custom_multi_network_cleaned_up = True
4813
4974
  except exceptions.NotSupportedError:
4814
- pass
4975
+ custom_multi_network_cleaned_up = True
4815
4976
  except Exception as e: # pylint: disable=broad-except
4816
4977
  if purge:
4817
4978
  msg = common_utils.format_exception(e, use_bracket=True)
@@ -4913,7 +5074,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4913
5074
  cluster_yaml_path = handle.cluster_yaml
4914
5075
  handle.cluster_yaml = None
4915
5076
  global_user_state.update_cluster_handle(handle.cluster_name, handle)
4916
- global_user_state.remove_cluster_yaml(handle.cluster_name)
5077
+ # Removing the cluster YAML can cause some unexpected stability issues.
5078
+ # See #5011.
5079
+ # global_user_state.remove_cluster_yaml(handle.cluster_name)
4917
5080
  common_utils.remove_file_if_exists(cluster_yaml_path)
4918
5081
 
4919
5082
  def set_autostop(self,
@@ -4974,9 +5137,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4974
5137
  autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
4975
5138
  down=down,
4976
5139
  )
4977
- backend_utils.invoke_skylet_with_retries(
4978
- handle, lambda: SkyletClient(handle.get_grpc_channel()).
4979
- set_autostop(request))
5140
+ backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
5141
+ handle.get_grpc_channel()).set_autostop(request))
4980
5142
  else:
4981
5143
  code = autostop_lib.AutostopCodeGen.set_autostop(
4982
5144
  idle_minutes_to_autostop, self.NAME, wait_for, down)
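
backend_utils.invoke_skylet_with_retries now takes only a zero-argument thunk that builds the SkyletClient and issues the call; the handle argument is gone. A generic sketch of that calling convention; the real helper's retry policy is not part of this diff, so the backoff below is an assumption:

import time
from typing import Callable, TypeVar

T = TypeVar('T')


def invoke_with_retries(fn: Callable[[], T],
                        max_attempts: int = 3,
                        backoff_s: float = 0.1) -> T:
    """Retry a zero-argument thunk; only the calling convention is mirrored here."""
    last_exc: Exception = RuntimeError('unreachable')
    for attempt in range(max_attempts):
        try:
            return fn()
        except ConnectionError as e:  # stand-in for transient RPC failures
            last_exc = e
            time.sleep(backoff_s * (attempt + 1))
    raise last_exc


state = {'calls': 0}


def flaky_rpc() -> str:
    state['calls'] += 1
    if state['calls'] < 2:
        raise ConnectionError('transient')
    return 'autostop set'


print(invoke_with_retries(flaky_rpc))
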
@@ -5015,8 +5177,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5015
5177
  try:
5016
5178
  request = autostopv1_pb2.IsAutostoppingRequest()
5017
5179
  response = backend_utils.invoke_skylet_with_retries(
5018
- handle, lambda: SkyletClient(handle.get_grpc_channel()).
5019
- is_autostopping(request))
5180
+ lambda: SkyletClient(handle.get_grpc_channel()
5181
+ ).is_autostopping(request))
5020
5182
  return response.is_autostopping
5021
5183
  except Exception as e: # pylint: disable=broad-except
5022
5184
  # The cluster may have been terminated, causing the gRPC call
@@ -5128,7 +5290,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5128
5290
  exceptions.InvalidClusterNameError: If the cluster name is invalid.
5129
5291
  # TODO(zhwu): complete the list of exceptions.
5130
5292
  """
5131
- record = global_user_state.get_cluster_from_name(cluster_name)
5293
+ record = global_user_state.get_cluster_from_name(
5294
+ cluster_name, include_user_info=False, summary_response=True)
5132
5295
  if record is None:
5133
5296
  handle_before_refresh = None
5134
5297
  status_before_refresh = None
@@ -5148,7 +5311,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5148
5311
  record = backend_utils.refresh_cluster_record(
5149
5312
  cluster_name,
5150
5313
  force_refresh_statuses={status_lib.ClusterStatus.INIT},
5151
- acquire_per_cluster_status_lock=False,
5314
+ cluster_lock_already_held=True,
5315
+ include_user_info=False,
5316
+ summary_response=True,
5152
5317
  )
5153
5318
  if record is not None:
5154
5319
  prev_cluster_status = record['status']
@@ -5264,33 +5429,41 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5264
5429
  common_utils.check_cluster_name_is_valid(cluster_name)
5265
5430
 
5266
5431
  if to_provision is None:
5267
- # The cluster is recently terminated either by autostop or manually
5268
- # terminated on the cloud. We should use the previously terminated
5269
- # resources to provision the cluster.
5270
- #
5271
- # FIXME(zongheng): this assert can be hit by using two terminals.
5272
- # First, create a 'dbg' cluster. Then:
5273
- # Terminal 1: sky down dbg -y
5274
- # Terminal 2: sky launch -c dbg -- echo
5275
- # Run it in order. Terminal 2 will show this error after terminal 1
5276
- # succeeds in downing the cluster and releasing the lock.
5277
- assert isinstance(
5278
- handle_before_refresh, CloudVmRayResourceHandle), (
5279
- f'Trying to launch cluster {cluster_name!r} recently '
5280
- 'terminated on the cloud, but the handle is not a '
5281
- f'CloudVmRayResourceHandle ({handle_before_refresh}).')
5282
- status_before_refresh_str = None
5283
- if status_before_refresh is not None:
5284
- status_before_refresh_str = status_before_refresh.value
5285
-
5286
- logger.info(
5287
- f'The cluster {cluster_name!r} (status: '
5288
- f'{status_before_refresh_str}) was not found on the cloud: it '
5289
- 'may be autodowned, manually terminated, or its launch never '
5290
- 'succeeded. Provisioning a new cluster by using the same '
5291
- 'resources as its original launch.')
5292
- to_provision = handle_before_refresh.launched_resources
5293
- self.check_resources_fit_cluster(handle_before_refresh, task)
5432
+ # Recently terminated after refresh. OPTIMIZE usually ran outside
5433
+ # the lock, so that decision may be stale by now. Under the lock,
5434
+ # ensure we always have a concrete plan via the following order:
5435
+ # 1) Reuse last placement snapshot (if available);
5436
+ # 2) Else, call injected planner for a fresh plan.
5437
+ # If we still have a pre-refresh handle snapshot with a concrete
5438
+ # placement, prefer reusing it.
5439
+ if (isinstance(handle_before_refresh, CloudVmRayResourceHandle) and
5440
+ handle_before_refresh.launched_resources is not None):
5441
+ to_provision = handle_before_refresh.launched_resources
5442
+ # Ensure the requested task fits the previous placement.
5443
+ self.check_resources_fit_cluster(handle_before_refresh, task)
5444
+ # Mirror the original message for reuse path.
5445
+ status_before_refresh_str = None
5446
+ if status_before_refresh is not None:
5447
+ status_before_refresh_str = status_before_refresh.value
5448
+ logger.info(
5449
+ f'The cluster {cluster_name!r} (status: '
5450
+ f'{status_before_refresh_str}) was not found on the cloud: '
5451
+ 'it may be autodowned, manually terminated, or its launch '
5452
+ 'never succeeded. Provisioning a new cluster by using the '
5453
+ 'same resources as its original launch.')
5454
+ elif self._planner is not None:
5455
+ to_provision = self._planner(task)
5456
+ logger.info(
5457
+ 'Previous placement snapshot missing; computing a fresh '
5458
+ 'plan for provisioning.')
5459
+ else:
5460
+ # Without a snapshot or planner, we cannot proceed safely.
5461
+ # Surface a user-friendly error without a long traceback.
5462
+ with ux_utils.print_exception_no_traceback():
5463
+ raise RuntimeError(
5464
+ 'No concrete launch plan available after recent cloud '
5465
+ f'termination of cluster {cluster_name!r}. Ensure the '
5466
+ 'OPTIMIZE stage runs or provide concrete resources.')
5294
5467
 
5295
5468
  return RetryingVmProvisioner.ToProvisionConfig(
5296
5469
  cluster_name,
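
When to_provision is None after a refresh, the new logic above resolves a launch plan in a fixed order: reuse the pre-refresh handle's launched resources, else ask the injected planner, else raise a user-facing error. A compact sketch of that decision chain with placeholder types (Plan and the callables are illustrative, not SkyPilot names):

from typing import Callable, Optional


class Plan:
    """Placeholder for a concrete resources-to-provision decision."""

    def __init__(self, source: str):
        self.source = source

    def __repr__(self) -> str:
        return f'Plan(source={self.source!r})'


def resolve_plan(snapshot: Optional[Plan],
                 planner: Optional[Callable[[], Plan]],
                 cluster_name: str) -> Plan:
    # Order of preference mirrored from the hunk above:
    # 1) reuse the last placement snapshot, 2) call the injected planner,
    # 3) otherwise surface a user-facing error.
    if snapshot is not None:
        return snapshot
    if planner is not None:
        return planner()
    raise RuntimeError(
        'No concrete launch plan available after recent cloud termination '
        f'of cluster {cluster_name!r}. Ensure the OPTIMIZE stage runs or '
        'provide concrete resources.')


print(resolve_plan(Plan('previous launch'), None, 'dbg'))
print(resolve_plan(None, lambda: Plan('fresh optimizer run'), 'dbg'))
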
@@ -5639,7 +5812,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5639
5812
  def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
5640
5813
  handle: CloudVmRayResourceHandle) -> Dict[str, str]:
5641
5814
  """Returns the environment variables for the task."""
5642
- env_vars = task.envs_and_secrets
5815
+ env_vars = task_lib.get_plaintext_envs_and_secrets(
5816
+ task.envs_and_secrets)
5643
5817
  # If it is a managed job, the TASK_ID_ENV_VAR will have been already set
5644
5818
  # by the controller.
5645
5819
  if constants.TASK_ID_ENV_VAR not in env_vars:
@@ -5651,9 +5825,31 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5651
5825
  env_vars.update(self._skypilot_predefined_env_vars(handle))
5652
5826
  return env_vars
5653
5827
 
5828
+ def _get_managed_job_user_id(self, task: task_lib.Task) -> Optional[str]:
5829
+ """Returns the user id for the managed job."""
5830
+ if task.managed_job_dag is not None:
5831
+ return task.envs[constants.USER_ID_ENV_VAR]
5832
+ return None
5833
+
5834
+ def _get_task_codegen_class(
5835
+ self, handle: CloudVmRayResourceHandle) -> task_codegen.TaskCodeGen:
5836
+ """Returns the appropriate TaskCodeGen for the given handle."""
5837
+ if isinstance(handle.launched_resources.cloud, clouds.Slurm):
5838
+ assert (handle.cached_cluster_info
5839
+ is not None), ('cached_cluster_info must be set')
5840
+ head_instance = handle.cached_cluster_info.get_head_instance()
5841
+ assert (head_instance is not None), (
5842
+ 'Head instance not found in cached cluster info')
5843
+ slurm_job_id = head_instance.tags.get('job_id')
5844
+ assert (slurm_job_id
5845
+ is not None), ('job_id tag not found in head instance')
5846
+ return task_codegen.SlurmCodeGen(slurm_job_id=slurm_job_id)
5847
+ else:
5848
+ return task_codegen.RayCodeGen()
5849
+
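
_get_task_codegen_class dispatches per backend: Slurm clusters get a SlurmCodeGen seeded with the job_id tag stored on the cached head instance, everything else keeps RayCodeGen. A reduced sketch of that dispatch with toy classes; the real implementations live in the task_codegen module referenced above:

from typing import Dict, Optional


class RayCodeGen:
    pass


class SlurmCodeGen:
    def __init__(self, slurm_job_id: str):
        self.slurm_job_id = slurm_job_id


class HeadInstance:
    def __init__(self, tags: Dict[str, str]):
        self.tags = tags


def pick_codegen(cloud_name: str, head_instance: Optional[HeadInstance]):
    """Mirror of the dispatch: Slurm needs the allocation's job_id tag."""
    if cloud_name == 'slurm':
        assert head_instance is not None, 'cached_cluster_info must be set'
        slurm_job_id = head_instance.tags.get('job_id')
        assert slurm_job_id is not None, 'job_id tag not found on head instance'
        return SlurmCodeGen(slurm_job_id=slurm_job_id)
    return RayCodeGen()


print(type(pick_codegen('slurm', HeadInstance({'job_id': '12345'}))).__name__)
print(type(pick_codegen('aws', None)).__name__)
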
5654
5850
  def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
5655
5851
  task: task_lib.Task, job_id: int,
5656
- detach_run: bool, remote_log_dir: str) -> None:
5852
+ remote_log_dir: str) -> None:
5657
5853
  # Launch the command as a Ray task.
5658
5854
  log_dir = os.path.join(remote_log_dir, 'tasks')
5659
5855
 
@@ -5663,42 +5859,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5663
5859
 
5664
5860
  task_env_vars = self._get_task_env_vars(task, job_id, handle)
5665
5861
 
5666
- codegen = RayCodeGen()
5862
+ codegen = self._get_task_codegen_class(handle)
5863
+
5667
5864
  codegen.add_prologue(job_id)
5668
- codegen.add_gang_scheduling_placement_group_and_setup(
5865
+ codegen.add_setup(
5669
5866
  1,
5670
5867
  resources_dict,
5671
5868
  stable_cluster_internal_ips=internal_ips,
5672
5869
  env_vars=task_env_vars,
5870
+ log_dir=log_dir,
5673
5871
  setup_cmd=self._setup_cmd,
5674
- setup_log_path=os.path.join(log_dir, 'setup.log'),
5675
5872
  )
5676
5873
 
5677
- if callable(task.run):
5678
- run_fn_code = textwrap.dedent(inspect.getsource(task.run))
5679
- run_fn_name = task.run.__name__
5680
- codegen.register_run_fn(run_fn_code, run_fn_name)
5681
-
5682
- command_for_node = task.run if isinstance(task.run, str) else None
5683
- codegen.add_ray_task(
5684
- bash_script=command_for_node,
5874
+ codegen.add_task(
5875
+ 1,
5876
+ bash_script=task.run,
5685
5877
  env_vars=task_env_vars,
5686
5878
  task_name=task.name,
5687
- ray_resources_dict=backend_utils.get_task_demands_dict(task),
5879
+ resources_dict=backend_utils.get_task_demands_dict(task),
5688
5880
  log_dir=log_dir)
5689
5881
 
5690
5882
  codegen.add_epilogue()
5691
5883
 
5692
- self._exec_code_on_head(handle,
5693
- codegen.build(),
5694
- job_id,
5695
- detach_run=detach_run,
5696
- managed_job_dag=task.managed_job_dag,
5697
- remote_log_dir=remote_log_dir)
5884
+ self._exec_code_on_head(
5885
+ handle,
5886
+ codegen.build(),
5887
+ job_id,
5888
+ managed_job_dag=task.managed_job_dag,
5889
+ managed_job_user_id=self._get_managed_job_user_id(task),
5890
+ remote_log_dir=remote_log_dir)
5698
5891
 
5699
5892
  def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
5700
5893
  task: task_lib.Task, job_id: int,
5701
- detach_run: bool, remote_log_dir: str) -> None:
5894
+ remote_log_dir: str) -> None:
5702
5895
  # Strategy:
5703
5896
  # ray.init(...)
5704
5897
  # for node:
@@ -5712,42 +5905,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5712
5905
  num_actual_nodes = task.num_nodes * handle.num_ips_per_node
5713
5906
  task_env_vars = self._get_task_env_vars(task, job_id, handle)
5714
5907
 
5715
- codegen = RayCodeGen()
5908
+ codegen = self._get_task_codegen_class(handle)
5909
+
5716
5910
  codegen.add_prologue(job_id)
5717
- codegen.add_gang_scheduling_placement_group_and_setup(
5911
+ codegen.add_setup(
5718
5912
  num_actual_nodes,
5719
5913
  resources_dict,
5720
5914
  stable_cluster_internal_ips=internal_ips,
5721
5915
  env_vars=task_env_vars,
5916
+ log_dir=log_dir,
5722
5917
  setup_cmd=self._setup_cmd,
5723
- setup_log_path=os.path.join(log_dir, 'setup.log'),
5724
5918
  )
5725
5919
 
5726
- if callable(task.run):
5727
- run_fn_code = textwrap.dedent(inspect.getsource(task.run))
5728
- run_fn_name = task.run.__name__
5729
- codegen.register_run_fn(run_fn_code, run_fn_name)
5730
-
5731
- # TODO(zhwu): The resources limitation for multi-node ray.tune and
5732
- # horovod should be considered.
5733
- for i in range(num_actual_nodes):
5734
- command_for_node = task.run if isinstance(task.run, str) else None
5735
-
5736
- # Ray's per-node resources, to constrain scheduling each command to
5737
- # the corresponding node, represented by private IPs.
5738
- codegen.add_ray_task(
5739
- bash_script=command_for_node,
5740
- env_vars=task_env_vars,
5741
- task_name=task.name,
5742
- ray_resources_dict=backend_utils.get_task_demands_dict(task),
5743
- log_dir=log_dir,
5744
- gang_scheduling_id=i)
5920
+ codegen.add_task(
5921
+ num_actual_nodes,
5922
+ bash_script=task.run,
5923
+ env_vars=task_env_vars,
5924
+ task_name=task.name,
5925
+ resources_dict=backend_utils.get_task_demands_dict(task),
5926
+ log_dir=log_dir)
5745
5927
 
5746
5928
  codegen.add_epilogue()
5747
5929
  # TODO(zhanghao): Add help info for downloading logs.
5748
- self._exec_code_on_head(handle,
5749
- codegen.build(),
5750
- job_id,
5751
- detach_run=detach_run,
5752
- managed_job_dag=task.managed_job_dag,
5753
- remote_log_dir=remote_log_dir)
5930
+ self._exec_code_on_head(
5931
+ handle,
5932
+ codegen.build(),
5933
+ job_id,
5934
+ managed_job_dag=task.managed_job_dag,
5935
+ managed_job_user_id=self._get_managed_job_user_id(task),
5936
+ remote_log_dir=remote_log_dir)
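
Both the single-node and n-node paths now drive one TaskCodeGen interface: add_prologue(job_id), add_setup(...) with the node count, env vars and log dir, a single add_task(...) that the codegen fans out to N nodes, add_epilogue(), then build(). A skeletal sketch of a codegen honoring that call sequence; the method signatures are mirrored from the call sites above, but the bodies are placeholders, not the real Ray or Slurm generators:

from typing import Dict, List, Optional


class ToyTaskCodeGen:
    """Illustrative stand-in for the TaskCodeGen interface used above."""

    def __init__(self) -> None:
        self._lines: List[str] = []

    def add_prologue(self, job_id: int) -> None:
        self._lines.append(f'# prologue for job {job_id}')

    def add_setup(self, num_nodes: int, resources_dict: Dict[str, float], *,
                  stable_cluster_internal_ips: List[str],
                  env_vars: Dict[str, str], log_dir: str,
                  setup_cmd: Optional[str]) -> None:
        self._lines.append(f'# setup on {num_nodes} node(s), logs in {log_dir}')
        if setup_cmd:
            self._lines.append(setup_cmd)

    def add_task(self, num_nodes: int, *, bash_script: Optional[str],
                 env_vars: Dict[str, str], task_name: Optional[str],
                 resources_dict: Dict[str, float], log_dir: str) -> None:
        self._lines.append(f'# run {task_name!r} on {num_nodes} node(s)')
        if bash_script:
            self._lines.append(bash_script)

    def add_epilogue(self) -> None:
        self._lines.append('# epilogue')

    def build(self) -> str:
        return '\n'.join(self._lines)


codegen = ToyTaskCodeGen()
codegen.add_prologue(job_id=1)
codegen.add_setup(2, {'CPU': 1.0},
                  stable_cluster_internal_ips=['10.0.0.1', '10.0.0.2'],
                  env_vars={'SKYPILOT_TASK_ID': '1'},
                  log_dir='~/sky_logs/1/tasks',
                  setup_cmd='pip install -r requirements.txt')
codegen.add_task(2, bash_script='python train.py', env_vars={},
                 task_name='train', resources_dict={'CPU': 1.0},
                 log_dir='~/sky_logs/1/tasks')
codegen.add_epilogue()
print(codegen.build())
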