skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -2,14 +2,15 @@
2
2
  import copy
3
3
  import dataclasses
4
4
  import enum
5
- import inspect
6
5
  import json
7
6
  import math
8
7
  import os
9
8
  import pathlib
9
+ import random
10
10
  import re
11
11
  import shlex
12
12
  import signal
13
+ import socket
13
14
  import subprocess
14
15
  import sys
15
16
  import tempfile
@@ -17,8 +18,8 @@ import textwrap
17
18
  import threading
18
19
  import time
19
20
  import typing
20
- from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
21
- Union)
21
+ from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
22
+ Set, Tuple, Union)
22
23
 
23
24
  import colorama
24
25
  import psutil
@@ -39,6 +40,7 @@ from sky import skypilot_config
39
40
  from sky import task as task_lib
40
41
  from sky.adaptors import common as adaptors_common
41
42
  from sky.backends import backend_utils
43
+ from sky.backends import task_codegen
42
44
  from sky.backends import wheel_utils
43
45
  from sky.clouds import cloud as sky_cloud
44
46
  from sky.clouds.utils import gcp_utils
@@ -48,14 +50,15 @@ from sky.provision import common as provision_common
48
50
  from sky.provision import instance_setup
49
51
  from sky.provision import metadata_utils
50
52
  from sky.provision import provisioner
53
+ from sky.provision.kubernetes import config as config_lib
51
54
  from sky.provision.kubernetes import utils as kubernetes_utils
55
+ from sky.serve import constants as serve_constants
52
56
  from sky.server.requests import requests as requests_lib
53
57
  from sky.skylet import autostop_lib
54
58
  from sky.skylet import constants
55
59
  from sky.skylet import job_lib
56
60
  from sky.skylet import log_lib
57
61
  from sky.usage import usage_lib
58
- from sky.utils import accelerator_registry
59
62
  from sky.utils import annotations
60
63
  from sky.utils import cluster_utils
61
64
  from sky.utils import command_runner
@@ -85,13 +88,34 @@ if typing.TYPE_CHECKING:
85
88
  from sky import dag
86
89
  from sky.schemas.generated import autostopv1_pb2
87
90
  from sky.schemas.generated import autostopv1_pb2_grpc
91
+ from sky.schemas.generated import jobsv1_pb2
92
+ from sky.schemas.generated import jobsv1_pb2_grpc
93
+ from sky.schemas.generated import managed_jobsv1_pb2
94
+ from sky.schemas.generated import managed_jobsv1_pb2_grpc
95
+ from sky.schemas.generated import servev1_pb2
96
+ from sky.schemas.generated import servev1_pb2_grpc
88
97
  else:
89
98
  # To avoid requiring grpcio to be installed on the client side.
90
- grpc = adaptors_common.LazyImport('grpc')
99
+ grpc = adaptors_common.LazyImport(
100
+ 'grpc',
101
+ # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
102
+ set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'})
103
+ if not env_options.Options.SHOW_DEBUG_INFO.get() else None)
91
104
  autostopv1_pb2 = adaptors_common.LazyImport(
92
105
  'sky.schemas.generated.autostopv1_pb2')
93
106
  autostopv1_pb2_grpc = adaptors_common.LazyImport(
94
107
  'sky.schemas.generated.autostopv1_pb2_grpc')
108
+ jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
109
+ jobsv1_pb2_grpc = adaptors_common.LazyImport(
110
+ 'sky.schemas.generated.jobsv1_pb2_grpc')
111
+ servev1_pb2 = adaptors_common.LazyImport(
112
+ 'sky.schemas.generated.servev1_pb2')
113
+ servev1_pb2_grpc = adaptors_common.LazyImport(
114
+ 'sky.schemas.generated.servev1_pb2_grpc')
115
+ managed_jobsv1_pb2 = adaptors_common.LazyImport(
116
+ 'sky.schemas.generated.managed_jobsv1_pb2')
117
+ managed_jobsv1_pb2_grpc = adaptors_common.LazyImport(
118
+ 'sky.schemas.generated.managed_jobsv1_pb2_grpc')
95
119
 
96
120
  Path = str
97
121
 
@@ -113,6 +137,7 @@ _NODES_LAUNCHING_PROGRESS_TIMEOUT = {
113
137
  clouds.OCI: 300,
114
138
  clouds.Paperspace: 600,
115
139
  clouds.Kubernetes: 300,
140
+ clouds.Shadeform: 300,
116
141
  clouds.Vsphere: 240,
117
142
  }
118
143
 
@@ -179,6 +204,12 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
179
204
  # We use 100KB as a threshold to be safe for other arguments that
180
205
  # might be added during ssh.
181
206
  _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
207
+ _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
208
+ ('too long', 255),
209
+ ('request-uri too large', 1),
210
+ ('request header fields too large', 1),
211
+ ('400 bad request', 1), # CloudFlare 400 error
212
+ ]
182
213
 
183
214
  _RESOURCES_UNAVAILABLE_LOG = (
184
215
  'Reasons for provision failures (for details, please check the log above):')
@@ -199,6 +230,61 @@ def _is_command_length_over_limit(command: str) -> bool:
199
230
  return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
200
231
 
201
232
 
233
+ def _is_message_too_long(returncode: int,
234
+ output: Optional[str] = None,
235
+ file_path: Optional[str] = None) -> bool:
236
+ """Check if the message sent to the remote is too long.
237
+
238
+ We use inline script to run the setup or run command, i.e. the script will
239
+ be part of the message sent to the remote cluster. There is a chance that
240
+ the command is too long, when people has very long run or setup commands, or
241
+ there is a cloudflare proxy in front of the remote blocking the long
242
+ message. Several common causes are:
243
+ - SSH returning: `too long` in the error message.
244
+ - Cloudflare proxy returning: `414 Request-URI Too Large` or
245
+ `431 Request Header Fields Too Large` error.
246
+
247
+ We use a general length limit check before but it could be inaccurate on
248
+ some systems, e.g. cloudflare proxy, so this is necessary.
249
+
250
+ Args:
251
+ returncode: The return code of the setup command.
252
+ output: The output of the setup command.
253
+ file_path: The path to the setup log file.
254
+ """
255
+ assert (output is None) != (file_path is None), (
256
+ 'Either output or file_path must be provided.', output, file_path)
257
+ to_check = []
258
+ for (match_str,
259
+ desired_rc) in _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT:
260
+ if desired_rc == returncode:
261
+ to_check.append(match_str)
262
+ if not to_check:
263
+ return False
264
+
265
+ def _check_output_for_match_str(output: str) -> bool:
266
+ for match_str in to_check:
267
+ if match_str.lower() in output.lower():
268
+ return True
269
+ return False
270
+
271
+ if file_path is not None:
272
+ try:
273
+ with open(os.path.expanduser(file_path), 'r',
274
+ encoding='utf-8') as f:
275
+ content = f.read()
276
+ return _check_output_for_match_str(content)
277
+ except Exception as e: # pylint: disable=broad-except
278
+ # We don't crash the setup if we cannot read the log file.
279
+ # Instead, we should retry the setup with dumping the script
280
+ # to a file to be safe.
281
+ logger.debug(f'Failed to read setup log file {file_path}: {e}')
282
+ return True
283
+ else:
284
+ assert output is not None, (output, file_path)
285
+ return _check_output_for_match_str(output)
286
+
287
+
202
288
  def _get_cluster_config_template(cloud):
203
289
  cloud_to_template = {
204
290
  clouds.AWS: 'aws-ray.yml.j2',
@@ -210,15 +296,18 @@ def _get_cluster_config_template(cloud):
210
296
  clouds.SCP: 'scp-ray.yml.j2',
211
297
  clouds.OCI: 'oci-ray.yml.j2',
212
298
  clouds.Paperspace: 'paperspace-ray.yml.j2',
299
+ clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
213
300
  clouds.DO: 'do-ray.yml.j2',
214
301
  clouds.RunPod: 'runpod-ray.yml.j2',
215
302
  clouds.Kubernetes: 'kubernetes-ray.yml.j2',
216
303
  clouds.SSH: 'kubernetes-ray.yml.j2',
304
+ clouds.Shadeform: 'shadeform-ray.yml.j2',
217
305
  clouds.Vsphere: 'vsphere-ray.yml.j2',
218
306
  clouds.Vast: 'vast-ray.yml.j2',
219
307
  clouds.Fluidstack: 'fluidstack-ray.yml.j2',
220
308
  clouds.Nebius: 'nebius-ray.yml.j2',
221
- clouds.Hyperbolic: 'hyperbolic-ray.yml.j2'
309
+ clouds.Hyperbolic: 'hyperbolic-ray.yml.j2',
310
+ clouds.Seeweb: 'seeweb-ray.yml.j2'
222
311
  }
223
312
  return cloud_to_template[type(cloud)]
224
313
 
@@ -248,511 +337,6 @@ def write_ray_up_script_with_patched_launch_hash_fn(
248
337
  return f.name
249
338
 
250
339
 
251
- class RayCodeGen:
252
- """Code generator of a Ray program that executes a sky.Task.
253
-
254
- Usage:
255
-
256
- >> codegen = RayCodegen()
257
- >> codegen.add_prologue()
258
-
259
- >> codegen.add_ray_task(...)
260
- >> codegen.add_ray_task(...)
261
-
262
- >> codegen.add_epilogue()
263
- >> code = codegen.build()
264
- """
265
-
266
- def __init__(self):
267
- # Code generated so far, to be joined via '\n'.
268
- self._code = []
269
- # Guard method calling order.
270
- self._has_prologue = False
271
- self._has_epilogue = False
272
-
273
- # For n nodes gang scheduling.
274
- self._has_gang_scheduling = False
275
- self._num_nodes = 0
276
-
277
- self._has_register_run_fn = False
278
-
279
- # job_id
280
- # Job ID is used to identify the job (also this generated code).
281
- # It is a int automatically generated by the DB on the cluster
282
- # and monotonically increasing starting from 1.
283
- # To generate the job ID, we use the following logic:
284
- # code = job_lib.JobLibCodeGen.add_job(username,
285
- # run_timestamp)
286
- # job_id = get_output(run_on_cluster(code))
287
- self.job_id = None
288
-
289
- def add_prologue(self, job_id: int) -> None:
290
- assert not self._has_prologue, 'add_prologue() called twice?'
291
- self._has_prologue = True
292
- self.job_id = job_id
293
- # Should use 'auto' or 'ray://<internal_head_ip>:10001' rather than
294
- # 'ray://localhost:10001', or 'ray://127.0.0.1:10001', for public cloud.
295
- # Otherwise, ray will fail to get the placement group because of a bug
296
- # in ray job.
297
- ray_address = 'auto'
298
- self._code = [
299
- textwrap.dedent(f"""\
300
- import functools
301
- import getpass
302
- import hashlib
303
- import io
304
- import os
305
- import pathlib
306
- import selectors
307
- import shlex
308
- import subprocess
309
- import sys
310
- import tempfile
311
- import textwrap
312
- import time
313
- from typing import Dict, List, Optional, Tuple, Union
314
-
315
- # Set the environment variables to avoid deduplicating logs and
316
- # scheduler events. This should be set in driver code, since we are
317
- # not using `ray job submit` anymore, and the environment variables
318
- # from the ray cluster is not inherited.
319
- os.environ['RAY_DEDUP_LOGS'] = '0'
320
- os.environ['RAY_SCHEDULER_EVENTS'] = '0'
321
-
322
- import ray
323
- import ray.util as ray_util
324
-
325
- from sky.skylet import autostop_lib
326
- from sky.skylet import constants
327
- from sky.skylet import job_lib
328
- from sky.utils import log_utils
329
- from sky.utils import subprocess_utils
330
-
331
- SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
332
-
333
- kwargs = dict()
334
- # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
335
- # the directory exists for backward compatibility for the VM
336
- # launched before #1790.
337
- if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}):
338
- kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r}
339
- ray.init(
340
- address={ray_address!r},
341
- namespace='__sky__{job_id}__',
342
- log_to_driver=True,
343
- **kwargs
344
- )
345
- def get_or_fail(futures, pg) -> List[int]:
346
- \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
347
- if not futures:
348
- return []
349
- returncodes = [1] * len(futures)
350
- # Wait for 1 task to be ready.
351
- ready = []
352
- # Keep invoking ray.wait if ready is empty. This is because
353
- # ray.wait with timeout=None will only wait for 10**6 seconds,
354
- # which will cause tasks running for more than 12 days to return
355
- # before becoming ready.
356
- # (Such tasks are common in serving jobs.)
357
- # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
358
- while not ready:
359
- ready, unready = ray.wait(futures)
360
- idx = futures.index(ready[0])
361
- returncodes[idx] = ray.get(ready[0])
362
- while unready:
363
- if returncodes[idx] != 0:
364
- for task in unready:
365
- # ray.cancel without force fails to kill tasks.
366
- # We use force=True to kill unready tasks.
367
- ray.cancel(task, force=True)
368
- # Use SIGKILL=128+9 to indicate the task is forcely
369
- # killed.
370
- idx = futures.index(task)
371
- returncodes[idx] = 137
372
- break
373
- ready, unready = ray.wait(unready)
374
- idx = futures.index(ready[0])
375
- returncodes[idx] = ray.get(ready[0])
376
- # Remove the placement group after all tasks are done, so that
377
- # the next job can be scheduled on the released resources
378
- # immediately.
379
- ray_util.remove_placement_group(pg)
380
- sys.stdout.flush()
381
- return returncodes
382
-
383
- run_fn = None
384
- futures = []
385
- """),
386
- # FIXME: This is a hack to make sure that the functions can be found
387
- # by ray.remote. This should be removed once we have a better way to
388
- # specify dependencies for ray.
389
- inspect.getsource(log_lib._ProcessingArgs), # pylint: disable=protected-access
390
- inspect.getsource(log_lib._get_context), # pylint: disable=protected-access
391
- inspect.getsource(log_lib._handle_io_stream), # pylint: disable=protected-access
392
- inspect.getsource(log_lib.process_subprocess_stream),
393
- inspect.getsource(log_lib.run_with_log),
394
- inspect.getsource(log_lib.make_task_bash_script),
395
- inspect.getsource(log_lib.add_ray_env_vars),
396
- inspect.getsource(log_lib.run_bash_command_with_log),
397
- 'run_bash_command_with_log = ray.remote(run_bash_command_with_log)',
398
- ]
399
- # Currently, the codegen program is/can only be submitted to the head
400
- # node, due to using job_lib for updating job statuses, and using
401
- # autostop_lib here.
402
- self._code.append(
403
- # Use hasattr to handle backward compatibility.
404
- # TODO(zongheng): remove in ~1-2 minor releases (currently 0.2.x).
405
- textwrap.dedent("""\
406
- if hasattr(autostop_lib, 'set_last_active_time_to_now'):
407
- autostop_lib.set_last_active_time_to_now()
408
- """))
409
- self._code += [
410
- f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
411
- ]
412
-
413
- def add_gang_scheduling_placement_group_and_setup(
414
- self,
415
- num_nodes: int,
416
- resources_dict: Dict[str, float],
417
- stable_cluster_internal_ips: List[str],
418
- env_vars: Dict[str, str],
419
- setup_cmd: Optional[str] = None,
420
- setup_log_path: Optional[str] = None,
421
- ) -> None:
422
- """Create the gang scheduling placement group for a Task.
423
-
424
- cluster_ips_sorted is used to ensure that the SKY_NODE_RANK environment
425
- variable is assigned in a deterministic order whenever a new task is
426
- added.
427
- """
428
- assert self._has_prologue, (
429
- 'Call add_prologue() before '
430
- 'add_gang_scheduling_placement_group_and_setup().')
431
- self._has_gang_scheduling = True
432
- self._num_nodes = num_nodes
433
-
434
- bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
435
- # Set CPU to avoid ray hanging the resources allocation
436
- # for remote functions, since the task will request 1 CPU
437
- # by default.
438
- task_cpu_demand = resources_dict.pop('CPU')
439
-
440
- if resources_dict:
441
- assert len(resources_dict) == 1, (
442
- 'There can only be one type of accelerator per instance. '
443
- f'Found: {resources_dict}.')
444
- acc_name, acc_count = list(resources_dict.items())[0]
445
- gpu_dict = {'GPU': acc_count}
446
- # gpu_dict should be empty when the accelerator is not GPU.
447
- # TODO(zongheng,zhanghao): an alternative is to start the remote
448
- # cluster with custom resource 'GPU': <n> even if the accelerator(s)
449
- # are not GPU. We opt for the current solution for now.
450
- if accelerator_registry.is_schedulable_non_gpu_accelerator(
451
- acc_name):
452
- gpu_dict = {}
453
- for bundle in bundles:
454
- bundle.update({
455
- # Set the GPU to avoid ray hanging the resources allocation
456
- **gpu_dict,
457
- })
458
-
459
- streaming_message = (
460
- f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
461
- f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
462
- f'be killed){colorama.Style.RESET_ALL}')
463
- self._code += [
464
- textwrap.dedent(f"""\
465
- pg = ray_util.placement_group({json.dumps(bundles)}, 'STRICT_SPREAD')
466
- plural = 's' if {num_nodes} > 1 else ''
467
- node_str = f'{num_nodes} node{{plural}}'
468
- message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
469
- 'Waiting for task resources on '
470
- f'{{node_str}}.{colorama.Style.RESET_ALL}')
471
- print(message, flush=True)
472
- # FIXME: This will print the error message from autoscaler if
473
- # it is waiting for other task to finish. We should hide the
474
- # error message.
475
- ray.get(pg.ready())
476
- print({streaming_message!r}, flush=True)
477
- """)
478
- ]
479
-
480
- job_id = self.job_id
481
- if setup_cmd is not None:
482
- setup_envs = env_vars.copy()
483
- setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
484
- self._code += [
485
- textwrap.dedent(f"""\
486
- setup_cmd = {setup_cmd!r}
487
- _SETUP_CPUS = 0.0001
488
- # The setup command will be run as a ray task with num_cpus=_SETUP_CPUS as the
489
- # requirement; this means Ray will set CUDA_VISIBLE_DEVICES to an empty string.
490
- # We unset it so that user setup command may properly use this env var.
491
- setup_cmd = 'unset CUDA_VISIBLE_DEVICES; ' + setup_cmd
492
- job_lib.set_status({job_id!r}, job_lib.JobStatus.SETTING_UP)
493
-
494
- # The schedule_step should be called after the job status is set to non-PENDING,
495
- # otherwise, the scheduler will think the current job is not submitted yet, and
496
- # skip the scheduling step.
497
- job_lib.scheduler.schedule_step()
498
-
499
- total_num_nodes = len(ray.nodes())
500
- setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
501
- setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
502
- setup_workers = [run_bash_command_with_log \\
503
- .options(
504
- name='setup',
505
- num_cpus=_SETUP_CPUS,
506
- scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
507
- placement_group=setup_pg,
508
- placement_group_bundle_index=i)
509
- ) \\
510
- .remote(
511
- setup_cmd,
512
- os.path.expanduser({setup_log_path!r}),
513
- env_vars={setup_envs!r},
514
- stream_logs=True,
515
- with_ray=True,
516
- ) for i in range(total_num_nodes)]
517
- setup_returncodes = get_or_fail(setup_workers, setup_pg)
518
- if sum(setup_returncodes) != 0:
519
- job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
520
- # This waits for all streaming logs to finish.
521
- time.sleep(1)
522
- print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
523
- 'return code list:{colorama.Style.RESET_ALL}',
524
- setup_returncodes,
525
- flush=True)
526
- # Need this to set the job status in ray job to be FAILED.
527
- sys.exit(1)
528
- """)
529
- ]
530
-
531
- self._code.append(f'job_lib.set_job_started({self.job_id!r})')
532
- if setup_cmd is None:
533
- # Need to call schedule_step() to make sure the scheduler
534
- # schedule the next pending job.
535
- self._code.append('job_lib.scheduler.schedule_step()')
536
-
537
- # Export IP and node rank to the environment variables.
538
- self._code += [
539
- textwrap.dedent(f"""\
540
- @ray.remote
541
- def check_ip():
542
- return ray.util.get_node_ip_address()
543
- gang_scheduling_id_to_ip = ray.get([
544
- check_ip.options(
545
- num_cpus={task_cpu_demand},
546
- scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
547
- placement_group=pg,
548
- placement_group_bundle_index=i
549
- )).remote()
550
- for i in range(pg.bundle_count)
551
- ])
552
-
553
- cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
554
- job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
555
- job_ip_rank_map = {{ip: i for i, ip in enumerate(job_ip_rank_list)}}
556
- job_ip_list_str = '\\n'.join(job_ip_rank_list)
557
- """),
558
- ]
559
-
560
- def register_run_fn(self, run_fn: str, run_fn_name: str) -> None:
561
- """Register the run function to be run on the remote cluster.
562
-
563
- Args:
564
- run_fn: The run function to be run on the remote cluster.
565
- """
566
- assert self._has_gang_scheduling, (
567
- 'Call add_gang_scheduling_placement_group_and_setup() '
568
- 'before register_run_fn().')
569
- assert not self._has_register_run_fn, (
570
- 'register_run_fn() called twice?')
571
- self._has_register_run_fn = True
572
-
573
- self._code += [
574
- run_fn,
575
- f'run_fn = {run_fn_name}',
576
- ]
577
-
578
- def add_ray_task(self,
579
- bash_script: Optional[str],
580
- task_name: Optional[str],
581
- ray_resources_dict: Dict[str, float],
582
- log_dir: str,
583
- env_vars: Optional[Dict[str, str]] = None,
584
- gang_scheduling_id: int = 0) -> None:
585
- """Generates code for a ray remote task that runs a bash command."""
586
- assert self._has_gang_scheduling, (
587
- 'Call add_gang_scheduling_placement_group_and_setup() before '
588
- 'add_ray_task().')
589
- assert (not self._has_register_run_fn or
590
- bash_script is None), ('bash_script should '
591
- 'be None when run_fn is registered.')
592
- task_cpu_demand = ray_resources_dict.pop('CPU')
593
- # Build remote_task.options(...)
594
- # resources=...
595
- # num_gpus=...
596
- options = []
597
- options.append(f'num_cpus={task_cpu_demand}')
598
-
599
- num_gpus = 0.0
600
- if ray_resources_dict:
601
- assert len(ray_resources_dict) == 1, (
602
- 'There can only be one type of accelerator per instance. '
603
- f'Found: {ray_resources_dict}.')
604
- num_gpus = list(ray_resources_dict.values())[0]
605
- options.append(f'resources={json.dumps(ray_resources_dict)}')
606
-
607
- resources_key = list(ray_resources_dict.keys())[0]
608
- if not accelerator_registry.is_schedulable_non_gpu_accelerator(
609
- resources_key):
610
- # `num_gpus` should be empty when the accelerator is not GPU.
611
- # FIXME: use a set of GPU types, instead of 'tpu' in the key.
612
-
613
- # Passing this ensures that the Ray remote task gets
614
- # CUDA_VISIBLE_DEVICES set correctly. If not passed, that flag
615
- # would be force-set to empty by Ray.
616
- options.append(f'num_gpus={num_gpus}')
617
- options.append(
618
- 'scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(' # pylint: disable=line-too-long
619
- 'placement_group=pg, '
620
- f'placement_group_bundle_index={gang_scheduling_id})')
621
-
622
- sky_env_vars_dict_str = [
623
- textwrap.dedent(f"""\
624
- sky_env_vars_dict = {{}}
625
- sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str
626
- sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list)
627
- """)
628
- ]
629
-
630
- if env_vars is not None:
631
- sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
632
- for k, v in env_vars.items())
633
- sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
634
-
635
- options_str = ', '.join(options)
636
- logger.debug('Added Task with options: '
637
- f'{options_str}')
638
- # Script to block completion of a job until all storage mounted with
639
- # CACHED_MOUNT mode is uploaded to remote.
640
- rclone_flush_script = textwrap.dedent(f"""\
641
-
642
- # Only waits if cached mount is enabled (RCLONE_MOUNT_CACHED_LOG_DIR is not empty)
643
- # findmnt alone is not enough, as some clouds (e.g. AWS on ARM64) uses
644
- # rclone for normal mounts as well.
645
- if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
646
- [ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
647
- [ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
648
- flushed=0
649
- # extra second on top of --vfs-cache-poll-interval to
650
- # avoid race condition between rclone log line creation and this check.
651
- sleep 1
652
- while [ $flushed -eq 0 ]; do
653
- # sleep for the same interval as --vfs-cache-poll-interval
654
- sleep {constants.RCLONE_CACHE_REFRESH_INTERVAL}
655
- flushed=1
656
- for file in {constants.RCLONE_MOUNT_CACHED_LOG_DIR}/*; do
657
- exitcode=0
658
- tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
659
- if [ $exitcode -ne 0 ]; then
660
- echo "skypilot: cached mount is still uploading to remote"
661
- flushed=0
662
- break
663
- fi
664
- done
665
- done
666
- echo "skypilot: cached mount uploaded complete"
667
- fi""")
668
- self._code += [
669
- sky_env_vars_dict_str,
670
- textwrap.dedent(f"""\
671
- script = {bash_script!r}
672
- rclone_flush_script = {rclone_flush_script!r}
673
- if run_fn is not None:
674
- script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)
675
-
676
- if script is not None:
677
- script += rclone_flush_script
678
- sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
679
-
680
- ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}]
681
- rank = job_ip_rank_map[ip]
682
-
683
- if len(cluster_ips_to_node_id) == 1: # Single-node task on single-node cluter
684
- name_str = '{task_name},' if {task_name!r} != None else 'task,'
685
- log_path = os.path.expanduser(os.path.join({log_dir!r}, 'run.log'))
686
- else: # Single-node or multi-node task on multi-node cluster
687
- idx_in_cluster = cluster_ips_to_node_id[ip]
688
- if cluster_ips_to_node_id[ip] == 0:
689
- node_name = 'head'
690
- else:
691
- node_name = f'worker{{idx_in_cluster}}'
692
- name_str = f'{{node_name}}, rank={{rank}},'
693
- log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log'))
694
- sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank
695
-
696
- sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
697
-
698
- futures.append(run_bash_command_with_log \\
699
- .options(name=name_str, {options_str}) \\
700
- .remote(
701
- script,
702
- log_path,
703
- env_vars=sky_env_vars_dict,
704
- stream_logs=True,
705
- with_ray=True,
706
- ))""")
707
- ]
708
-
709
- def add_epilogue(self) -> None:
710
- """Generates code that waits for all tasks, then exits."""
711
- assert self._has_prologue, 'Call add_prologue() before add_epilogue().'
712
- assert not self._has_epilogue, 'add_epilogue() called twice?'
713
- self._has_epilogue = True
714
-
715
- self._code += [
716
- textwrap.dedent(f"""\
717
- returncodes = get_or_fail(futures, pg)
718
- if sum(returncodes) != 0:
719
- job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
720
- # Schedule the next pending job immediately to make the job
721
- # scheduling more efficient.
722
- job_lib.scheduler.schedule_step()
723
- # This waits for all streaming logs to finish.
724
- time.sleep(0.5)
725
- reason = ''
726
- # 139 is the return code of SIGSEGV, i.e. Segmentation Fault.
727
- if any(r == 139 for r in returncodes):
728
- reason = '(likely due to Segmentation Fault)'
729
- if any(r == 137 for r in returncodes):
730
- # Find the first non-137 return code
731
- non_137 = next(r for r in returncodes if r != 137)
732
- reason = f'(A Worker failed with return code {{non_137}}, SkyPilot cleaned up the processes on other nodes with return code 137)'
733
- print('ERROR: {colorama.Fore.RED}Job {self.job_id} failed with '
734
- 'return code list:{colorama.Style.RESET_ALL}',
735
- returncodes,
736
- reason,
737
- flush=True)
738
- # Need this to set the job status in ray job to be FAILED.
739
- sys.exit(1)
740
- else:
741
- job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SUCCEEDED)
742
- # Schedule the next pending job immediately to make the job
743
- # scheduling more efficient.
744
- job_lib.scheduler.schedule_step()
745
- # This waits for all streaming logs to finish.
746
- time.sleep(0.5)
747
- """)
748
- ]
749
-
750
- def build(self) -> str:
751
- """Returns the entire generated program."""
752
- assert self._has_epilogue, 'Call add_epilogue() before build().'
753
- return '\n'.join(self._code)
754
-
755
-
756
340
  class GangSchedulingStatus(enum.Enum):
757
341
  """Enum for gang scheduling status."""
758
342
  CLUSTER_READY = 0
@@ -1340,6 +924,34 @@ class RetryingVmProvisioner(object):
1340
924
  zones = [clouds.Zone(name=to_provision.zone)]
1341
925
  yield zones
1342
926
 
927
+ def _insufficient_resources_msg(
928
+ self,
929
+ to_provision: resources_lib.Resources,
930
+ requested_resources: Set[resources_lib.Resources],
931
+ insufficient_resources: Optional[List[str]],
932
+ ) -> str:
933
+ insufficent_resource_msg = ('' if insufficient_resources is None else
934
+ f' ({", ".join(insufficient_resources)})')
935
+ message = f'Failed to acquire resources{insufficent_resource_msg} '
936
+ if to_provision.zone is not None:
937
+ message += (f'in {to_provision.zone} for {requested_resources}. ')
938
+ elif to_provision.region is not None and to_provision.cloud is not None:
939
+ # For public clouds, provision.region is always set.
940
+ if clouds.SSH().is_same_cloud(to_provision.cloud):
941
+ message += (
942
+ f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
943
+ f'for {requested_resources}. The SSH Node Pool may not '
944
+ 'have enough resources.')
945
+ elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
946
+ message += (f'in context {to_provision.region} for '
947
+ f'{requested_resources}. ')
948
+ else:
949
+ message += (f'in all zones in {to_provision.region} for '
950
+ f'{requested_resources}. ')
951
+ else:
952
+ message += (f'{to_provision.cloud} for {requested_resources}. ')
953
+ return message
954
+
1343
955
  def _retry_zones(
1344
956
  self,
1345
957
  to_provision: resources_lib.Resources,
@@ -1418,6 +1030,7 @@ class RetryingVmProvisioner(object):
1418
1030
  f'To request quotas, check the instruction: '
1419
1031
  f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')
1420
1032
 
1033
+ insufficient_resources = None
1421
1034
  for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
1422
1035
  prev_cluster_status,
1423
1036
  prev_cluster_ever_up):
@@ -1630,6 +1243,24 @@ class RetryingVmProvisioner(object):
1630
1243
  # No teardown happens for this error.
1631
1244
  with ux_utils.print_exception_no_traceback():
1632
1245
  raise
1246
+ except config_lib.KubernetesError as e:
1247
+ if e.insufficent_resources:
1248
+ insufficient_resources = e.insufficent_resources
1249
+ # NOTE: We try to cleanup the cluster even if the previous
1250
+ # cluster does not exist. Also we are fast at
1251
+ # cleaning up clusters now if there is no existing node.
1252
+ CloudVmRayBackend().post_teardown_cleanup(
1253
+ handle,
1254
+ terminate=not prev_cluster_ever_up,
1255
+ remove_from_db=False,
1256
+ failover=True,
1257
+ )
1258
+ # TODO(suquark): other clouds may have different zone
1259
+ # blocking strategy. See '_update_blocklist_on_error'
1260
+ # for details.
1261
+ FailoverCloudErrorHandlerV2.update_blocklist_on_error(
1262
+ self._blocked_resources, to_provision, region, zones, e)
1263
+ continue
1633
1264
  except Exception as e: # pylint: disable=broad-except
1634
1265
  # NOTE: We try to cleanup the cluster even if the previous
1635
1266
  # cluster does not exist. Also we are fast at
@@ -1760,26 +1391,9 @@ class RetryingVmProvisioner(object):
1760
1391
  terminate=terminate_or_stop,
1761
1392
  remove_from_db=False)
1762
1393
 
1763
- if to_provision.zone is not None:
1764
- message = (
1765
- f'Failed to acquire resources in {to_provision.zone} for '
1766
- f'{requested_resources}. ')
1767
- elif to_provision.region is not None:
1768
- # For public clouds, provision.region is always set.
1769
- if clouds.SSH().is_same_cloud(to_provision.cloud):
1770
- message = ('Failed to acquire resources in SSH Node Pool '
1771
- f'({to_provision.region.lstrip("ssh-")}) for '
1772
- f'{requested_resources}. The SSH Node Pool may not '
1773
- 'have enough resources.')
1774
- elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
1775
- message = ('Failed to acquire resources in context '
1776
- f'{to_provision.region} for {requested_resources}. ')
1777
- else:
1778
- message = ('Failed to acquire resources in all zones in '
1779
- f'{to_provision.region} for {requested_resources}. ')
1780
- else:
1781
- message = (f'Failed to acquire resources in {to_provision.cloud} '
1782
- f'for {requested_resources}. ')
1394
+ message = self._insufficient_resources_msg(to_provision,
1395
+ requested_resources,
1396
+ insufficient_resources)
1783
1397
  # Do not failover to other locations if the cluster was ever up, since
1784
1398
  # the user can have some data on the cluster.
1785
1399
  raise exceptions.ResourcesUnavailableError(
@@ -2175,8 +1789,6 @@ class RetryingVmProvisioner(object):
2175
1789
  # terminated by _retry_zones().
2176
1790
  assert (prev_cluster_status == status_lib.ClusterStatus.INIT
2177
1791
  ), prev_cluster_status
2178
- assert global_user_state.get_handle_from_cluster_name(
2179
- cluster_name) is None, cluster_name
2180
1792
  logger.info(
2181
1793
  ux_utils.retry_message(
2182
1794
  f'Retrying provisioning with requested resources: '
@@ -2215,9 +1827,8 @@ class RetryingVmProvisioner(object):
2215
1827
  for (resource, exception) in resource_exceptions.items():
2216
1828
  table.add_row([
2217
1829
  resource.infra.formatted_str(),
2218
- resources_utils.format_resource(resource,
2219
- simplify=True),
2220
- exception
1830
+ resources_utils.format_resource(
1831
+ resource, simplified_only=True)[0], exception
2221
1832
  ])
2222
1833
  # Set the max width of REASON column to 80 to avoid the table
2223
1834
  # being wrapped in a unreadable way.
@@ -2239,6 +1850,18 @@ class SSHTunnelInfo:
2239
1850
  pid: int
2240
1851
 
2241
1852
 
1853
+ def _is_tunnel_healthy(tunnel: SSHTunnelInfo) -> bool:
1854
+ try:
1855
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
1856
+ s.settimeout(0.5)
1857
+ s.connect(('localhost', tunnel.port))
1858
+ return True
1859
+ except socket.error as e:
1860
+ logger.warning(f'Failed to connect to tunnel on port {tunnel.port}: '
1861
+ f'{common_utils.format_exception(e)}')
1862
+ return False
1863
+
1864
+
2242
1865
  class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2243
1866
  """A pickle-able handle to a cluster created by CloudVmRayBackend.
2244
1867
 
@@ -2261,8 +1884,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2261
1884
  - (optional) Skylet SSH tunnel info.
2262
1885
  """
2263
1886
  # Bump if any fields get added/removed/changed, and add backward
2264
- # compaitibility logic in __setstate__.
2265
- _VERSION = 11
1887
+ # compatibility logic in __setstate__ and/or __getstate__.
1888
+ _VERSION = 12
2266
1889
 
2267
1890
  def __init__(
2268
1891
  self,
@@ -2296,7 +1919,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2296
1919
  self.launched_resources = launched_resources
2297
1920
  self.docker_user: Optional[str] = None
2298
1921
  self.is_grpc_enabled = True
2299
- self.skylet_ssh_tunnel: Optional[SSHTunnelInfo] = None
2300
1922
 
2301
1923
  def __repr__(self):
2302
1924
  return (f'ResourceHandle('
@@ -2313,12 +1935,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  f'{self.launched_resources}, '
  f'\n\tdocker_user={self.docker_user},'
  f'\n\tssh_user={self.ssh_user},'
- f'\n\tis_grpc_enabled={self.is_grpc_enabled},'
- f'\n\tskylet_ssh_tunnel={self.skylet_ssh_tunnel}')
+ f'\n\tis_grpc_enabled={self.is_grpc_enabled},')
 
  def get_cluster_name(self):
  return self.cluster_name
 
+ def get_cluster_name_on_cloud(self):
+ return self.cluster_name_on_cloud
+
  def _use_internal_ips(self):
  """Returns whether to use internal IPs for SSH connections."""
  # Directly load the `use_internal_ips` flag from the cluster yaml
@@ -2345,7 +1969,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  def _update_cluster_info(self):
  # When a cluster is on a cloud that does not support the new
  # provisioner, we should skip updating cluster_info.
- if (self.launched_resources.cloud.PROVISIONER_VERSION >=
+ if (self.launched_resources.cloud is not None and
+ self.launched_resources.cloud.PROVISIONER_VERSION >=
  clouds.ProvisionerVersion.SKYPILOT):
  provider_name = str(self.launched_resources.cloud).lower()
  config = {}
@@ -2643,64 +2268,199 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2643
2268
  cluster_config_file)
2644
2269
  self.docker_user = docker_user
2645
2270
 
2271
+ def _get_skylet_ssh_tunnel(self) -> Optional[SSHTunnelInfo]:
2272
+ metadata = global_user_state.get_cluster_skylet_ssh_tunnel_metadata(
2273
+ self.cluster_name)
2274
+ if metadata is None:
2275
+ return None
2276
+ return SSHTunnelInfo(port=metadata[0], pid=metadata[1])
2277
+
2278
+ def _set_skylet_ssh_tunnel(self, tunnel: Optional[SSHTunnelInfo]) -> None:
2279
+ global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
2280
+ self.cluster_name,
2281
+ (tunnel.port, tunnel.pid) if tunnel is not None else None)
2282
+
2283
+ def close_skylet_ssh_tunnel(self) -> None:
2284
+ """Terminate the SSH tunnel process and clear its metadata."""
2285
+ tunnel = self._get_skylet_ssh_tunnel()
2286
+ if tunnel is None:
2287
+ return
2288
+ logger.debug('Closing Skylet SSH tunnel for cluster %r on port %d',
2289
+ self.cluster_name, tunnel.port)
2290
+ try:
2291
+ self._terminate_ssh_tunnel_process(tunnel)
2292
+ finally:
2293
+ self._set_skylet_ssh_tunnel(None)
2294
+
2646
2295
  def get_grpc_channel(self) -> 'grpc.Channel':
2647
- if self.skylet_ssh_tunnel is None:
2648
- self.open_and_update_skylet_tunnel()
2649
- assert self.skylet_ssh_tunnel is not None
2650
- return grpc.insecure_channel(f'localhost:{self.skylet_ssh_tunnel.port}')
2296
+ grpc_options = [
2297
+ # The task YAMLs can be large, so the default
2298
+ # max_receive_message_length of 4MB might not be enough.
2299
+ ('grpc.max_receive_message_length', -1),
2300
+ ]
2301
+ # It's fine to not grab the lock here, as we're only reading,
2302
+ # and writes are very rare.
2303
+ # It's acceptable to read while another process is opening a tunnel,
2304
+ # because it will only happen on:
2305
+ # 1. A new cluster who has no tunnel yet, or
2306
+ # 2. A cluster with an unhealthy tunnel
2307
+ # For (2), for processes that read the "stale" tunnel, it will fail
2308
+ # and on the next retry, it will call get_grpc_channel again
2309
+ # and get the new tunnel.
2310
+ tunnel = self._get_skylet_ssh_tunnel()
2311
+ if tunnel is not None:
2312
+ if _is_tunnel_healthy(tunnel):
2313
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2314
+ options=grpc_options)
2315
+ logger.debug('Failed to connect to SSH tunnel for cluster '
2316
+ f'{self.cluster_name!r} on port {tunnel.port}')
2317
+
2318
+ lock_id = backend_utils.cluster_tunnel_lock_id(self.cluster_name)
2319
+ remaining_timeout = backend_utils.CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS
2320
+ start_time = time.perf_counter()
2321
+ attempt = 1
2322
+
2323
+ def _get_remaining_timeout() -> float:
2324
+ return max(0.0,
2325
+ remaining_timeout - (time.perf_counter() - start_time))
2326
+
2327
+ while remaining_timeout > 0:
2328
+ logger.debug(
2329
+ 'Attempting to acquire exclusive lock for %s (attempt %d)',
2330
+ lock_id, attempt)
2331
+ exclusive_lock = locks.get_lock(lock_id, remaining_timeout)
2332
+ try:
2333
+ with exclusive_lock.acquire(blocking=False):
2334
+ wait_elapsed = time.perf_counter() - start_time
2335
+ logger.debug(f'Acquired exclusive lock for {lock_id} after '
2336
+ f'{wait_elapsed:.2f}s')
2337
+ try:
2338
+ tunnel = self._open_and_update_skylet_tunnel()
2339
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2340
+ options=grpc_options)
2341
+ except Exception as e: # pylint: disable=broad-except
2342
+ # Failed to open tunnel, release the lock and retry.
2343
+ logger.warning(f'Failed to open tunnel for cluster '
2344
+ f'{self.cluster_name!r}: '
2345
+ f'{common_utils.format_exception(e)}')
2346
+ remaining_timeout = _get_remaining_timeout()
2347
+ attempt += 1
2348
+ continue
2349
+ except locks.LockTimeout:
2350
+ pass
2651
2351
 
2652
- def _cleanup_ssh_tunnel(self, tunnel_info: SSHTunnelInfo) -> None:
2653
- """Clean up an SSH tunnel by terminating the process."""
2352
+ remaining_timeout = _get_remaining_timeout()
2353
+ logger.debug(f'Could not acquire exclusive lock for {lock_id}, '
2354
+ f'waiting on shared lock (attempt {attempt})')
2355
+ try:
2356
+ # Use shared lock so that concurrent readers can
2357
+ # proceed in parallel.
2358
+ shared_lock = locks.get_lock(lock_id,
2359
+ remaining_timeout,
2360
+ shared_lock=True)
2361
+ # Wait for the exclusive lock to be released.
2362
+ shared_lock.acquire(blocking=True)
2363
+ # We only need the lock for signalling that the new tunnel has
2364
+ # been opened, not for checking the tunnel health.
2365
+ # Same reasoning as why we don't need to grab the lock in
2366
+ # the fast path at the start of this function.
2367
+ shared_lock.release()
2368
+ wait_elapsed = time.perf_counter() - start_time
2369
+ logger.debug(f'Acquired shared lock for {lock_id} after '
2370
+ f'{wait_elapsed:.2f}s')
2371
+ except locks.LockTimeout as e:
2372
+ raise RuntimeError(
2373
+ f'Failed to get gRPC channel for cluster '
2374
+ f'{self.cluster_name!r} due to a timeout when waiting '
2375
+ 'for the SSH tunnel to be opened. Please try again or '
2376
+ f'manually remove the lock at {lock_id}. '
2377
+ f'{common_utils.format_exception(e)}') from e
2378
+
2379
+ # Add small jitter before probing to smoothen the effects
2380
+ # of many readers waking up simultaneously.
2381
+ jitter = random.uniform(0.01, 0.05)
2382
+ time.sleep(jitter)
2383
+
2384
+ # Re-read the tunnel metadata and verify it's healthy.
2385
+ tunnel = self._get_skylet_ssh_tunnel()
2386
+ if tunnel is not None:
2387
+ if _is_tunnel_healthy(tunnel):
2388
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2389
+ options=grpc_options)
2390
+ logger.debug('Failed to connect to SSH tunnel for cluster '
2391
+ f'{self.cluster_name!r} on port {tunnel.port}')
2392
+ # Tunnel is still unhealthy or missing, try again with updated
2393
+ # timeout. This could happen in the case where the thread who
2394
+ # held the exclusive lock to open the tunnel crashed.
2395
+ remaining_timeout = _get_remaining_timeout()
2396
+ attempt += 1
2397
+ raise RuntimeError('Timeout waiting for gRPC channel for cluster '
2398
+ f'{self.cluster_name!r} to be ready.')
2399
+
2400
+ def _terminate_ssh_tunnel_process(self, tunnel_info: SSHTunnelInfo) -> None:
2401
+ """Terminate the SSH tunnel process."""
2654
2402
  try:
2655
2403
  proc = psutil.Process(tunnel_info.pid)
2656
2404
  if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
2657
2405
  logger.debug(
2658
2406
  f'Terminating SSH tunnel process {tunnel_info.pid}')
2659
- proc.terminate()
2660
- try:
2661
- proc.wait(timeout=3)
2662
- except psutil.TimeoutExpired:
2663
- proc.kill()
2664
- proc.wait(timeout=1)
2407
+ subprocess_utils.kill_children_processes(proc.pid)
2665
2408
  except psutil.NoSuchProcess:
2666
2409
  pass
2667
2410
  except Exception as e: # pylint: disable=broad-except
2668
2411
  logger.warning(
2669
2412
  f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
2670
2413
 
2671
- def open_and_update_skylet_tunnel(self) -> None:
2414
+ def _open_and_update_skylet_tunnel(self) -> SSHTunnelInfo:
2672
2415
  """Opens an SSH tunnel to the Skylet on the head node,
2673
2416
  updates the cluster handle, and persists it to the database."""
2674
- local_port = common_utils.find_free_port(10000)
2675
- runners = self.get_command_runners()
2676
- head_runner = runners[0]
2677
- if isinstance(head_runner, command_runner.SSHCommandRunner):
2678
- # Disabling ControlMaster makes things easier to reason about
2679
- # with respect to resource management/ownership,
2680
- # as killing the process will close the tunnel too.
2681
- head_runner.disable_control_master = True
2682
-
2683
- cmd = head_runner.port_forward_command([(local_port,
2684
- constants.SKYLET_GRPC_PORT)])
2685
- ssh_tunnel_proc = subprocess.Popen(cmd)
2686
- tunnel_info = SSHTunnelInfo(port=local_port, pid=ssh_tunnel_proc.pid)
2417
+ max_attempts = 3
2418
+ # There could be a race condition here, as multiple processes may
2419
+ # attempt to open the same port at the same time.
2420
+ for attempt in range(max_attempts):
2421
+ runners = self.get_command_runners()
2422
+ head_runner = runners[0]
2423
+ local_port = random.randint(10000, 65535)
2424
+ try:
2425
+ ssh_tunnel_proc = backend_utils.open_ssh_tunnel(
2426
+ head_runner, (local_port, constants.SKYLET_GRPC_PORT))
2427
+ except exceptions.CommandError as e:
2428
+ # Don't retry if the error is due to timeout,
2429
+ # connection refused, Kubernetes pods not found,
2430
+ # or an in-progress termination.
2431
+ if (e.detailed_reason is not None and
2432
+ (backend_utils.SSH_CONNECTION_ERROR_PATTERN.search(
2433
+ e.detailed_reason) or
2434
+ backend_utils.K8S_PODS_NOT_FOUND_PATTERN.search(
2435
+ e.detailed_reason) or attempt == max_attempts - 1)):
2436
+ raise e
2437
+ logger.warning(
2438
+ f'Failed to open SSH tunnel on port {local_port} '
2439
+ f'({attempt + 1}/{max_attempts}). '
2440
+ f'{e.error_msg}\n{e.detailed_reason}')
2441
+ continue
2442
+ tunnel_info = SSHTunnelInfo(port=local_port,
2443
+ pid=ssh_tunnel_proc.pid)
2444
+ break
2445
+
2687
2446
  try:
2688
2447
  grpc.channel_ready_future(
2689
2448
  grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
2690
2449
  timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
2691
2450
  # Clean up existing tunnel before setting up the new one.
2692
- if self.skylet_ssh_tunnel is not None:
2693
- self._cleanup_ssh_tunnel(self.skylet_ssh_tunnel)
2694
- self.skylet_ssh_tunnel = tunnel_info
2695
- global_user_state.update_cluster_handle(self.cluster_name, self)
2451
+ old_tunnel = self._get_skylet_ssh_tunnel()
2452
+ if old_tunnel is not None:
2453
+ self._terminate_ssh_tunnel_process(old_tunnel)
2454
+ self._set_skylet_ssh_tunnel(tunnel_info)
2455
+ return tunnel_info
2696
2456
  except grpc.FutureTimeoutError as e:
2697
- self._cleanup_ssh_tunnel(tunnel_info)
2457
+ self._terminate_ssh_tunnel_process(tunnel_info)
2698
2458
  logger.warning(
2699
2459
  f'Skylet gRPC channel for cluster {self.cluster_name} not '
2700
2460
  f'ready after {constants.SKYLET_GRPC_TIMEOUT_SECONDS}s')
2701
2461
  raise e
2702
2462
  except Exception as e:
2703
- self._cleanup_ssh_tunnel(tunnel_info)
2463
+ self._terminate_ssh_tunnel_process(tunnel_info)
2704
2464
  raise e
2705
2465
 
2706
2466
  @property
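The `get_grpc_channel` logic added in the hunk above coordinates many readers with a single tunnel-opener: probe the recorded tunnel without any lock, take an exclusive lock to (re)open it, and otherwise wait on the lock, add jitter, and re-probe until a deadline. Below is a minimal in-process sketch of that coordination scheme using a plain `threading.Lock`; the real code uses SkyPilot's `locks` module (with shared-lock support and timeouts) and persists tunnel metadata in the cluster database, so every name here is a stand-in.

```python
import random
import threading
import time
from typing import Optional

# Stand-ins for the per-cluster lock and the tunnel metadata that the real
# code persists in the global user state.
_tunnel_lock = threading.Lock()
_tunnel_port: Optional[int] = None


def _probe(port: Optional[int]) -> bool:
    # Placeholder health check; the real code does a TCP connect to the port.
    return port is not None


def _open_tunnel() -> int:
    # Placeholder for spawning the `ssh -L` process and waiting for readiness.
    time.sleep(0.1)
    return 12345


def get_channel_port(timeout: float = 10.0) -> int:
    """Fast-path read, exclusive open, or wait-and-re-probe."""
    global _tunnel_port
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        # Fast path: reading the recorded tunnel needs no lock.
        if _probe(_tunnel_port):
            return _tunnel_port
        # Try to become the single writer that (re)opens the tunnel.
        if _tunnel_lock.acquire(blocking=False):
            try:
                _tunnel_port = _open_tunnel()
                return _tunnel_port
            finally:
                _tunnel_lock.release()
        # Someone else is opening it: wait for them, add a little jitter so
        # all waiters do not re-probe at the same instant, then loop.
        with _tunnel_lock:
            pass
        time.sleep(random.uniform(0.01, 0.05))
    raise TimeoutError('Timed out waiting for the tunnel to come up.')


print(get_channel_port())
```

The real implementation uses a shared lock for the waiters so that concurrent readers can proceed in parallel once the writer finishes; a plain mutex is used here only to keep the sketch small.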
@@ -2713,6 +2473,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  def cluster_yaml(self, value: Optional[str]):
  self._cluster_yaml = value
 
+ @property
+ def instance_ids(self):
+ if self.cached_cluster_info is not None:
+ return self.cached_cluster_info.instance_ids()
+ return None
+
  @property
  def ssh_user(self):
  if self.cached_cluster_info is not None:
@@ -2752,6 +2518,13 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  """Returns whether this handle has gRPC enabled and gRPC flag is set."""
  return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
 
+ def __getstate__(self):
+ state = self.__dict__.copy()
+ # For backwards compatibility. Refer to
+ # https://github.com/skypilot-org/skypilot/pull/7133
+ state.setdefault('skylet_ssh_tunnel', None)
+ return state
+
  def __setstate__(self, state):
  self._version = self._VERSION
 
@@ -2809,6 +2582,10 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  state['is_grpc_enabled'] = False
  state['skylet_ssh_tunnel'] = None
 
+ if version >= 12:
+ # DEPRECATED in favor of skylet_ssh_tunnel_metadata column in the DB
+ state.pop('skylet_ssh_tunnel', None)
+
  self.__dict__.update(state)
 
  # Because the update_cluster_ips and update_ssh_ports
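The two hunks above rely on the versioned-pickle pattern: `_VERSION` is bumped when fields change, `__getstate__` keeps deprecated fields so older readers can still unpickle the handle, and `__setstate__` upgrades or drops fields coming from older payloads. A generic sketch of the pattern with made-up class and field names, assuming nothing about the real handle:

```python
import pickle


class Handle:
    """Toy pickle-versioned handle; names are illustrative only."""
    # Bump when fields change; __setstate__ upgrades older payloads.
    _VERSION = 2

    def __init__(self, name: str):
        self._version = self._VERSION
        self.name = name
        self.is_grpc_enabled = True  # field introduced in version 2

    def __getstate__(self):
        state = self.__dict__.copy()
        # Keep a field that older readers still expect to find when they
        # unpickle this object (forward compatibility).
        state.setdefault('legacy_field', None)
        return state

    def __setstate__(self, state):
        version = state.get('_version', 1)
        if version < 2:
            # Field did not exist in version 1 pickles; supply a default.
            state['is_grpc_enabled'] = False
        # No longer stored on the live object.
        state.pop('legacy_field', None)
        state['_version'] = self._VERSION
        self.__dict__.update(state)


restored = pickle.loads(pickle.dumps(Handle('my-cluster')))
print(restored.is_grpc_enabled, restored._version)  # True 2
```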
@@ -2886,21 +2663,180 @@ class SkyletClient:
2886
2663
 
2887
2664
  def __init__(self, channel: 'grpc.Channel'):
2888
2665
  self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
2666
+ self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
2667
+ self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
2668
+ self._managed_jobs_stub = (
2669
+ managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel))
2889
2670
 
2890
2671
  def set_autostop(
2891
2672
  self,
2892
2673
  request: 'autostopv1_pb2.SetAutostopRequest',
2893
- timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2674
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2894
2675
  ) -> 'autostopv1_pb2.SetAutostopResponse':
2895
2676
  return self._autostop_stub.SetAutostop(request, timeout=timeout)
2896
2677
 
2897
2678
  def is_autostopping(
2898
2679
  self,
2899
2680
  request: 'autostopv1_pb2.IsAutostoppingRequest',
2900
- timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2681
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2901
2682
  ) -> 'autostopv1_pb2.IsAutostoppingResponse':
2902
2683
  return self._autostop_stub.IsAutostopping(request, timeout=timeout)
2903
2684
 
2685
+ def add_job(
2686
+ self,
2687
+ request: 'jobsv1_pb2.AddJobRequest',
2688
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2689
+ ) -> 'jobsv1_pb2.AddJobResponse':
2690
+ return self._jobs_stub.AddJob(request, timeout=timeout)
2691
+
2692
+ def queue_job(
2693
+ self,
2694
+ request: 'jobsv1_pb2.QueueJobRequest',
2695
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2696
+ ) -> 'jobsv1_pb2.QueueJobResponse':
2697
+ return self._jobs_stub.QueueJob(request, timeout=timeout)
2698
+
2699
+ def update_status(
2700
+ self,
2701
+ request: 'jobsv1_pb2.UpdateStatusRequest',
2702
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2703
+ ) -> 'jobsv1_pb2.UpdateStatusResponse':
2704
+ return self._jobs_stub.UpdateStatus(request, timeout=timeout)
2705
+
2706
+ def get_job_queue(
2707
+ self,
2708
+ request: 'jobsv1_pb2.GetJobQueueRequest',
2709
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2710
+ ) -> 'jobsv1_pb2.GetJobQueueResponse':
2711
+ return self._jobs_stub.GetJobQueue(request, timeout=timeout)
2712
+
2713
+ def cancel_jobs(
2714
+ self,
2715
+ request: 'jobsv1_pb2.CancelJobsRequest',
2716
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2717
+ ) -> 'jobsv1_pb2.CancelJobsResponse':
2718
+ return self._jobs_stub.CancelJobs(request, timeout=timeout)
2719
+
2720
+ def fail_all_in_progress_jobs(
2721
+ self,
2722
+ request: 'jobsv1_pb2.FailAllInProgressJobsRequest',
2723
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2724
+ ) -> 'jobsv1_pb2.FailAllInProgressJobsResponse':
2725
+ return self._jobs_stub.FailAllInProgressJobs(request, timeout=timeout)
2726
+
2727
+ def get_job_status(
2728
+ self,
2729
+ request: 'jobsv1_pb2.GetJobStatusRequest',
2730
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2731
+ ) -> 'jobsv1_pb2.GetJobStatusResponse':
2732
+ return self._jobs_stub.GetJobStatus(request, timeout=timeout)
2733
+
2734
+ def get_job_submitted_timestamp(
2735
+ self,
2736
+ request: 'jobsv1_pb2.GetJobSubmittedTimestampRequest',
2737
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2738
+ ) -> 'jobsv1_pb2.GetJobSubmittedTimestampResponse':
2739
+ return self._jobs_stub.GetJobSubmittedTimestamp(request,
2740
+ timeout=timeout)
2741
+
2742
+ def get_job_ended_timestamp(
2743
+ self,
2744
+ request: 'jobsv1_pb2.GetJobEndedTimestampRequest',
2745
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2746
+ ) -> 'jobsv1_pb2.GetJobEndedTimestampResponse':
2747
+ return self._jobs_stub.GetJobEndedTimestamp(request, timeout=timeout)
2748
+
2749
+ def get_log_dirs_for_jobs(
2750
+ self,
2751
+ request: 'jobsv1_pb2.GetLogDirsForJobsRequest',
2752
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2753
+ ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
2754
+ return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
2755
+
2756
+ def tail_logs(
2757
+ self,
2758
+ request: 'jobsv1_pb2.TailLogsRequest',
2759
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2760
+ ) -> Iterator['jobsv1_pb2.TailLogsResponse']:
2761
+ return self._jobs_stub.TailLogs(request, timeout=timeout)
2762
+
2763
+ def get_service_status(
2764
+ self,
2765
+ request: 'servev1_pb2.GetServiceStatusRequest',
2766
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2767
+ ) -> 'servev1_pb2.GetServiceStatusResponse':
2768
+ return self._serve_stub.GetServiceStatus(request, timeout=timeout)
2769
+
2770
+ def add_serve_version(
2771
+ self,
2772
+ request: 'servev1_pb2.AddVersionRequest',
2773
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2774
+ ) -> 'servev1_pb2.AddVersionResponse':
2775
+ return self._serve_stub.AddVersion(request, timeout=timeout)
2776
+
2777
+ def terminate_services(
2778
+ self,
2779
+ request: 'servev1_pb2.TerminateServicesRequest',
2780
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2781
+ ) -> 'servev1_pb2.TerminateServicesResponse':
2782
+ return self._serve_stub.TerminateServices(request, timeout=timeout)
2783
+
2784
+ def terminate_replica(
2785
+ self,
2786
+ request: 'servev1_pb2.TerminateReplicaRequest',
2787
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2788
+ ) -> 'servev1_pb2.TerminateReplicaResponse':
2789
+ return self._serve_stub.TerminateReplica(request, timeout=timeout)
2790
+
2791
+ def wait_service_registration(
2792
+ self,
2793
+ request: 'servev1_pb2.WaitServiceRegistrationRequest',
2794
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2795
+ ) -> 'servev1_pb2.WaitServiceRegistrationResponse':
2796
+ # set timeout to at least 10 seconds more than service register
2797
+ # constant to make sure that timeouts will not occur.
2798
+ if timeout is not None:
2799
+ timeout = max(timeout,
2800
+ serve_constants.SERVICE_REGISTER_TIMEOUT_SECONDS + 10)
2801
+ return self._serve_stub.WaitServiceRegistration(request,
2802
+ timeout=timeout)
2803
+
2804
+ def update_service(
2805
+ self,
2806
+ request: 'servev1_pb2.UpdateServiceRequest',
2807
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2808
+ ) -> 'servev1_pb2.UpdateServiceResponse':
2809
+ return self._serve_stub.UpdateService(request, timeout=timeout)
2810
+
2811
+ def get_managed_job_controller_version(
2812
+ self,
2813
+ request: 'managed_jobsv1_pb2.GetVersionRequest',
2814
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2815
+ ) -> 'managed_jobsv1_pb2.GetVersionResponse':
2816
+ return self._managed_jobs_stub.GetVersion(request, timeout=timeout)
2817
+
2818
+ def get_managed_job_table(
2819
+ self,
2820
+ request: 'managed_jobsv1_pb2.GetJobTableRequest',
2821
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2822
+ ) -> 'managed_jobsv1_pb2.GetJobTableResponse':
2823
+ return self._managed_jobs_stub.GetJobTable(request, timeout=timeout)
2824
+
2825
+ def get_all_managed_job_ids_by_name(
2826
+ self,
2827
+ request: 'managed_jobsv1_pb2.GetAllJobIdsByNameRequest',
2828
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2829
+ ) -> 'managed_jobsv1_pb2.GetAllJobIdsByNameResponse':
2830
+ return self._managed_jobs_stub.GetAllJobIdsByName(request,
2831
+ timeout=timeout)
2832
+
2833
+ def cancel_managed_jobs(
2834
+ self,
2835
+ request: 'managed_jobsv1_pb2.CancelJobsRequest',
2836
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2837
+ ) -> 'managed_jobsv1_pb2.CancelJobsResponse':
2838
+ return self._managed_jobs_stub.CancelJobs(request, timeout=timeout)
2839
+
2904
2840
 
2905
2841
  @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
2906
2842
  class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
@@ -2931,6 +2867,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2931
2867
  self._requested_features = set()
2932
2868
  self._dump_final_script = False
2933
2869
  self._is_managed = False
2870
+ # Optional planner (via register_info): used under the per-cluster lock
2871
+ # to produce a fresh concrete plan when neither a reusable snapshot nor
2872
+ # a caller plan is available.
2873
+ self._planner = None
2934
2874
 
2935
2875
  # Command for running the setup script. It is only set when the
2936
2876
  # setup needs to be run outside the self._setup() and as part of
@@ -2948,6 +2888,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2948
2888
  self._requested_features)
2949
2889
  self._dump_final_script = kwargs.pop('dump_final_script', False)
2950
2890
  self._is_managed = kwargs.pop('is_managed', False)
2891
+ # Optional planner callback for a fresh plan under lock when no
2892
+ # reusable snapshot/caller plan exists. Keeps optimizer in upper layer.
2893
+ self._planner = kwargs.pop('planner', self._planner)
2951
2894
  assert not kwargs, f'Unexpected kwargs: {kwargs}'
2952
2895
 
2953
2896
  def check_resources_fit_cluster(
@@ -2974,9 +2917,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # Usage Collection:
  usage_lib.messages.usage.update_cluster_resources(
  handle.launched_nodes, launched_resources)
- record = global_user_state.get_cluster_from_name(cluster_name)
- if record is not None:
- usage_lib.messages.usage.update_cluster_status(record['status'])
+ status = global_user_state.get_status_from_cluster_name(cluster_name)
+ if status is not None:
+ usage_lib.messages.usage.update_cluster_status(status)
 
  assert launched_resources.region is not None, handle
 
@@ -3115,7 +3058,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3115
3058
  colorama.Style.RESET_ALL +
3116
3059
  colorama.Style.DIM +
3117
3060
  'Check concurrent requests: ' +
3118
- 'sky api status '))
3061
+ 'sky api status -v | grep '
3062
+ f'{cluster_name}'))
3119
3063
 
3120
3064
  def _locked_provision(
3121
3065
  self,
@@ -3172,8 +3116,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3172
3116
  try:
3173
3117
  retry_provisioner = RetryingVmProvisioner(
3174
3118
  self.log_dir,
3175
- self._dag,
3176
- self._optimize_target,
3119
+ self._dag, # type: ignore[arg-type]
3120
+ self._optimize_target, # type: ignore[arg-type]
3177
3121
  self._requested_features,
3178
3122
  local_wheel_path,
3179
3123
  wheel_hash,
@@ -3204,9 +3148,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3204
3148
  gap_seconds = _RETRY_UNTIL_UP_INIT_GAP_SECONDS
3205
3149
  retry_message = ux_utils.retry_message(
3206
3150
  f'Retry after {gap_seconds:.0f}s ')
3207
- hint_message = (f'\n{retry_message} '
3208
- f'{ux_utils.log_path_hint(log_path)}'
3209
- f'{colorama.Style.RESET_ALL}')
3151
+ hint_message = (
3152
+ f'\n{retry_message} '
3153
+ f'{ux_utils.provision_hint(cluster_name)}'
3154
+ f'{colorama.Style.RESET_ALL}')
3210
3155
 
3211
3156
  # Add cluster event for retry.
3212
3157
  global_user_state.add_cluster_event(
@@ -3235,7 +3180,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3235
3180
  logger.error(
3236
3181
  ux_utils.error_message(
3237
3182
  'Failed to provision resources. '
3238
- f'{ux_utils.log_path_hint(log_path)}'))
3183
+ f'{ux_utils.provision_hint(cluster_name)}'))
3239
3184
  error_message += (
3240
3185
  '\nTo keep retrying until the cluster is up, use '
3241
3186
  'the `--retry-until-up` flag.')
@@ -3244,8 +3189,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3244
3189
  error_message + '\n' + str(e),
3245
3190
  failover_history=e.failover_history) from None
3246
3191
  if dryrun:
3247
- record = global_user_state.get_cluster_from_name(cluster_name)
3248
- return record['handle'] if record is not None else None, False
3192
+ handle = global_user_state.get_handle_from_cluster_name(
3193
+ cluster_name)
3194
+ return handle if handle is not None else None, False
3249
3195
 
3250
3196
  if config_dict['provisioning_skipped']:
3251
3197
  # Skip further provisioning.
@@ -3253,10 +3199,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3253
3199
  # ('handle', 'provision_record', 'resources_vars')
3254
3200
  # We need to return the handle - but it should be the existing
3255
3201
  # handle for the cluster.
3256
- record = global_user_state.get_cluster_from_name(cluster_name)
3257
- assert record is not None and record['handle'] is not None, (
3258
- cluster_name, record)
3259
- return record['handle'], True
3202
+ handle = global_user_state.get_handle_from_cluster_name(
3203
+ cluster_name)
3204
+ assert handle is not None, (cluster_name, handle)
3205
+ return handle, True
3260
3206
 
3261
3207
  if 'provision_record' in config_dict:
3262
3208
  # New provisioner is used here.
@@ -3279,7 +3225,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3279
3225
  global_user_state.ClusterEventType.STATUS_CHANGE)
3280
3226
 
3281
3227
  cluster_info = provisioner.post_provision_runtime_setup(
3282
- repr(handle.launched_resources.cloud),
3228
+ handle.launched_resources,
3283
3229
  resources_utils.ClusterName(handle.cluster_name,
3284
3230
  handle.cluster_name_on_cloud),
3285
3231
  handle.cluster_yaml,
@@ -3293,6 +3239,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3293
3239
  # manually or by the cloud provider.
3294
3240
  # Optimize the case where the cluster's IPs can be retrieved
3295
3241
  # from cluster_info.
3242
+ handle.cached_cluster_info = cluster_info
3296
3243
  handle.docker_user = cluster_info.docker_user
3297
3244
  handle.update_cluster_ips(max_attempts=_FETCH_IP_MAX_ATTEMPTS,
3298
3245
  cluster_info=cluster_info)
@@ -3304,7 +3251,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3304
3251
 
3305
3252
  self._update_after_cluster_provisioned(
3306
3253
  handle, to_provision_config.prev_handle, task,
3307
- prev_cluster_status, lock_id, config_hash)
3254
+ prev_cluster_status, config_hash)
3308
3255
  return handle, False
3309
3256
 
3310
3257
  cluster_config_file = config_dict['ray']
@@ -3376,7 +3323,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3376
3323
 
3377
3324
  self._update_after_cluster_provisioned(
3378
3325
  handle, to_provision_config.prev_handle, task,
3379
- prev_cluster_status, lock_id, config_hash)
3326
+ prev_cluster_status, config_hash)
3380
3327
  return handle, False
3381
3328
 
3382
3329
  def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -3394,7 +3341,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3394
3341
  prev_handle: Optional[CloudVmRayResourceHandle],
3395
3342
  task: task_lib.Task,
3396
3343
  prev_cluster_status: Optional[status_lib.ClusterStatus],
3397
- lock_id: str, config_hash: str) -> None:
3344
+ config_hash: str) -> None:
3398
3345
  usage_lib.messages.usage.update_cluster_resources(
3399
3346
  handle.launched_nodes, handle.launched_resources)
3400
3347
  usage_lib.messages.usage.update_final_cluster_status(
@@ -3406,16 +3353,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3406
3353
  # update_status will query the ray job status for all INIT /
3407
3354
  # PENDING / RUNNING jobs for the real status, since we do not
3408
3355
  # know the actual previous status of the cluster.
3409
- cmd = job_lib.JobLibCodeGen.update_status()
3410
3356
  logger.debug('Update job queue on remote cluster.')
3411
3357
  with rich_utils.safe_status(
3412
3358
  ux_utils.spinner_message('Preparing SkyPilot runtime')):
3413
- returncode, _, stderr = self.run_on_head(handle,
3414
- cmd,
3415
- require_outputs=True)
3416
- subprocess_utils.handle_returncode(returncode, cmd,
3417
- 'Failed to update job status.',
3418
- stderr)
3359
+ use_legacy = not handle.is_grpc_enabled_with_flag
3360
+
3361
+ if not use_legacy:
3362
+ try:
3363
+ request = jobsv1_pb2.UpdateStatusRequest()
3364
+ backend_utils.invoke_skylet_with_retries(
3365
+ lambda: SkyletClient(handle.get_grpc_channel()
3366
+ ).update_status(request))
3367
+ except exceptions.SkyletMethodNotImplementedError:
3368
+ use_legacy = True
3369
+
3370
+ if use_legacy:
3371
+ cmd = job_lib.JobLibCodeGen.update_status()
3372
+ returncode, _, stderr = self.run_on_head(
3373
+ handle, cmd, require_outputs=True)
3374
+ subprocess_utils.handle_returncode(
3375
+ returncode, cmd, 'Failed to update job status.', stderr)
3419
3376
  if prev_cluster_status == status_lib.ClusterStatus.STOPPED:
3420
3377
  # Safely set all the previous jobs to FAILED since the cluster
3421
3378
  # is restarted
@@ -3423,14 +3380,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3423
3380
  # 1. A job finishes RUNNING, but right before it update itself
3424
3381
  # to SUCCEEDED, the cluster is STOPPED by `sky stop`.
3425
3382
  # 2. On next `sky start`, it gets reset to FAILED.
3426
- cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
3427
- returncode, stdout, stderr = self.run_on_head(handle,
3428
- cmd,
3429
- require_outputs=True)
3430
- subprocess_utils.handle_returncode(
3431
- returncode, cmd,
3432
- 'Failed to set previously in-progress jobs to FAILED',
3433
- stdout + stderr)
3383
+ use_legacy = not handle.is_grpc_enabled_with_flag
3384
+
3385
+ if not use_legacy:
3386
+ try:
3387
+ fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
3388
+ backend_utils.invoke_skylet_with_retries(
3389
+ lambda: SkyletClient(handle.get_grpc_channel(
3390
+ )).fail_all_in_progress_jobs(fail_request))
3391
+ except exceptions.SkyletMethodNotImplementedError:
3392
+ use_legacy = True
3393
+
3394
+ if use_legacy:
3395
+ cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
3396
+ returncode, stdout, stderr = self.run_on_head(
3397
+ handle, cmd, require_outputs=True)
3398
+ subprocess_utils.handle_returncode(
3399
+ returncode, cmd,
3400
+ 'Failed to set previously in-progress jobs to FAILED',
3401
+ stdout + stderr)
3434
3402
 
3435
3403
  prev_ports = None
3436
3404
  if prev_handle is not None:
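The two hunks above apply the compatibility pattern that recurs throughout this file: prefer the Skylet gRPC call, and fall back to the legacy generated-code-over-SSH path when the remote daemon predates the RPC (signalled by `exceptions.SkyletMethodNotImplementedError`). A schematic sketch of that control flow; the exception class and both calls below are stand-ins, not real SkyPilot APIs:

```python
class NotImplementedOnRemoteError(Exception):
    """Stand-in for exceptions.SkyletMethodNotImplementedError."""


def grpc_update_status() -> None:
    # Pretend the remote Skylet predates this RPC.
    raise NotImplementedOnRemoteError()


def legacy_update_status_over_ssh() -> None:
    print('fell back to generated code executed over SSH')


def update_status(grpc_enabled: bool) -> None:
    use_legacy = not grpc_enabled
    if not use_legacy:
        try:
            grpc_update_status()
            return
        except NotImplementedOnRemoteError:
            # Older remote runtime: quietly downgrade to the legacy path.
            use_legacy = True
    if use_legacy:
        legacy_update_status_over_ssh()


update_status(grpc_enabled=True)
```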
@@ -3485,8 +3453,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3485
3453
  handle.cached_external_ssh_ports, handle.docker_user,
3486
3454
  handle.ssh_user)
3487
3455
 
3488
- locks.get_lock(lock_id).force_unlock()
3489
-
3490
3456
  def _sync_workdir(self, handle: CloudVmRayResourceHandle,
3491
3457
  workdir: Union[Path, Dict[str, Any]],
3492
3458
  envs_and_secrets: Dict[str, str]) -> None:
@@ -3618,6 +3584,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  self._set_storage_mounts_metadata(handle.cluster_name,
  storage_mounts)
 
+ def _get_num_gpus(self, task: task_lib.Task) -> int:
+ if task.resources is not None:
+ for resource in task.resources:
+ if (resource.accelerators is not None and
+ isinstance(resource.accelerators, dict)):
+ if len(resource.accelerators) > 0:
+ return math.ceil(
+ list(resource.accelerators.values())[0])
+ return 0
+
  def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
  detach_setup: bool) -> None:
  start = time.time()
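`_get_num_gpus`, added just above, reports the per-node accelerator count of the first task resource that specifies one, rounded up for fractional requests. A dict-based illustration of the same lookup (the real code iterates `sky.Resources` objects, not plain dicts):

```python
import math
from typing import Dict, Iterable, Optional


def num_gpus_per_node(resources: Iterable[Optional[Dict[str, float]]]) -> int:
    """Rounded-up accelerator count of the first entry that has one."""
    for accelerators in resources:
        if accelerators:
            # e.g. {'A100': 0.5} -> 1; {'H100': 8} -> 8
            return math.ceil(next(iter(accelerators.values())))
    return 0


print(num_gpus_per_node([None, {'A100': 0.5}]))  # -> 1
```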
@@ -3630,13 +3606,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3630
3606
  remote_setup_file_name = f'/tmp/sky_setup_{self.run_timestamp}'
3631
3607
  # Need this `-i` option to make sure `source ~/.bashrc` work
3632
3608
  setup_cmd = f'/bin/bash -i {remote_setup_file_name} 2>&1'
3609
+ unset_ray_env_vars = ' && '.join(
3610
+ [f'unset {var}' for var in task_codegen.UNSET_RAY_ENV_VARS])
3611
+ setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
3633
3612
  runners = handle.get_command_runners(avoid_ssh_control=True)
3634
3613
 
3635
3614
  def _setup_node(node_id: int) -> None:
3636
- setup_envs = task.envs_and_secrets
3615
+ setup_envs = task_lib.get_plaintext_envs_and_secrets(
3616
+ task.envs_and_secrets)
3637
3617
  setup_envs.update(self._skypilot_predefined_env_vars(handle))
3638
3618
  setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
3639
3619
  setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
3620
+ setup_envs[constants.SKYPILOT_SETUP_NUM_GPUS_PER_NODE] = (str(
3621
+ self._get_num_gpus(task)))
3622
+
3640
3623
  runner = runners[node_id]
3641
3624
  setup_script = log_lib.make_task_bash_script(setup,
3642
3625
  env_vars=setup_envs)
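The hunk above prefixes the setup invocation with a chain of `unset` commands so Ray-related variables inherited from the driver environment do not leak into user setup scripts. A small sketch of how such a command string is assembled; the variable list is an assumed example, the real one lives in `task_codegen.UNSET_RAY_ENV_VARS`:

```python
from typing import List

# Assumed example values; not the authoritative list.
UNSET_RAY_ENV_VARS: List[str] = ['RAY_ADDRESS', 'RAY_JOB_ID']


def build_setup_cmd(remote_setup_file: str) -> str:
    """Prefix the setup invocation so inherited Ray variables do not leak in."""
    unset_prefix = ' && '.join(f'unset {var}' for var in UNSET_RAY_ENV_VARS)
    # `-i` keeps interactive init files (e.g. ~/.bashrc) in effect.
    return f'{unset_prefix}; /bin/bash -i {remote_setup_file} 2>&1'


print(build_setup_cmd('/tmp/sky_setup_20240101'))
# unset RAY_ADDRESS && unset RAY_JOB_ID; /bin/bash -i /tmp/sky_setup_20240101 2>&1
```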
@@ -3693,29 +3676,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3693
3676
 
3694
3677
  returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
3695
3678
 
3696
- def _load_setup_log_and_match(match_str: str) -> bool:
3697
- try:
3698
- with open(os.path.expanduser(setup_log_path),
3699
- 'r',
3700
- encoding='utf-8') as f:
3701
- return match_str.lower() in f.read().lower()
3702
- except Exception as e: # pylint: disable=broad-except
3703
- # We don't crash the setup if we cannot read the log file.
3704
- # Instead, we should retry the setup with dumping the script
3705
- # to a file to be safe.
3706
- logger.debug(
3707
- f'Failed to read setup log file {setup_log_path}: {e}')
3708
- return True
3709
-
3710
- if ((returncode == 255 and _load_setup_log_and_match('too long')) or
3711
- (returncode == 1 and
3712
- _load_setup_log_and_match('request-uri too large'))):
3713
- # If the setup script is too long, we retry it with dumping
3714
- # the script to a file and running it with SSH. We use a
3715
- # general length limit check before but it could be
3716
- # inaccurate on some systems.
3717
- # When there is a cloudflare proxy in front of the remote, it
3718
- # could cause `414 Request-URI Too Large` error.
3679
+ if _is_message_too_long(returncode, file_path=setup_log_path):
3680
+ # If the setup script is too long, we need to retry it
3681
+ # with dumping the script to a file and running it the script
3682
+ # on remote cluster instead.
3719
3683
  logger.debug('Failed to run setup command inline due to '
3720
3684
  'command length limit. Dumping setup script to '
3721
3685
  'file and running it with SSH.')
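The inline log matching removed above is folded into a `_is_message_too_long` helper whose definition is not part of this hunk. A plausible reconstruction from the deleted logic, shown only to make the retry condition concrete; the signature and behavior are inferred, not taken from the package:

```python
import os
from typing import Optional


def is_message_too_long(returncode: int,
                        output: Optional[str] = None,
                        file_path: Optional[str] = None) -> bool:
    """Best-effort guess that a command failed because it was too long.

    Exit code 255 with 'too long' typically means the shell/SSH rejected an
    oversized argv; exit code 1 with 'Request-URI Too Large' shows up when a
    proxy (e.g. Cloudflare) sits in front of the remote endpoint.
    """
    if output is None:
        if file_path is None:
            return False
        try:
            with open(os.path.expanduser(file_path), 'r',
                      encoding='utf-8') as f:
                output = f.read()
        except OSError:
            # Cannot read the log: err on the side of retrying via a file.
            return True
    text = output.lower()
    return ((returncode == 255 and 'too long' in text) or
            (returncode == 1 and 'request-uri too large' in text))


print(is_message_too_long(255, output='Argument list too long'))  # True
```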
@@ -3779,119 +3743,180 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3779
3743
  logger.info(
3780
3744
  ux_utils.finishing_message('Setup completed.', setup_log_path))
3781
3745
 
3746
+ def _download_file(self, handle: CloudVmRayResourceHandle,
3747
+ local_file_path: str, remote_file_path: str) -> None:
3748
+ """Syncs file from remote to local."""
3749
+ runners = handle.get_command_runners()
3750
+ head_runner = runners[0]
3751
+ head_runner.rsync(
3752
+ source=local_file_path,
3753
+ target=remote_file_path,
3754
+ up=False,
3755
+ stream_logs=False,
3756
+ )
3757
+
3782
3758
  def _exec_code_on_head(
3783
3759
  self,
3784
3760
  handle: CloudVmRayResourceHandle,
3785
3761
  codegen: str,
3786
3762
  job_id: int,
3787
- detach_run: bool = False,
3788
3763
  managed_job_dag: Optional['dag.Dag'] = None,
3764
+ managed_job_user_id: Optional[str] = None,
3789
3765
  remote_log_dir: Optional[str] = None,
3790
3766
  ) -> None:
3791
3767
  """Executes generated code on the head node."""
3792
- script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
3768
+ use_legacy = not handle.is_grpc_enabled_with_flag
3769
+ file_name = f'sky_job_{job_id}'
3770
+ script_path = os.path.join(SKY_REMOTE_APP_DIR, file_name)
3793
3771
  if remote_log_dir is None:
3794
3772
  remote_log_dir = self.log_dir
3795
3773
  remote_log_path = os.path.join(remote_log_dir, 'run.log')
3796
3774
 
3797
- cd = f'cd {SKY_REMOTE_WORKDIR}'
3775
+ def _dump_code_to_file(codegen: str,
3776
+ target_dir: str = SKY_REMOTE_APP_DIR) -> None:
3777
+ runners = handle.get_command_runners()
3778
+ head_runner = runners[0]
3779
+ with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
3780
+ fp.write(codegen)
3781
+ fp.flush()
3782
+ script_path = os.path.join(target_dir, file_name)
3783
+ # We choose to sync code + exec, because the alternative of
3784
+ # 'ray submit' may not work as it may use system python
3785
+ # (python2) to execute the script. Happens for AWS.
3786
+ head_runner.rsync(source=fp.name,
3787
+ target=script_path,
3788
+ up=True,
3789
+ stream_logs=False)
3798
3790
 
3791
+ cd = f'cd {SKY_REMOTE_WORKDIR}'
3799
3792
  mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
3800
3793
  f'touch {remote_log_path}')
3801
3794
  encoded_script = shlex.quote(codegen)
3802
3795
  create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
3803
3796
  job_submit_cmd = (
3804
- # JOB_CMD_IDENTIFIER is used for identifying the process retrieved
3805
- # with pid is the same driver process.
3797
+ # JOB_CMD_IDENTIFIER is used for identifying the process
3798
+ # retrieved with pid is the same driver process.
3806
3799
  f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
3807
3800
  f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
3808
3801
  # Do not use &>, which is not POSIX and may not work.
3809
3802
  # Note that the order of ">filename 2>&1" matters.
3810
3803
  f'> {remote_log_path} 2>&1')
3811
-
3812
3804
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
3813
3805
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
3814
3806
 
3815
- def _dump_code_to_file(codegen: str,
3816
- target_dir: str = SKY_REMOTE_APP_DIR) -> None:
3817
- runners = handle.get_command_runners()
3818
- head_runner = runners[0]
3819
- with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
3820
- fp.write(codegen)
3821
- fp.flush()
3822
- script_path = os.path.join(target_dir, f'sky_job_{job_id}')
3823
- # We choose to sync code + exec, because the alternative of 'ray
3824
- # submit' may not work as it may use system python (python2) to
3825
- # execute the script. Happens for AWS.
3826
- head_runner.rsync(source=fp.name,
3827
- target=script_path,
3828
- up=True,
3829
- stream_logs=False)
3830
-
3831
3807
  # Should also be ealier than _is_command_length_over_limit
3832
3808
  # Same reason as in _setup
3833
3809
  if self._dump_final_script:
3834
3810
  _dump_code_to_file(job_submit_cmd,
3835
3811
  constants.PERSISTENT_RUN_SCRIPT_DIR)
3836
3812
 
3837
- if _is_command_length_over_limit(job_submit_cmd):
3838
- _dump_code_to_file(codegen)
3839
- job_submit_cmd = f'{mkdir_code} && {code}'
3840
-
3841
- def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
3842
- if managed_job_dag is not None:
3843
- # Add the managed job to job queue database.
3844
- managed_job_codegen = managed_jobs.ManagedJobCodeGen()
3845
- managed_job_code = managed_job_codegen.set_pending(
3846
- job_id,
3847
- managed_job_dag,
3848
- skypilot_config.get_active_workspace(
3849
- force_user_workspace=True),
3850
- entrypoint=common_utils.get_current_command())
3851
- # Set the managed job to PENDING state to make sure that this
3852
- # managed job appears in the `sky jobs queue`, even if it needs
3853
- # to wait to be submitted.
3854
- # We cannot set the managed job to PENDING state in the job
3855
- # template (jobs-controller.yaml.j2), as it may need to wait for
3856
- # the run commands to be scheduled on the job controller in
3857
- # high-load cases.
3858
- job_submit_cmd += ' && ' + managed_job_code
3859
- return job_submit_cmd
3860
-
3861
- job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3813
+ if not use_legacy:
3814
+ try:
3815
+ managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
3816
+ if managed_job_dag is not None:
3817
+ workspace = skypilot_config.get_active_workspace(
3818
+ force_user_workspace=True)
3819
+ entrypoint = common_utils.get_current_command()
3820
+
3821
+ managed_job_tasks: List[jobsv1_pb2.ManagedJobTask] = []
3822
+ for task_id, task in enumerate(managed_job_dag.tasks):
3823
+ resources_str = backend_utils.get_task_resources_str(
3824
+ task, is_managed_job=True)
3825
+ managed_job_tasks.append(
3826
+ jobsv1_pb2.ManagedJobTask(
3827
+ task_id=task_id,
3828
+ name=task.name,
3829
+ resources_str=resources_str,
3830
+ metadata_json=task.metadata_json))
3831
+
3832
+ managed_job_info = jobsv1_pb2.ManagedJobInfo(
3833
+ name=managed_job_dag.name,
3834
+ pool=managed_job_dag.pool,
3835
+ workspace=workspace,
3836
+ entrypoint=entrypoint,
3837
+ tasks=managed_job_tasks,
3838
+ user_id=managed_job_user_id)
3839
+
3840
+ if _is_command_length_over_limit(codegen):
3841
+ _dump_code_to_file(codegen)
3842
+ queue_job_request = jobsv1_pb2.QueueJobRequest(
3843
+ job_id=job_id,
3844
+ # codegen not set - server assumes script uploaded
3845
+ remote_log_dir=remote_log_dir,
3846
+ managed_job=managed_job_info,
3847
+ script_path=script_path)
3848
+ else:
3849
+ queue_job_request = jobsv1_pb2.QueueJobRequest(
3850
+ job_id=job_id,
3851
+ codegen=codegen,
3852
+ remote_log_dir=remote_log_dir,
3853
+ managed_job=managed_job_info,
3854
+ script_path=script_path)
3855
+
3856
+ backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
3857
+ handle.get_grpc_channel()).queue_job(queue_job_request))
3858
+ except exceptions.SkyletMethodNotImplementedError:
3859
+ use_legacy = True
3860
+
3861
+ if use_legacy:
3862
+ if _is_command_length_over_limit(job_submit_cmd):
3863
+ _dump_code_to_file(codegen)
3864
+ job_submit_cmd = f'{mkdir_code} && {code}'
3865
+
3866
+ def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
3867
+ if managed_job_dag is not None:
3868
+ # Add the managed job to job queue database.
3869
+ managed_job_codegen = managed_jobs.ManagedJobCodeGen()
3870
+ managed_job_code = managed_job_codegen.set_pending(
3871
+ job_id,
3872
+ managed_job_dag,
3873
+ skypilot_config.get_active_workspace(
3874
+ force_user_workspace=True),
3875
+ entrypoint=common_utils.get_current_command(),
3876
+ user_hash=managed_job_user_id)
3877
+ # Set the managed job to PENDING state to make sure that
3878
+ # this managed job appears in the `sky jobs queue`, even
3879
+ # if it needs to wait to be submitted.
3880
+ # We cannot set the managed job to PENDING state in the
3881
+ # job template (jobs-controller.yaml.j2), as it may need
3882
+ # to wait for the run commands to be scheduled on the job
3883
+ # controller in high-load cases.
3884
+ job_submit_cmd += ' && ' + managed_job_code
3885
+ return job_submit_cmd
3862
3886
 
3863
- returncode, stdout, stderr = self.run_on_head(handle,
3864
- job_submit_cmd,
3865
- stream_logs=False,
3866
- require_outputs=True)
3867
- # Happens when someone calls `sky exec` but remote is outdated for
3868
- # running a job. Necessitating calling `sky launch`.
3869
- backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3870
- handle.cluster_name)
3871
- output = stdout + stderr
3872
- if ((returncode == 255 and 'too long' in output.lower()) or
3873
- (returncode == 1 and 'request-uri too large' in output.lower())):
3874
- # If the generated script is too long, we retry it with dumping
3875
- # the script to a file and running it with SSH. We use a general
3876
- # length limit check before but it could be inaccurate on some
3877
- # systems.
3878
- # When there is a cloudflare proxy in front of the remote, it could
3879
- # cause `414 Request-URI Too Large` error.
3880
- logger.debug('Failed to submit job due to command length limit. '
3881
- 'Dumping job to file and running it with SSH. '
3882
- f'Output: {output}')
3883
- _dump_code_to_file(codegen)
3884
- job_submit_cmd = f'{mkdir_code} && {code}'
3885
3887
  job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3888
+
3886
3889
  returncode, stdout, stderr = self.run_on_head(handle,
3887
3890
  job_submit_cmd,
3888
3891
  stream_logs=False,
3889
3892
  require_outputs=True)
3893
+ # Happens when someone calls `sky exec` but remote is outdated for
3894
+ # running a job. Necessitating calling `sky launch`.
3895
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3896
+ handle.cluster_name)
3897
+ output = stdout + stderr
3898
+ if _is_message_too_long(returncode, output=output):
3899
+ # If the job submit script is too long, we need to retry it
3900
+ # with dumping the script to a file and running it the script
3901
+ # on remote cluster instead.
3902
+ logger.debug(
3903
+ 'Failed to submit job due to command length limit. '
3904
+ 'Dumping job to file and running it with SSH. '
3905
+ f'Output: {output}')
3906
+ _dump_code_to_file(codegen)
3907
+ job_submit_cmd = f'{mkdir_code} && {code}'
3908
+ job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3909
+ returncode, stdout, stderr = self.run_on_head(
3910
+ handle,
3911
+ job_submit_cmd,
3912
+ stream_logs=False,
3913
+ require_outputs=True)
3890
3914
 
3891
- subprocess_utils.handle_returncode(returncode,
3892
- job_submit_cmd,
3893
- f'Failed to submit job {job_id}.',
3894
- stderr=stdout + stderr)
3915
+ subprocess_utils.handle_returncode(
3916
+ returncode,
3917
+ job_submit_cmd,
3918
+ f'Failed to submit job {job_id}.',
3919
+ stderr=stdout + stderr)
3895
3920
 
3896
3921
  controller = controller_utils.Controllers.from_name(handle.cluster_name)
3897
3922
  if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
@@ -3900,61 +3925,74 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3900
3925
  logger.info(
3901
3926
  ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
3902
3927
  rich_utils.stop_safe_status()
3903
- if not detach_run:
3904
- if (handle.cluster_name == controller_utils.Controllers.
3905
- JOBS_CONTROLLER.value.cluster_name):
3906
- self.tail_managed_job_logs(handle, job_id)
3907
- else:
3908
- # Sky logs. Not using subprocess.run since it will make the
3909
- # ssh keep connected after ctrl-c.
3910
- self.tail_logs(handle, job_id)
3911
3928
 
3912
3929
  def _add_job(self, handle: CloudVmRayResourceHandle,
3913
3930
  job_name: Optional[str], resources_str: str,
3914
3931
  metadata: str) -> Tuple[int, str]:
3915
- code = job_lib.JobLibCodeGen.add_job(
3916
- job_name=job_name,
3917
- username=common_utils.get_user_hash(),
3918
- run_timestamp=self.run_timestamp,
3919
- resources_str=resources_str,
3920
- metadata=metadata)
3921
- returncode, result_str, stderr = self.run_on_head(handle,
3922
- code,
3923
- stream_logs=False,
3924
- require_outputs=True,
3925
- separate_stderr=True)
3926
- # Happens when someone calls `sky exec` but remote is outdated for
3927
- # adding a job. Necessitating calling `sky launch`.
3928
- backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3929
- handle.cluster_name)
3930
- # TODO(zhwu): this sometimes will unexpectedly fail, we can add
3931
- # retry for this, after we figure out the reason.
3932
- subprocess_utils.handle_returncode(returncode, code,
3933
- 'Failed to fetch job id.', stderr)
3934
- try:
3935
- job_id_match = _JOB_ID_PATTERN.search(result_str)
3936
- if job_id_match is not None:
3937
- job_id = int(job_id_match.group(1))
3938
- else:
3939
- # For backward compatibility.
3940
- job_id = int(result_str)
3941
- log_dir_match = _LOG_DIR_PATTERN.search(result_str)
3942
- if log_dir_match is not None:
3943
- log_dir = log_dir_match.group(1).strip()
3944
- else:
3945
- # For backward compatibility, use the same log dir as local.
3946
- log_dir = self.log_dir
3947
- except ValueError as e:
3948
- logger.error(stderr)
3949
- raise ValueError(f'Failed to parse job id: {result_str}; '
3950
- f'Returncode: {returncode}') from e
3932
+ use_legacy = not handle.is_grpc_enabled_with_flag
3933
+
3934
+ if not use_legacy:
3935
+ try:
3936
+ request = jobsv1_pb2.AddJobRequest(
3937
+ job_name=job_name,
3938
+ username=common_utils.get_user_hash(),
3939
+ run_timestamp=self.run_timestamp,
3940
+ resources_str=resources_str,
3941
+ metadata=metadata)
3942
+ response = backend_utils.invoke_skylet_with_retries(
3943
+ lambda: SkyletClient(handle.get_grpc_channel()).add_job(
3944
+ request))
3945
+ job_id = response.job_id
3946
+ log_dir = response.log_dir
3947
+ return job_id, log_dir
3948
+ except exceptions.SkyletMethodNotImplementedError:
3949
+ use_legacy = True
3950
+
3951
+ if use_legacy:
3952
+ code = job_lib.JobLibCodeGen.add_job(
3953
+ job_name=job_name,
3954
+ username=common_utils.get_user_hash(),
3955
+ run_timestamp=self.run_timestamp,
3956
+ resources_str=resources_str,
3957
+ metadata=metadata)
3958
+ returncode, result_str, stderr = self.run_on_head(
3959
+ handle,
3960
+ code,
3961
+ stream_logs=False,
3962
+ require_outputs=True,
3963
+ separate_stderr=True)
3964
+ # Happens when someone calls `sky exec` but remote is outdated for
3965
+ # adding a job. Necessitating calling `sky launch`.
3966
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3967
+ handle.cluster_name)
3968
+ # TODO(zhwu): this sometimes will unexpectedly fail, we can add
3969
+ # retry for this, after we figure out the reason.
3970
+ subprocess_utils.handle_returncode(returncode, code,
3971
+ 'Failed to fetch job id.',
3972
+ stderr)
3973
+ try:
3974
+ job_id_match = _JOB_ID_PATTERN.search(result_str)
3975
+ if job_id_match is not None:
3976
+ job_id = int(job_id_match.group(1))
3977
+ else:
3978
+ # For backward compatibility.
3979
+ job_id = int(result_str)
3980
+ log_dir_match = _LOG_DIR_PATTERN.search(result_str)
3981
+ if log_dir_match is not None:
3982
+ log_dir = log_dir_match.group(1).strip()
3983
+ else:
3984
+ # For backward compatibility, use the same log dir as local.
3985
+ log_dir = self.log_dir
3986
+ except ValueError as e:
3987
+ logger.error(stderr)
3988
+ raise ValueError(f'Failed to parse job id: {result_str}; '
3989
+ f'Returncode: {returncode}') from e
3951
3990
  return job_id, log_dir
3952
3991
 
3953
3992
  def _execute(
3954
3993
  self,
3955
3994
  handle: CloudVmRayResourceHandle,
3956
3995
  task: task_lib.Task,
3957
- detach_run: bool,
3958
3996
  dryrun: bool = False,
3959
3997
  ) -> Optional[int]:
3960
3998
  """Executes the task on the cluster.
@@ -4006,12 +4044,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4006
4044
  num_actual_nodes = task.num_nodes * handle.num_ips_per_node
4007
4045
  # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
4008
4046
  if num_actual_nodes > 1:
4009
- self._execute_task_n_nodes(handle, task_copy, job_id, detach_run,
4010
- log_dir)
4047
+ self._execute_task_n_nodes(handle, task_copy, job_id, log_dir)
4011
4048
  else:
4012
4049
  # Case: task_lib.Task(run, num_nodes=1)
4013
- self._execute_task_one_node(handle, task_copy, job_id, detach_run,
4014
- log_dir)
4050
+ self._execute_task_one_node(handle, task_copy, job_id, log_dir)
4015
4051
 
4016
4052
  return job_id
4017
4053
 
@@ -4054,7 +4090,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4054
4090
  is_identity_mismatch_and_purge = False
4055
4091
  try:
4056
4092
  backend_utils.check_owner_identity(cluster_name)
4057
- except exceptions.ClusterOwnerIdentityMismatchError as e:
4093
+ except (exceptions.ClusterOwnerIdentityMismatchError,
4094
+ exceptions.CloudUserIdentityError) as e:
4058
4095
  if purge:
4059
4096
  logger.error(e)
4060
4097
  verbed = 'terminated' if terminate else 'stopped'
@@ -4068,15 +4105,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4068
4105
  else:
4069
4106
  raise
4070
4107
  lock_id = backend_utils.cluster_status_lock_id(cluster_name)
4071
- lock = locks.get_lock(lock_id)
4108
+ lock = locks.get_lock(lock_id, timeout=1)
4072
4109
  # Retry in case new cluster operation comes in and holds the lock
4073
4110
  # right after the lock is removed.
4074
4111
  n_attempts = 2
4075
4112
  while True:
4076
4113
  n_attempts -= 1
4077
- # In case other running cluster operations are still holding the
4078
- # lock.
4079
- lock.force_unlock()
4080
4114
  # We have to kill the cluster requests, because `down` and `stop`
4081
4115
  # should be higher priority than the cluster requests, and we should
4082
4116
  # release the lock from other requests.
@@ -4094,6 +4128,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4094
4128
  'Failed to kill other launch requests for the '
4095
4129
  f'cluster {handle.cluster_name}: '
4096
4130
  f'{common_utils.format_exception(e, use_bracket=True)}')
4131
+ # In case other running cluster operations are still holding the
4132
+ # lock.
4133
+ lock.force_unlock()
4097
4134
  try:
4098
4135
  with lock:
4099
4136
  self.teardown_no_lock(
@@ -4126,6 +4163,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4126
4163
  job_ids: Optional[List[int]] = None,
4127
4164
  stream_logs: bool = True
4128
4165
  ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]:
4166
+ if handle.is_grpc_enabled_with_flag:
4167
+ try:
4168
+ request = jobsv1_pb2.GetJobStatusRequest(job_ids=job_ids)
4169
+ response = backend_utils.invoke_skylet_with_retries(
4170
+ lambda: SkyletClient(handle.get_grpc_channel()
4171
+ ).get_job_status(request))
4172
+ statuses: Dict[Optional[int], Optional[job_lib.JobStatus]] = {
4173
+ job_id: job_lib.JobStatus.from_protobuf(proto_status)
4174
+ for job_id, proto_status in response.job_statuses.items()
4175
+ }
4176
+ return statuses
4177
+ except exceptions.SkyletMethodNotImplementedError:
4178
+ pass
4179
+
4129
4180
  code = job_lib.JobLibCodeGen.get_job_status(job_ids)
4130
4181
  returncode, stdout, stderr = self.run_on_head(handle,
4131
4182
  code,
@@ -4146,16 +4197,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4146
4197
 
4147
4198
  See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
4148
4199
  """
4149
- code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all, user_hash)
4150
- returncode, stdout, _ = self.run_on_head(handle,
4151
- code,
4152
- stream_logs=False,
4153
- require_outputs=True)
4154
- subprocess_utils.handle_returncode(
4155
- returncode, code,
4156
- f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout)
4157
-
4158
- cancelled_ids = message_utils.decode_payload(stdout)
4200
+ use_legacy = not handle.is_grpc_enabled_with_flag
4201
+
4202
+ if not use_legacy:
4203
+ try:
4204
+ request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
4205
+ cancel_all=cancel_all,
4206
+ user_hash=user_hash)
4207
+ response = backend_utils.invoke_skylet_with_retries(
4208
+ lambda: SkyletClient(handle.get_grpc_channel()).cancel_jobs(
4209
+ request))
4210
+ cancelled_ids = response.cancelled_job_ids
4211
+ except exceptions.SkyletMethodNotImplementedError:
4212
+ use_legacy = True
4213
+
4214
+ if use_legacy:
4215
+ code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all,
4216
+ user_hash)
4217
+ returncode, stdout, _ = self.run_on_head(handle,
4218
+ code,
4219
+ stream_logs=False,
4220
+ require_outputs=True)
4221
+ subprocess_utils.handle_returncode(
4222
+ returncode, code,
4223
+ f'Failed to cancel jobs on cluster {handle.cluster_name}.',
4224
+ stdout)
4225
+ cancelled_ids = message_utils.decode_payload(stdout)
4159
4226
  if cancelled_ids:
4160
4227
  logger.info(
4161
4228
  f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
@@ -4172,20 +4239,48 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-        code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
-        returncode, job_to_dir, stderr = self.run_on_head(handle,
+        job_to_dir: Dict[str, str] = {}
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if not use_legacy:
+            try:
+                int_job_ids = []
+                if job_ids:
+                    for str_job_id in job_ids:
+                        if str_job_id.isdigit():
+                            int_job_ids.append(int(str_job_id))
+                request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                    job_ids=int_job_ids)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).get_log_dirs_for_jobs(request))
+                job_log_dirs = response.job_log_dirs
+                if not job_log_dirs:
+                    logger.info(f'{colorama.Fore.YELLOW}'
+                                'No matching log directories found'
+                                f'{colorama.Style.RESET_ALL}')
+                    return {}
+                for job_id, log_dir in job_log_dirs.items():
+                    # Convert to string for backwards compatibility
+                    job_to_dir[str(job_id)] = log_dir
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
+            returncode, stdout, stderr = self.run_on_head(handle,
                                                           code,
                                                           stream_logs=False,
                                                           require_outputs=True,
                                                           separate_stderr=True)
-        subprocess_utils.handle_returncode(returncode, code,
-                                           'Failed to sync logs.', stderr)
-        job_to_dir: Dict[str, str] = message_utils.decode_payload(job_to_dir)
-        if not job_to_dir:
-            logger.info(f'{colorama.Fore.YELLOW}'
-                        'No matching log directories found'
-                        f'{colorama.Style.RESET_ALL}')
-            return {}
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to sync logs.', stderr)
+            job_to_dir = message_utils.decode_payload(stdout)
+            if not job_to_dir:
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            'No matching log directories found'
+                            f'{colorama.Style.RESET_ALL}')
+                return {}
 
         job_ids = list(job_to_dir.keys())
         dirs = list(job_to_dir.values())
@@ -4195,9 +4290,23 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             (dir if constants.SKY_LOGS_DIRECTORY in dir else os.path.join(
                 constants.SKY_LOGS_DIRECTORY, dir)) for dir in dirs
         ]
-        local_log_dirs = [(dir.replace(constants.SKY_LOGS_DIRECTORY, local_dir)
-                           if constants.SKY_LOGS_DIRECTORY in dir else
-                           os.path.join(local_dir, dir)) for dir in dirs]
+        # Include cluster name in local log directory path to avoid conflicts
+        # when the same job_id exists on different clusters
+        cluster_name = handle.cluster_name
+        local_log_dirs = []
+        for remote_log_dir in dirs:
+            if constants.SKY_LOGS_DIRECTORY in remote_log_dir:
+                # Extract the job-specific directory name from the full path
+                # e.g., ~/sky_logs/1-job_name -> 1-job_name
+                job_dir = remote_log_dir.replace(constants.SKY_LOGS_DIRECTORY,
+                                                 '').lstrip('/')
+                local_log_dir = os.path.join(local_dir, cluster_name, job_dir)
+            else:
+                # remote_log_dir is already just the job directory name (e.g.,
+                # "1-job_name")
+                local_log_dir = os.path.join(local_dir, cluster_name,
+                                             remote_log_dir)
+            local_log_dirs.append(local_log_dir)
 
         runners = handle.get_command_runners()
 
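The rewritten mapping above prefixes the local path with the cluster name so that two clusters with the same job id no longer collide when logs are synced down. A small standalone illustration of that mapping, with `~/sky_logs` hard-coded as an assumed remote logs root rather than the real `constants.SKY_LOGS_DIRECTORY`:

```python
import os

SKY_LOGS_DIRECTORY = '~/sky_logs'  # assumed remote logs root


def to_local_log_dir(remote_log_dir: str, local_dir: str,
                     cluster_name: str) -> str:
    """Map a remote log dir to <local_dir>/<cluster_name>/<job_dir>."""
    if SKY_LOGS_DIRECTORY in remote_log_dir:
        job_dir = remote_log_dir.replace(SKY_LOGS_DIRECTORY, '').lstrip('/')
    else:
        job_dir = remote_log_dir
    return os.path.join(local_dir, cluster_name, job_dir)


print(to_local_log_dir('~/sky_logs/1-train', '/tmp/logs', 'my-cluster'))
# /tmp/logs/my-cluster/1-train
print(to_local_log_dir('1-train', '/tmp/logs', 'other-cluster'))
# /tmp/logs/other-cluster/1-train
```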
@@ -4261,6 +4370,28 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             The exit code of the tail command. Returns code 100 if the job has
             failed. See exceptions.JobExitCode for possible return codes.
         """
+        if handle.is_grpc_enabled_with_flag:
+            last_exit_code = 0
+            try:
+                request = jobsv1_pb2.TailLogsRequest(
+                    job_id=job_id,
+                    managed_job_id=managed_job_id,
+                    follow=follow,
+                    tail=tail)
+                for resp in backend_utils.invoke_skylet_streaming_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel()
+                                            ).tail_logs(request, timeout=None)):
+                    if resp.log_line:
+                        print(resp.log_line, end='', flush=True)
+                    last_exit_code = resp.exit_code
+                return last_exit_code
+            except exceptions.SkyletMethodNotImplementedError:
+                pass
+            except grpc.RpcError as e:
+                if e.code() == grpc.StatusCode.CANCELLED:
+                    return last_exit_code
+                raise e
+
         code = job_lib.JobLibCodeGen.tail_logs(job_id,
                                                managed_job_id=managed_job_id,
                                                follow=follow,
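The new tail path consumes a server-streaming RPC: each response may carry a log line plus the latest exit code, and a client-side cancellation (e.g., Ctrl-C while following) should return the last exit code seen rather than propagate an error. The generator below stands in for the gRPC stream; `LogChunk` and `StreamCancelled` are hypothetical names, not the real protobuf or grpc types.

```python
from dataclasses import dataclass
from typing import Iterable, Iterator


@dataclass
class LogChunk:
    log_line: str
    exit_code: int


class StreamCancelled(Exception):
    """Stands in for grpc.RpcError with StatusCode.CANCELLED."""


def tail_stream(chunks: Iterable[LogChunk]) -> int:
    """Print streamed log lines; return the last exit code observed."""
    last_exit_code = 0
    try:
        for chunk in chunks:
            if chunk.log_line:
                print(chunk.log_line, end='', flush=True)
            last_exit_code = chunk.exit_code
    except StreamCancelled:
        # Treat client-side cancellation as a normal end of the tail.
        pass
    return last_exit_code


def fake_stream() -> Iterator[LogChunk]:
    yield LogChunk('hello\n', 0)
    yield LogChunk('done\n', 0)


print(tail_stream(fake_stream()))  # 0
```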
@@ -4298,6 +4429,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                   tail: Optional[int] = None) -> int:
         # if job_name is not None, job_id should be None
         assert job_name is None or job_id is None, (job_name, job_id)
+        # TODO(kevin): Migrate stream_logs to gRPC
         code = managed_jobs.ManagedJobCodeGen.stream_logs(
             job_name, job_id, follow, controller, tail)
 
@@ -4343,20 +4475,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         assert job_name is None or job_id is None, (job_name, job_id)
 
         if job_id is None:
-            # generate code to get the job_id
+            # get the job_id
             # if job_name is None, get all job_ids
             # TODO: Only get the latest job_id, since that's the only one we use
-            code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
-                job_name=job_name)
-            returncode, job_ids, stderr = self.run_on_head(handle,
-                                                           code,
-                                                           stream_logs=False,
-                                                           require_outputs=True,
-                                                           separate_stderr=True)
-            subprocess_utils.handle_returncode(returncode, code,
-                                               'Failed to sync down logs.',
-                                               stderr)
-            job_ids = message_utils.decode_payload(job_ids)
+
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            logger.info(f'handle.is_grpc_enabled_with_flag: '
+                        f'{handle.is_grpc_enabled_with_flag}')
+            if not use_legacy:
+                try:
+                    request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
+                        job_name=job_name)
+                    response = backend_utils.invoke_skylet_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel(
+                        )).get_all_managed_job_ids_by_name(request))
+                    job_ids = list(response.job_ids)
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                    job_name=job_name)
+                returncode, job_ids_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync down logs.',
+                                                   stderr)
+                job_ids = message_utils.decode_payload(job_ids_payload)
             if not job_ids:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching job found'
@@ -4384,18 +4533,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         else:
             # get the run_timestamp
             # the function takes in [job_id]
-            code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs([str(job_id)])
-            returncode, run_timestamps_payload, stderr = self.run_on_head(
-                handle,
-                code,
-                stream_logs=False,
-                require_outputs=True,
-                separate_stderr=True)
-            subprocess_utils.handle_returncode(returncode, code,
-                                               'Failed to sync logs.', stderr)
-            # returns with a dict of {job_id: run_timestamp}
-            run_timestamps = message_utils.decode_payload(
-                run_timestamps_payload)
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            if not use_legacy:
+                try:
+                    log_dirs_request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                        job_ids=[job_id])
+                    log_dirs_response = (
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel(
+                            )).get_log_dirs_for_jobs(log_dirs_request)))
+                    job_log_dirs = log_dirs_response.job_log_dirs
+                    # Convert back to the expected format
+                    # {job_id: run_timestamp}
+                    run_timestamps = {}
+                    for jid, log_dir in job_log_dirs.items():
+                        run_timestamps[int(jid)] = log_dir
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(
+                    [str(job_id)])
+                returncode, run_timestamps_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync logs.',
+                                                   stderr)
+                # returns with a dict of {job_id: run_timestamp}
+                run_timestamps = message_utils.decode_payload(
+                    run_timestamps_payload)
             if not run_timestamps:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching log directories found'
@@ -4462,11 +4632,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                     exist_ok=True)
             log_file = os.path.join(local_log_dir, 'run.log')
 
-            code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
-                                                              job_id=job_id,
-                                                              follow=False,
-                                                              controller=False)
-
+            # TODO(kevin): Migrate stream_logs to gRPC
+            code = managed_jobs.ManagedJobCodeGen.stream_logs(
+                job_name=None,
+                job_id=int(job_id),
+                follow=False,
+                controller=False)
             # With the stdin=subprocess.DEVNULL, the ctrl-c will not
             # kill the process, so we need to handle it manually here.
             if threading.current_thread() is threading.main_thread():
@@ -4507,6 +4678,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Raises:
             RuntimeError: If the cluster fails to be terminated/stopped.
         """
+        try:
+            handle.close_skylet_ssh_tunnel()
+        except Exception as e:  # pylint: disable=broad-except
+            # Not critical to the cluster teardown, just log a warning.
+            logger.warning(
+                'Failed to close Skylet SSH tunnel for cluster '
+                f'{handle.cluster_name}: '
+                f'{common_utils.format_exception(e, use_bracket=True)}')
+
         exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
         # We have to kill the cluster requests again within the lock, because
         # any pending requests on the same cluster should be cancelled after
@@ -4543,7 +4723,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     # observed in AWS. See also
                     # _LAUNCH_DOUBLE_CHECK_WINDOW in backend_utils.py.
                     force_refresh_statuses={status_lib.ClusterStatus.INIT},
-                    acquire_per_cluster_status_lock=False))
+                    cluster_lock_already_held=True,
+                    retry_if_missing=False))
             cluster_status_fetched = True
         except exceptions.ClusterStatusFetchingError:
             logger.warning(
@@ -4551,10 +4732,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 f'{handle.cluster_name!r}. Assuming the cluster is still '
                 'up.')
         if not cluster_status_fetched:
-            record = global_user_state.get_cluster_from_name(
+            status = global_user_state.get_status_from_cluster_name(
                 handle.cluster_name)
-            prev_cluster_status = record[
-                'status'] if record is not None else None
+            prev_cluster_status = status if status is not None else None
         if prev_cluster_status is None:
             # When the cluster is not in the cluster table, we guarantee that
             # all related resources / cache / config are cleaned up, i.e. it
@@ -4786,7 +4966,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                             config['provider'])
                 ports_cleaned_up = True
             except exceptions.NotSupportedError:
-                pass
+                ports_cleaned_up = True
             except exceptions.PortDoesNotExistError:
                 logger.debug('Ports do not exist. Skipping cleanup.')
             except Exception as e:  # pylint: disable=broad-except
@@ -4811,7 +4991,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                             failover)
                 custom_multi_network_cleaned_up = True
             except exceptions.NotSupportedError:
-                pass
+                custom_multi_network_cleaned_up = True
             except Exception as e:  # pylint: disable=broad-except
                 if purge:
                     msg = common_utils.format_exception(e, use_bracket=True)
@@ -4913,7 +5093,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         cluster_yaml_path = handle.cluster_yaml
         handle.cluster_yaml = None
         global_user_state.update_cluster_handle(handle.cluster_name, handle)
-        global_user_state.remove_cluster_yaml(handle.cluster_name)
+        # Removing the cluster YAML can cause some unexpected stability issues.
+        # See #5011.
+        # global_user_state.remove_cluster_yaml(handle.cluster_name)
         common_utils.remove_file_if_exists(cluster_yaml_path)
 
     def set_autostop(self,
@@ -4974,9 +5156,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
                 down=down,
             )
-            backend_utils.invoke_skylet_with_retries(
-                handle, lambda: SkyletClient(handle.get_grpc_channel()).
-                set_autostop(request))
+            backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+                handle.get_grpc_channel()).set_autostop(request))
         else:
             code = autostop_lib.AutostopCodeGen.set_autostop(
                 idle_minutes_to_autostop, self.NAME, wait_for, down)
@@ -5015,8 +5196,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         try:
             request = autostopv1_pb2.IsAutostoppingRequest()
             response = backend_utils.invoke_skylet_with_retries(
-                handle, lambda: SkyletClient(handle.get_grpc_channel()).
-                is_autostopping(request))
+                lambda: SkyletClient(handle.get_grpc_channel()
+                                    ).is_autostopping(request))
             return response.is_autostopping
         except Exception as e:  # pylint: disable=broad-except
             # The cluster may have been terminated, causing the gRPC call
@@ -5128,7 +5309,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             exceptions.InvalidClusterNameError: If the cluster name is invalid.
             # TODO(zhwu): complete the list of exceptions.
         """
-        record = global_user_state.get_cluster_from_name(cluster_name)
+        record = global_user_state.get_cluster_from_name(
+            cluster_name, include_user_info=False, summary_response=True)
         if record is None:
             handle_before_refresh = None
             status_before_refresh = None
@@ -5148,7 +5330,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             record = backend_utils.refresh_cluster_record(
                 cluster_name,
                 force_refresh_statuses={status_lib.ClusterStatus.INIT},
-                acquire_per_cluster_status_lock=False,
+                cluster_lock_already_held=True,
+                include_user_info=False,
+                summary_response=True,
             )
             if record is not None:
                 prev_cluster_status = record['status']
@@ -5264,33 +5448,41 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         common_utils.check_cluster_name_is_valid(cluster_name)
 
         if to_provision is None:
-            # The cluster is recently terminated either by autostop or manually
-            # terminated on the cloud. We should use the previously terminated
-            # resources to provision the cluster.
-            #
-            # FIXME(zongheng): this assert can be hit by using two terminals.
-            # First, create a 'dbg' cluster. Then:
-            #   Terminal 1: sky down dbg -y
-            #   Terminal 2: sky launch -c dbg -- echo
-            # Run it in order. Terminal 2 will show this error after terminal 1
-            # succeeds in downing the cluster and releasing the lock.
-            assert isinstance(
-                handle_before_refresh, CloudVmRayResourceHandle), (
-                    f'Trying to launch cluster {cluster_name!r} recently '
-                    'terminated on the cloud, but the handle is not a '
-                    f'CloudVmRayResourceHandle ({handle_before_refresh}).')
-            status_before_refresh_str = None
-            if status_before_refresh is not None:
-                status_before_refresh_str = status_before_refresh.value
-
-            logger.info(
-                f'The cluster {cluster_name!r} (status: '
-                f'{status_before_refresh_str}) was not found on the cloud: it '
-                'may be autodowned, manually terminated, or its launch never '
-                'succeeded. Provisioning a new cluster by using the same '
-                'resources as its original launch.')
-            to_provision = handle_before_refresh.launched_resources
-            self.check_resources_fit_cluster(handle_before_refresh, task)
+            # Recently terminated after refresh. OPTIMIZE usually ran outside
+            # the lock, so that decision may be stale by now. Under the lock,
+            # ensure we always have a concrete plan via the following order:
+            #   1) Reuse last placement snapshot (if available);
+            #   2) Else, call injected planner for a fresh plan.
+            # If we still have a pre-refresh handle snapshot with a concrete
+            # placement, prefer reusing it.
+            if (isinstance(handle_before_refresh, CloudVmRayResourceHandle) and
+                    handle_before_refresh.launched_resources is not None):
+                to_provision = handle_before_refresh.launched_resources
+                # Ensure the requested task fits the previous placement.
+                self.check_resources_fit_cluster(handle_before_refresh, task)
+                # Mirror the original message for reuse path.
+                status_before_refresh_str = None
+                if status_before_refresh is not None:
+                    status_before_refresh_str = status_before_refresh.value
+                logger.info(
+                    f'The cluster {cluster_name!r} (status: '
+                    f'{status_before_refresh_str}) was not found on the cloud: '
+                    'it may be autodowned, manually terminated, or its launch '
+                    'never succeeded. Provisioning a new cluster by using the '
+                    'same resources as its original launch.')
+            elif self._planner is not None:
+                to_provision = self._planner(task)
+                logger.info(
+                    'Previous placement snapshot missing; computing a fresh '
+                    'plan for provisioning.')
+            else:
+                # Without a snapshot or planner, we cannot proceed safely.
+                # Surface a user-friendly error without a long traceback.
+                with ux_utils.print_exception_no_traceback():
+                    raise RuntimeError(
+                        'No concrete launch plan available after recent cloud '
+                        f'termination of cluster {cluster_name!r}. Ensure the '
+                        'OPTIMIZE stage runs or provide concrete resources.')
 
         return RetryingVmProvisioner.ToProvisionConfig(
             cluster_name,
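The replacement above turns a hard assert into an ordered fallback: reuse the pre-refresh placement if the handle still carries one, otherwise ask an injected planner for a fresh plan, otherwise fail with a clear error. A compact sketch of that decision order; the string "resources", the `planner` callable, and the flattened arguments are simplified stand-ins for the real handle and task objects.

```python
from typing import Callable, Optional


def choose_launch_plan(snapshot_resources: Optional[str],
                       planner: Optional[Callable[[], str]]) -> str:
    """Pick a concrete plan: snapshot first, then planner, else error."""
    if snapshot_resources is not None:
        # Reuse the placement recorded before the refresh.
        return snapshot_resources
    if planner is not None:
        # No snapshot: compute a fresh plan under the lock.
        return planner()
    raise RuntimeError('No concrete launch plan available; run OPTIMIZE or '
                       'provide concrete resources.')


print(choose_launch_plan('2x A100 on gcp', None))      # reuse snapshot
print(choose_launch_plan(None, lambda: 'fresh plan'))  # fall back to planner
```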
@@ -5639,7 +5831,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
                            handle: CloudVmRayResourceHandle) -> Dict[str, str]:
         """Returns the environment variables for the task."""
-        env_vars = task.envs_and_secrets
+        env_vars = task_lib.get_plaintext_envs_and_secrets(
+            task.envs_and_secrets)
         # If it is a managed job, the TASK_ID_ENV_VAR will have been already set
         # by the controller.
         if constants.TASK_ID_ENV_VAR not in env_vars:
@@ -5651,9 +5844,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         env_vars.update(self._skypilot_predefined_env_vars(handle))
         return env_vars
 
+    def _get_managed_job_user_id(self, task: task_lib.Task) -> Optional[str]:
+        """Returns the user id for the managed job."""
+        if task.managed_job_dag is not None:
+            return task.envs[constants.USER_ID_ENV_VAR]
+        return None
+
     def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
                                task: task_lib.Task, job_id: int,
-                               detach_run: bool, remote_log_dir: str) -> None:
+                               remote_log_dir: str) -> None:
         # Launch the command as a Ray task.
         log_dir = os.path.join(remote_log_dir, 'tasks')
 
@@ -5663,9 +5862,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         task_env_vars = self._get_task_env_vars(task, job_id, handle)
 
-        codegen = RayCodeGen()
+        codegen = task_codegen.RayCodeGen()
         codegen.add_prologue(job_id)
-        codegen.add_gang_scheduling_placement_group_and_setup(
+        codegen.add_setup(
             1,
             resources_dict,
             stable_cluster_internal_ips=internal_ips,
@@ -5674,31 +5873,27 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             setup_log_path=os.path.join(log_dir, 'setup.log'),
         )
 
-        if callable(task.run):
-            run_fn_code = textwrap.dedent(inspect.getsource(task.run))
-            run_fn_name = task.run.__name__
-            codegen.register_run_fn(run_fn_code, run_fn_name)
-
-        command_for_node = task.run if isinstance(task.run, str) else None
-        codegen.add_ray_task(
-            bash_script=command_for_node,
+        codegen.add_task(
+            1,
+            bash_script=task.run,
             env_vars=task_env_vars,
             task_name=task.name,
-            ray_resources_dict=backend_utils.get_task_demands_dict(task),
+            resources_dict=backend_utils.get_task_demands_dict(task),
             log_dir=log_dir)
 
         codegen.add_epilogue()
 
-        self._exec_code_on_head(handle,
-                                codegen.build(),
-                                job_id,
-                                detach_run=detach_run,
-                                managed_job_dag=task.managed_job_dag,
-                                remote_log_dir=remote_log_dir)
+        self._exec_code_on_head(
+            handle,
+            codegen.build(),
+            job_id,
+            managed_job_dag=task.managed_job_dag,
+            managed_job_user_id=self._get_managed_job_user_id(task),
+            remote_log_dir=remote_log_dir)
 
     def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
                               task: task_lib.Task, job_id: int,
-                              detach_run: bool, remote_log_dir: str) -> None:
+                              remote_log_dir: str) -> None:
         # Strategy:
         #   ray.init(...)
         #   for node:
@@ -5712,9 +5907,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         num_actual_nodes = task.num_nodes * handle.num_ips_per_node
         task_env_vars = self._get_task_env_vars(task, job_id, handle)
 
-        codegen = RayCodeGen()
+        codegen = task_codegen.RayCodeGen()
         codegen.add_prologue(job_id)
-        codegen.add_gang_scheduling_placement_group_and_setup(
+        codegen.add_setup(
             num_actual_nodes,
             resources_dict,
             stable_cluster_internal_ips=internal_ips,
@@ -5723,31 +5918,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             setup_log_path=os.path.join(log_dir, 'setup.log'),
         )
 
-        if callable(task.run):
-            run_fn_code = textwrap.dedent(inspect.getsource(task.run))
-            run_fn_name = task.run.__name__
-            codegen.register_run_fn(run_fn_code, run_fn_name)
-
-        # TODO(zhwu): The resources limitation for multi-node ray.tune and
-        # horovod should be considered.
-        for i in range(num_actual_nodes):
-            command_for_node = task.run if isinstance(task.run, str) else None
-
-            # Ray's per-node resources, to constrain scheduling each command to
-            # the corresponding node, represented by private IPs.
-            codegen.add_ray_task(
-                bash_script=command_for_node,
-                env_vars=task_env_vars,
-                task_name=task.name,
-                ray_resources_dict=backend_utils.get_task_demands_dict(task),
-                log_dir=log_dir,
-                gang_scheduling_id=i)
+        codegen.add_task(
+            num_actual_nodes,
+            bash_script=task.run,
+            env_vars=task_env_vars,
+            task_name=task.name,
+            resources_dict=backend_utils.get_task_demands_dict(task),
+            log_dir=log_dir)
 
         codegen.add_epilogue()
         # TODO(zhanghao): Add help info for downloading logs.
-        self._exec_code_on_head(handle,
-                                codegen.build(),
-                                job_id,
-                                detach_run=detach_run,
-                                managed_job_dag=task.managed_job_dag,
-                                remote_log_dir=remote_log_dir)
+        self._exec_code_on_head(
+            handle,
+            codegen.build(),
+            job_id,
+            managed_job_dag=task.managed_job_dag,
+            managed_job_user_id=self._get_managed_job_user_id(task),
+            remote_log_dir=remote_log_dir)
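Both execution paths now build the job script through a single `add_setup` / `add_task` pair instead of registering run functions and emitting one Ray task per node. The toy builder below only illustrates that consolidated call shape; the method names echo the diff, but the class, bodies, and arguments are invented for illustration and do not reflect the real `task_codegen.RayCodeGen` implementation.

```python
from typing import Dict, List, Optional


class ToyCodeGen:
    """Minimal stand-in for a prologue/setup/task/epilogue code generator."""

    def __init__(self) -> None:
        self._sections: List[str] = []

    def add_prologue(self, job_id: int) -> None:
        self._sections.append(f'# prologue for job {job_id}')

    def add_setup(self, num_nodes: int, setup_cmd: Optional[str]) -> None:
        if setup_cmd:
            self._sections.append(
                f'# setup on {num_nodes} node(s): {setup_cmd}')

    def add_task(self, num_nodes: int, bash_script: str,
                 env_vars: Dict[str, str]) -> None:
        exports = ' '.join(f'{k}={v}' for k, v in env_vars.items())
        self._sections.append(
            f'# run on {num_nodes} node(s): {exports} {bash_script}')

    def add_epilogue(self) -> None:
        self._sections.append('# epilogue')

    def build(self) -> str:
        return '\n'.join(self._sections)


codegen = ToyCodeGen()
codegen.add_prologue(job_id=1)
codegen.add_setup(2, 'pip install -r requirements.txt')
codegen.add_task(2, 'python train.py', {'RANK': '0'})
codegen.add_epilogue()
print(codegen.build())
```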