skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/task.py CHANGED
@@ -1,6 +1,5 @@
1
1
  """Task: a coarse-grained stage in an application."""
2
2
  import collections
3
- import inspect
4
3
  import json
5
4
  import os
6
5
  import re
@@ -8,6 +7,7 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
8
7
  Union)
9
8
 
10
9
  import colorama
10
+ from pydantic import SecretStr
11
11
 
12
12
  from sky import clouds
13
13
  from sky import dag as dag_lib
@@ -20,6 +20,7 @@ from sky.provision import docker_utils
20
20
  from sky.serve import service_spec
21
21
  from sky.skylet import constants
22
22
  from sky.utils import common_utils
23
+ from sky.utils import git
23
24
  from sky.utils import registry
24
25
  from sky.utils import schemas
25
26
  from sky.utils import ux_utils
@@ -28,10 +29,6 @@ from sky.utils import yaml_utils
28
29
 
29
30
  logger = sky_logging.init_logger(__name__)
30
31
 
31
- # A lambda generating commands (node rank_i, node addrs -> cmd_i).
32
- CommandGen = Callable[[int, List[str]], Optional[str]]
33
- CommandOrCommandGen = Union[str, CommandGen]
34
-
35
32
  _VALID_NAME_REGEX = '[a-zA-Z0-9]+(?:[._-]{1,2}[a-zA-Z0-9]+)*'
36
33
  _VALID_NAME_DESCR = ('ASCII characters and may contain lowercase and'
37
34
  ' uppercase letters, digits, underscores, periods,'
@@ -116,7 +113,7 @@ def _fill_in_env_vars(
116
113
 
117
114
 
118
115
  def _check_docker_login_config(task_envs: Dict[str, str],
119
- task_secrets: Dict[str, str]) -> bool:
116
+ task_secrets: Dict[str, SecretStr]) -> bool:
120
117
  """Validates a valid docker login config in task_envs and task_secrets.
121
118
 
122
119
  Docker login variables must be specified together either in envs OR secrets,
@@ -177,12 +174,13 @@ def _with_docker_login_config(
177
174
  resources: Union[Set['resources_lib.Resources'],
178
175
  List['resources_lib.Resources']],
179
176
  task_envs: Dict[str, str],
180
- task_secrets: Dict[str, str],
177
+ task_secrets: Dict[str, SecretStr],
181
178
  ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
182
179
  if not _check_docker_login_config(task_envs, task_secrets):
183
180
  return resources
184
181
  envs = task_envs.copy()
185
- envs.update(task_secrets)
182
+ for key, value in task_secrets.items():
183
+ envs[key] = value.get_secret_value()
186
184
  docker_login_config = docker_utils.DockerLoginConfig.from_env_vars(envs)
187
185
 
188
186
  def _add_docker_login_config(resources: 'resources_lib.Resources'):
@@ -211,10 +209,11 @@ def _with_docker_username_for_runpod(
211
209
  resources: Union[Set['resources_lib.Resources'],
212
210
  List['resources_lib.Resources']],
213
211
  task_envs: Dict[str, str],
214
- task_secrets: Dict[str, str],
212
+ task_secrets: Dict[str, SecretStr],
215
213
  ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
216
214
  envs = task_envs.copy()
217
- envs.update(task_secrets)
215
+ for key, value in task_secrets.items():
216
+ envs[key] = value.get_secret_value()
218
217
  docker_username_for_runpod = envs.get(
219
218
  constants.RUNPOD_DOCKER_USERNAME_ENV_VAR)
220
219
 
@@ -227,6 +226,18 @@ def _with_docker_username_for_runpod(
227
226
  for r in resources))
228
227
 
229
228
 
229
+ def get_plaintext_envs_and_secrets(
230
+ envs_and_secrets: Dict[str, Union[str, SecretStr]],) -> Dict[str, str]:
231
+ return {
232
+ k: v.get_secret_value() if isinstance(v, SecretStr) else v
233
+ for k, v in envs_and_secrets.items()
234
+ }
235
+
236
+
237
+ def get_plaintext_secrets(secrets: Dict[str, SecretStr]) -> Dict[str, str]:
238
+ return {k: v.get_secret_value() for k, v in secrets.items()}
239
+
240
+
230
241
  class Task:
231
242
  """Task: a computation to be run on the cloud."""
232
243
 
@@ -235,14 +246,14 @@ class Task:
235
246
  name: Optional[str] = None,
236
247
  *,
237
248
  setup: Optional[Union[str, List[str]]] = None,
238
- run: Optional[Union[CommandOrCommandGen, List[str]]] = None,
249
+ run: Optional[Union[str, List[str]]] = None,
239
250
  envs: Optional[Dict[str, str]] = None,
240
251
  secrets: Optional[Dict[str, str]] = None,
241
252
  workdir: Optional[Union[str, Dict[str, Any]]] = None,
242
253
  num_nodes: Optional[int] = None,
243
254
  file_mounts: Optional[Dict[str, str]] = None,
244
255
  storage_mounts: Optional[Dict[str, storage_lib.Storage]] = None,
245
- volumes: Optional[Dict[str, str]] = None,
256
+ volumes: Optional[Dict[str, Union[str, Dict[str, Any]]]] = None,
246
257
  resources: Optional[Union['resources_lib.Resources',
247
258
  List['resources_lib.Resources'],
248
259
  Set['resources_lib.Resources']]] = None,
@@ -321,7 +332,10 @@ class Task:
321
332
  object}``, where mount_path is the path inside the remote VM(s)
322
333
  where the Storage object will be mounted on.
323
334
  volumes: A dict of volumes to be mounted for the task. The dict has
324
- the form of ``{mount_path: volume_name}``.
335
+ the form of ``{mount_path: volume_name}`` for external persistent
336
+ volumes, or ``{mount_path: volume_config}`` for ephemeral volumes
337
+ where volume_config is a dict with 'size', and optional type,
338
+ labels, and 'config' fields, etc.
325
339
  resources: either a sky.Resources, a set of them, or a list of them.
326
340
  A set or a list of resources asks the optimizer to "pick the
327
341
  best of these resources" to run this task.
@@ -344,11 +358,13 @@ class Task:
344
358
  self.storage_plans: Dict[storage_lib.Storage,
345
359
  storage_lib.StoreType] = {}
346
360
  self._envs = envs or {}
347
- self._secrets = secrets or {}
361
+ self._secrets = {}
362
+ if secrets is not None:
363
+ self._secrets = {k: SecretStr(v) for k, v in secrets.items()}
348
364
  self._volumes = volumes or {}
349
365
 
350
366
  # concatenate commands if given as list
351
- def _concat(commands):
367
+ def _concat(commands: Optional[Union[str, List[str]]]) -> Optional[str]:
352
368
  if isinstance(commands, list):
353
369
  return '\n'.join(commands)
354
370
  return commands
@@ -446,42 +462,9 @@ class Task:
446
462
 
447
463
  def validate_run(self):
448
464
  """Validates if the run command is valid."""
449
- if callable(self.run):
450
- run_sig = inspect.signature(self.run)
451
- # Check that run is a function with 2 arguments.
452
- if len(run_sig.parameters) != 2:
453
- with ux_utils.print_exception_no_traceback():
454
- raise ValueError(_RUN_FN_CHECK_FAIL_MSG.format(run_sig))
455
-
456
- type_list = [int, List[str]]
457
- # Check annotations, if exists
458
- for i, param in enumerate(run_sig.parameters.values()):
459
- if param.annotation != inspect.Parameter.empty:
460
- if param.annotation != type_list[i]:
461
- with ux_utils.print_exception_no_traceback():
462
- raise ValueError(
463
- _RUN_FN_CHECK_FAIL_MSG.format(run_sig))
464
-
465
- # Check self containedness.
466
- run_closure = inspect.getclosurevars(self.run)
467
- if run_closure.nonlocals:
468
- with ux_utils.print_exception_no_traceback():
469
- raise ValueError(
470
- 'run command generator must be self contained. '
471
- f'Found nonlocals: {run_closure.nonlocals}')
472
- if run_closure.globals:
473
- with ux_utils.print_exception_no_traceback():
474
- raise ValueError(
475
- 'run command generator must be self contained. '
476
- f'Found globals: {run_closure.globals}')
477
- if run_closure.unbound:
478
- # Do not raise an error here. Import statements, which are
479
- # allowed, will be considered as unbounded.
480
- pass
481
- elif self.run is not None and not isinstance(self.run, str):
465
+ if self.run is not None and not isinstance(self.run, str):
482
466
  with ux_utils.print_exception_no_traceback():
483
- raise ValueError('run must be either a shell script (str) or '
484
- f'a command generator ({CommandGen}). '
467
+ raise ValueError('run must be a shell script (str). '
485
468
  f'Got {type(self.run)}')
486
469
 
487
470
  def expand_and_validate_file_mounts(self):
@@ -648,6 +631,10 @@ class Task:
648
631
  config['workdir'] = _fill_in_env_vars(config['workdir'],
649
632
  env_and_secrets)
650
633
 
634
+ if config.get('volumes') is not None:
635
+ config['volumes'] = _fill_in_env_vars(config['volumes'],
636
+ env_and_secrets)
637
+
651
638
  task = Task(
652
639
  config.pop('name', None),
653
640
  run=config.pop('run', None),
@@ -737,34 +724,9 @@ class Task:
737
724
  task.set_outputs(outputs=outputs,
738
725
  estimated_size_gigabytes=estimated_size_gigabytes)
739
726
 
740
- # Experimental configs.
741
- experimental_configs = config.pop('experimental', None)
742
-
743
727
  # Handle the top-level config field
744
728
  config_override = config.pop('config', None)
745
729
 
746
- # Handle backward compatibility with experimental.config_overrides
747
- # TODO: Remove experimental.config_overrides in 0.11.0.
748
- if experimental_configs is not None:
749
- exp_config_override = experimental_configs.pop(
750
- 'config_overrides', None)
751
- if exp_config_override is not None:
752
- logger.warning(
753
- f'{colorama.Fore.YELLOW}`experimental.config_overrides` '
754
- 'field is deprecated in the task YAML. Use the `config` '
755
- f'field to set config overrides.{colorama.Style.RESET_ALL}')
756
- if config_override is not None:
757
- logger.warning(
758
- f'{colorama.Fore.YELLOW}Both top-level `config` and '
759
- f'`experimental.config_overrides` are specified. '
760
- f'Using top-level `config`.{colorama.Style.RESET_ALL}')
761
- else:
762
- config_override = exp_config_override
763
- logger.debug('Overriding skypilot config with task-level config: '
764
- f'{config_override}')
765
- assert not experimental_configs, ('Invalid task args: '
766
- f'{experimental_configs.keys()}')
767
-
768
730
  # Store the final config override for use in resource setup
769
731
  cluster_config_override = config_override
770
732
 
@@ -830,16 +792,27 @@ class Task:
830
792
  # https://github.com/yaml/pyyaml/issues/165#issuecomment-430074049
831
793
  # to raise errors on duplicate keys.
832
794
  user_specified_yaml = f.read()
833
- config = yaml_utils.safe_load(user_specified_yaml)
795
+ return Task.from_yaml_str(user_specified_yaml)
796
+
797
+ @staticmethod
798
+ def from_yaml_str(yaml_str: str) -> 'Task':
799
+ """Initializes a task from a task YAML string.
800
+
801
+ Example:
802
+ .. code-block:: python
803
+
804
+ task = sky.Task.from_yaml_str('yaml_str')
805
+ """
806
+ config = yaml_utils.safe_load(yaml_str)
834
807
 
835
808
  if isinstance(config, str):
836
809
  with ux_utils.print_exception_no_traceback():
837
810
  raise ValueError('YAML loaded as str, not as dict. '
838
- f'Is it correct? Path: {yaml_path}')
811
+ f'Is it correct? content:\n{yaml_str}')
839
812
 
840
813
  if config is None:
841
814
  config = {}
842
- config['_user_specified_yaml'] = user_specified_yaml
815
+ config['_user_specified_yaml'] = yaml_str
843
816
  return Task.from_yaml_config(config)
844
817
 
845
818
  def resolve_and_validate_volumes(self) -> None:
@@ -860,13 +833,26 @@ class Task:
860
833
  volume_mounts: List[volume_lib.VolumeMount] = []
861
834
  for dst_path, vol in self._volumes.items():
862
835
  self._validate_mount_path(dst_path, location='volumes')
863
- # Shortcut for `dst_path: volume_name`
836
+ # Shortcut for `dst_path: volume_name` (external persistent volume)
864
837
  if isinstance(vol, str):
865
838
  volume_mount = volume_lib.VolumeMount.resolve(dst_path, vol)
866
839
  elif isinstance(vol, dict):
867
- assert 'name' in vol, 'Volume name must be set.'
868
- volume_mount = volume_lib.VolumeMount.resolve(
869
- dst_path, vol['name'])
840
+ # Check if this is an ephemeral volume config or external volume
841
+ # with 'size' field
842
+ if 'size' in vol:
843
+ # This is an ephemeral volume config
844
+ volume_mount = (
845
+ volume_lib.VolumeMount.resolve_ephemeral_config(
846
+ dst_path, vol))
847
+ elif 'name' in vol:
848
+ # External volume with 'name' field
849
+ volume_mount = volume_lib.VolumeMount.resolve(
850
+ dst_path, vol['name'])
851
+ else:
852
+ raise ValueError(
853
+ f'Invalid volume config: {dst_path}: {vol}. '
854
+ 'Either "size" (for ephemeral volume) or "name" '
855
+ '(for external volume) must be set.')
870
856
  else:
871
857
  raise ValueError(f'Invalid volume config: {dst_path}: {vol}')
872
858
  volume_mounts.append(volume_mount)
@@ -895,6 +881,9 @@ class Task:
895
881
  if access_mode in disabled_modes:
896
882
  raise ValueError(f'Volume {vol.volume_name} with '
897
883
  f'{disabled_modes[access_mode]}')
884
+ # Skip ephemeral volumes for topology check
885
+ if vol.is_ephemeral:
886
+ continue
898
887
  # Check topology
899
888
  for key, (vol_name, previous_req) in topology.items():
900
889
  req = getattr(vol.volume_config, key)
@@ -931,6 +920,8 @@ class Task:
931
920
  vol_req)
932
921
  else:
933
922
  override_params[key] = vol_req
923
+ logger.debug(
924
+ f'Override resources with volume constraints: {override_params}')
934
925
  self.set_resources_override(override_params)
935
926
  self.volume_mounts = volume_mounts
936
927
 
@@ -961,22 +952,26 @@ class Task:
961
952
  return self._envs
962
953
 
963
954
  @property
964
- def secrets(self) -> Dict[str, str]:
955
+ def secrets(self) -> Dict[str, SecretStr]:
965
956
  return self._secrets
966
957
 
967
958
  @property
968
- def volumes(self) -> Dict[str, str]:
959
+ def volumes(self) -> Dict[str, Union[str, Dict[str, Any]]]:
969
960
  return self._volumes
970
961
 
971
- def set_volumes(self, volumes: Dict[str, str]) -> None:
962
+ def set_volumes(self, volumes: Dict[str, Union[str, Dict[str,
963
+ Any]]]) -> None:
972
964
  """Sets the volumes for this task.
973
965
 
974
966
  Args:
975
- volumes: a dict of ``{mount_path: volume_name}``.
967
+ volumes: a dict of ``{mount_path: volume_name}`` for external
968
+ persistent volumes, or ``{mount_path: volume_config}`` for
969
+ ephemeral volumes.
976
970
  """
977
971
  self._volumes = volumes
978
972
 
979
- def update_volumes(self, volumes: Dict[str, str]) -> None:
973
+ def update_volumes(self, volumes: Dict[str, Union[str, Dict[str,
974
+ Any]]]) -> None:
980
975
  """Updates the volumes for this task."""
981
976
  self._volumes.update(volumes)
982
977
 
@@ -1064,7 +1059,8 @@ class Task:
1064
1059
  raise ValueError(
1065
1060
  'secrets must be List[Tuple[str, str]] or Dict[str, str]: '
1066
1061
  f'{secrets}')
1067
- self._secrets.update(secrets)
1062
+ for key, value in secrets.items():
1063
+ self._secrets[key] = SecretStr(value)
1068
1064
  # Validate Docker login configuration if needed
1069
1065
  if _check_docker_login_config(self._envs, self._secrets):
1070
1066
  self.resources = _with_docker_login_config(self.resources,
@@ -1079,7 +1075,7 @@ class Task:
1079
1075
  return any(r.use_spot for r in self.resources)
1080
1076
 
1081
1077
  @property
1082
- def envs_and_secrets(self) -> Dict[str, str]:
1078
+ def envs_and_secrets(self) -> Dict[str, Union[str, SecretStr]]:
1083
1079
  envs = self.envs.copy()
1084
1080
  envs.update(self.secrets)
1085
1081
  return envs
@@ -1125,7 +1121,7 @@ class Task:
1125
1121
  def set_resources(
1126
1122
  self, resources: Union['resources_lib.Resources',
1127
1123
  List['resources_lib.Resources'],
1128
- Set['resources_lib.Resources']]
1124
+ Set['resources_lib.Resources'], Dict[str, Any]]
1129
1125
  ) -> 'Task':
1130
1126
  """Sets the required resources to execute this task.
1131
1127
 
@@ -1139,7 +1135,9 @@ class Task:
1139
1135
  Returns:
1140
1136
  self: The current task, with resources set.
1141
1137
  """
1142
- if isinstance(resources, resources_lib.Resources):
1138
+ if isinstance(resources, dict):
1139
+ resources = resources_lib.Resources.from_yaml_config(resources)
1140
+ elif isinstance(resources, resources_lib.Resources):
1143
1141
  resources = {resources}
1144
1142
  # TODO(woosuk): Check if the resources are None.
1145
1143
  self.resources = _with_docker_login_config(resources, self.envs,
@@ -1167,6 +1165,10 @@ class Task:
1167
1165
  self.set_resources(type(self.resources)(new_resources_list))
1168
1166
  return self
1169
1167
 
1168
def get_resource_config(self) -> Dict[str, Any]:
    """Returns this task's resources as a YAML-style config dict.

    Delegates to ``_resources_to_config`` with common-field factoring
    enabled.
    """
    task_resources = self.resources
    resource_config = _resources_to_config(task_resources,
                                           factor_out_common_fields=True)
    return resource_config
1171
+
1170
1172
  @property
1171
1173
  def service(self) -> Optional[service_spec.SkyServiceSpec]:
1172
1174
  return self._service
@@ -1547,6 +1549,16 @@ class Task:
1547
1549
  self.update_file_mounts({
1548
1550
  mnt_path: blob_path,
1549
1551
  })
1552
+ elif store_type is storage_lib.StoreType.COREWEAVE:
1553
+ if storage.source is not None and not isinstance(
1554
+ storage.source,
1555
+ list) and storage.source.startswith('cw://'):
1556
+ blob_path = storage.source
1557
+ else:
1558
+ blob_path = 'cw://' + storage.name
1559
+ self.update_file_mounts({
1560
+ mnt_path: blob_path,
1561
+ })
1550
1562
  else:
1551
1563
  with ux_utils.print_exception_no_traceback():
1552
1564
  raise ValueError(f'Storage Type {store_type} '
@@ -1596,6 +1608,69 @@ class Task:
1596
1608
  d[k] = v
1597
1609
  return d
1598
1610
 
1611
def update_workdir(self, workdir: Optional[str], git_url: Optional[str],
                   git_ref: Optional[str]) -> 'Task':
    """Overrides the task workdir with a local path or git settings.

    Args:
        workdir: New workdir value. Only applied when the current workdir
            is unset or a string; it then takes precedence over the git
            arguments.
        git_url: Git URL stored under the ``'url'`` key of the workdir
            mapping.
        git_ref: Git ref stored under the ``'ref'`` key of the workdir
            mapping.

    Returns:
        self: The current task.
    """
    existing = self.workdir
    if existing is not None and not isinstance(existing, str):
        # The workdir is already a mapping: patch only the fields that
        # were supplied. ``workdir`` is not consulted on this path.
        if git_url is not None:
            existing['url'] = git_url
        if git_ref is not None:
            existing['ref'] = git_ref
        return self

    if workdir is not None:
        self.workdir = workdir
    elif git_url is not None:
        git_workdir: Dict[str, str] = {'url': git_url}
        if git_ref is not None:
            git_workdir['ref'] = git_ref
        self.workdir = git_workdir
    # A lone ``git_ref`` without a URL leaves an unset/string workdir as is.
    return self
1636
+
1637
def update_envs_and_secrets_from_workdir(self) -> 'Task':
    """Populates git-related envs and secrets from a mapping workdir.

    Does nothing unless the workdir is a dict. Otherwise the repository is
    resolved through ``git.GitRepo`` using the workdir's ``'url'`` and
    optional ``'ref'``, plus the token / SSH key path read from the
    process environment. The clone URL and the ref (keyed by its detected
    type) are written to ``self.envs``; any token or SSH key returned in
    the clone info is written to ``self.secrets`` wrapped in ``SecretStr``.

    Returns:
        self: The current task.

    Raises:
        ValueError: If a ``exceptions.GitError`` is raised while resolving
            the repository.
    """
    workdir = self.workdir
    # Covers both an unset workdir and a plain string workdir.
    if not isinstance(workdir, dict):
        return self

    repo_url = workdir['url']
    repo_ref = workdir.get('ref', '')
    env_token = os.environ.get(git.GIT_TOKEN_ENV_VAR)
    env_ssh_key_path = os.environ.get(git.GIT_SSH_KEY_PATH_ENV_VAR)
    try:
        repo = git.GitRepo(repo_url, repo_ref, env_token, env_ssh_key_path)
        clone_info = repo.get_repo_clone_info()
        if clone_info is None:
            return self

        self.envs[git.GIT_URL_ENV_VAR] = clone_info.url
        if repo_ref:
            detected_type = repo.get_ref_type()
            ref_env_vars = (
                (git.GitRefType.COMMIT, git.GIT_COMMIT_HASH_ENV_VAR),
                (git.GitRefType.BRANCH, git.GIT_BRANCH_ENV_VAR),
                (git.GitRefType.TAG, git.GIT_TAG_ENV_VAR),
            )
            for candidate_type, env_name in ref_env_vars:
                if detected_type == candidate_type:
                    self.envs[env_name] = repo_ref
                    break

        # Each credential is stored only when present, so no separate
        # "neither is set" short-circuit is needed.
        if clone_info.token is not None:
            self.secrets[git.GIT_TOKEN_ENV_VAR] = SecretStr(clone_info.token)
        if clone_info.ssh_key is not None:
            self.secrets[git.GIT_SSH_KEY_ENV_VAR] = SecretStr(
                clone_info.ssh_key)
    except exceptions.GitError as e:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(f'{str(e)}') from None
    return self
1673
+
1599
1674
  def to_yaml_config(self,
1600
1675
  use_user_specified_yaml: bool = False) -> Dict[str, Any]:
1601
1676
  """Returns a yaml-style dict representation of the task.
@@ -1622,16 +1697,7 @@ class Task:
1622
1697
 
1623
1698
  add_if_not_none('name', self.name)
1624
1699
 
1625
- tmp_resource_config: Union[Dict[str, Union[str, int]],
1626
- Dict[str, List[Dict[str, Union[str, int]]]]]
1627
- if len(self.resources) > 1:
1628
- resource_list = []
1629
- for r in self.resources:
1630
- resource_list.append(r.to_yaml_config())
1631
- key = 'ordered' if isinstance(self.resources, list) else 'any_of'
1632
- tmp_resource_config = {key: resource_list}
1633
- else:
1634
- tmp_resource_config = list(self.resources)[0].to_yaml_config()
1700
+ tmp_resource_config = _resources_to_config(self.resources)
1635
1701
 
1636
1702
  add_if_not_none('resources', tmp_resource_config)
1637
1703
 
@@ -1657,8 +1723,10 @@ class Task:
1657
1723
  add_if_not_none('envs', self.envs, no_empty=True)
1658
1724
 
1659
1725
  secrets = self.secrets
1660
- if secrets and redact_secrets:
1661
- secrets = {k: '<redacted>' for k in secrets}
1726
+ if secrets and not redact_secrets:
1727
+ secrets = {k: v.get_secret_value() for k, v in secrets.items()}
1728
+ elif secrets and redact_secrets:
1729
+ secrets = {k: '<redacted>' for k, v in secrets.items()}
1662
1730
  add_if_not_none('secrets', secrets, no_empty=True)
1663
1731
 
1664
1732
  add_if_not_none('file_mounts', {})
@@ -1709,7 +1777,12 @@ class Task:
1709
1777
  return required_features
1710
1778
 
1711
1779
  def __rshift__(self, b):
1712
- dag_lib.get_current_dag().add_edge(self, b)
1780
+ dag = dag_lib.get_current_dag()
1781
+ if dag is None:
1782
+ raise RuntimeError(
1783
+ 'Cannot use >> operator outside of a DAG context. '
1784
+ 'Please use "with sky.Dag() as dag:" to create a DAG context.')
1785
+ dag.add_edge(self, b)
1713
1786
 
1714
1787
  def __repr__(self):
1715
1788
  if isinstance(self.run, str):
@@ -1744,3 +1817,47 @@ class Task:
1744
1817
  else:
1745
1818
  s += '\n resources: default instances'
1746
1819
  return s
1820
+
1821
+
1822
+ def _resources_to_config(
1823
+ resources: Union[List['resources_lib.Resources'],
1824
+ Set['resources_lib.Resources']],
1825
+ factor_out_common_fields: bool = False) -> Dict[str, Any]:
1826
+ if len(resources) > 1:
1827
+ resource_list: List[Dict[str, Union[str, int]]] = []
1828
+ for r in resources:
1829
+ resource_list.append(r.to_yaml_config())
1830
+ group_key = 'ordered' if isinstance(resources, list) else 'any_of'
1831
+ if factor_out_common_fields:
1832
+ return _factor_out_common_resource_fields(resource_list, group_key)
1833
+ return {group_key: resource_list}
1834
+ else:
1835
+ return list(resources)[0].to_yaml_config()
1836
+
1837
+
1838
+ def _factor_out_common_resource_fields(configs: List[Dict[str, Union[str,
1839
+ int]]],
1840
+ group_key: str) -> Dict[str, Any]:
1841
+ """Factors out the fields that are common to all resources."""
1842
+ return_config: Dict[str, Any] = configs[0].copy()
1843
+ if len(configs) > 1:
1844
+ for config in configs[1:]:
1845
+ for key, value in config.items():
1846
+ if key in return_config and return_config[key] != value:
1847
+ del return_config[key]
1848
+ num_empty_configs = 0
1849
+ for config in configs:
1850
+ keys_to_delete = []
1851
+ for key, value in config.items():
1852
+ if key in return_config:
1853
+ keys_to_delete.append(key)
1854
+ for key in keys_to_delete:
1855
+ del config[key]
1856
+ if not config:
1857
+ num_empty_configs += 1
1858
+
1859
+ if num_empty_configs == len(configs):
1860
+ return return_config
1861
+ if len(configs) > 0:
1862
+ return_config[group_key] = configs
1863
+ return return_config
@@ -190,6 +190,7 @@ setup_commands:
190
190
  {{ conda_installation_commands }}
191
191
  conda config --remove channels "https://aws-ml-conda-ec2.s3.us-west-2.amazonaws.com" || true;
192
192
  {{ ray_skypilot_installation_commands }}
193
+ {{ copy_skypilot_templates_commands }}
193
194
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
194
195
  {%- if docker_image is none %}
195
196
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
@@ -118,6 +118,7 @@ setup_commands:
118
118
  - mkdir -p ~/.ssh; touch ~/.ssh/config;
119
119
  {{ conda_installation_commands }}
120
120
  {{ ray_skypilot_installation_commands }}
121
+ {{ copy_skypilot_templates_commands }}
121
122
  touch ~/.sudo_as_admin_successful;
122
123
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
123
124
  {%- if docker_image is none %}
@@ -68,6 +68,7 @@ setup_commands:
68
68
  mkdir -p ~/.ssh; touch ~/.ssh/config;
69
69
  {{ conda_installation_commands }}
70
70
  {{ ray_skypilot_installation_commands }}
71
+ {{ copy_skypilot_templates_commands }}
71
72
  touch ~/.sudo_as_admin_successful;
72
73
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
73
74
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
@@ -89,6 +89,7 @@ setup_commands:
89
89
  mkdir -p ~/.ssh; touch ~/.ssh/config;
90
90
  {{ conda_installation_commands }}
91
91
  {{ ray_skypilot_installation_commands }}
92
+ {{ copy_skypilot_templates_commands }}
92
93
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
93
94
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
94
95
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
@@ -69,6 +69,7 @@ setup_commands:
69
69
  mkdir -p ~/.ssh; touch ~/.ssh/config;
70
70
  {{ conda_installation_commands }}
71
71
  {{ ray_skypilot_installation_commands }}
72
+ {{ copy_skypilot_templates_commands }}
72
73
  touch ~/.sudo_as_admin_successful;
73
74
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
74
75
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
@@ -276,6 +276,7 @@ setup_commands:
276
276
  grep "export TPU_NAME=" ~/.bashrc && echo "TPU_NAME already set" || echo "export TPU_NAME={{tpu_node_name}}" >> ~/.bashrc;
277
277
  {%- endif %}
278
278
  {{ ray_skypilot_installation_commands }}
279
+ {{ copy_skypilot_templates_commands }}
279
280
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
280
281
  {%- if docker_image is none %}
281
282
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
@@ -62,6 +62,7 @@ setup_commands:
62
62
  which patch > /dev/null || sudo apt install -y patch;
63
63
  {{ conda_installation_commands }}
64
64
  {{ ray_skypilot_installation_commands }}
65
+ {{ copy_skypilot_templates_commands }}
65
66
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
66
67
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
67
68
  {{ ssh_max_sessions_config }}
@@ -102,6 +102,7 @@ setup_commands:
102
102
  mkdir -p ~/.ssh; touch ~/.ssh/config;
103
103
  {{ conda_installation_commands }}
104
104
  {{ ray_skypilot_installation_commands }}
105
+ {{ copy_skypilot_templates_commands }}
105
106
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
106
107
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
107
108
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
@@ -121,7 +122,7 @@ head_start_ray_commands:
121
122
  # all the sessions to be reloaded. This is a workaround.
122
123
  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
123
124
  which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
124
- {{dump_port_command}}; {{ray_head_wait_initialized_command}}
125
+ {{dump_port_command}} {{ray_head_wait_initialized_command}}
125
126
 
126
127
  {%- if num_nodes > 1 %}
127
128
  worker_start_ray_commands:
@@ -36,6 +36,9 @@ setup: |
36
36
  grep -q 'alias sky-env=' ~/.bashrc || echo 'alias sky-env="{{ sky_activate_python_env }}"' >> ~/.bashrc
37
37
  {% endif %}
38
38
 
39
+ # This is used by the skylet events to check if we are a jobs controller.
40
+ touch {{job_controller_indicator_file}}
41
+
39
42
  run: |
40
43
  {%- if consolidation_mode_job_id is none %}
41
44
  {{ sky_activate_python_env }}