skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/utils/serialize_utils.py ADDED
@@ -0,0 +1,16 @@
1
+ """Utilities for handling resource handles."""
2
+ import copy
3
+ import typing
4
+
5
+
6
+ def prepare_handle_for_backwards_compatibility(
7
+ handle: typing.Any) -> typing.Any:
8
+ """Prepare a handle for backwards compatibility with older clients."""
9
+ # skylet_ssh_tunnel was causing backwards compatibility issues with older
10
+ # clients: AttributeError: Can't get attribute 'SSHTunnelInfo'
11
+ #
12
+ # But it is not needed on the client side, so we can just remove it.
13
+ if handle is not None and hasattr(handle, 'skylet_ssh_tunnel'):
14
+ handle = copy.deepcopy(handle)
15
+ handle.skylet_ssh_tunnel = None
16
+ return handle
sky/utils/status_lib.py CHANGED
@@ -54,3 +54,13 @@ class StorageStatus(enum.Enum):
54
54
 
55
55
  # Finished uploading, in terminal state
56
56
  READY = 'READY'
57
+
58
+
59
+ class VolumeStatus(enum.Enum):
60
+ """Volume status as recorded in table 'volumes'."""
61
+
62
+ # Volume is ready to be used
63
+ READY = 'READY'
64
+
65
+ # Volume is being used
66
+ IN_USE = 'IN_USE'
sky/utils/subprocess_utils.py CHANGED
@@ -6,18 +6,20 @@ import random
6
6
  import resource
7
7
  import shlex
8
8
  import subprocess
9
+ import sys
9
10
  import threading
10
11
  import time
11
12
  import typing
12
- from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
13
+ from typing import (Any, Callable, Dict, List, Optional, Protocol, Set, Tuple,
14
+ Union)
13
15
 
14
16
  import colorama
15
17
 
16
18
  from sky import exceptions
17
19
  from sky import sky_logging
18
20
  from sky.adaptors import common as adaptors_common
19
- from sky.skylet import constants
20
21
  from sky.skylet import log_lib
22
+ from sky.skylet import subprocess_daemon
21
23
  from sky.utils import common_utils
22
24
  from sky.utils import timeline
23
25
  from sky.utils import ux_utils
@@ -107,7 +109,7 @@ def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
107
109
 
108
110
 
109
111
  def run_in_parallel(func: Callable,
110
- args: List[Any],
112
+ args: Union[List[Any], Set[Any]],
111
113
  num_threads: Optional[int] = None) -> List[Any]:
112
114
  """Run a function in parallel on a list of arguments.
113
115
 
@@ -128,7 +130,7 @@ def run_in_parallel(func: Callable,
128
130
  if len(args) == 0:
129
131
  return []
130
132
  if len(args) == 1:
131
- return [func(args[0])]
133
+ return [func(list(args)[0])]
132
134
 
133
135
  processes = (num_threads
134
136
  if num_threads is not None else get_parallel_threads())
@@ -208,8 +210,11 @@ def kill_children_processes(parent_pids: Optional[Union[
208
210
  kill_process_with_grace_period(child, force=force)
209
211
 
210
212
 
211
- def kill_process_with_grace_period(proc: Union[multiprocessing.Process,
212
- psutil.Process],
213
+ GenericProcess = Union[multiprocessing.Process, psutil.Process,
214
+ subprocess.Popen]
215
+
216
+
217
+ def kill_process_with_grace_period(proc: GenericProcess,
213
218
  force: bool = False,
214
219
  grace_period: int = 10) -> None:
215
220
  """Kill a process with SIGTERM and wait for it to exit.
@@ -223,6 +228,9 @@ def kill_process_with_grace_period(proc: Union[multiprocessing.Process,
223
228
  if isinstance(proc, psutil.Process):
224
229
  alive = proc.is_running
225
230
  wait = proc.wait
231
+ elif isinstance(proc, subprocess.Popen):
232
+ alive = lambda: proc.poll() is None
233
+ wait = proc.wait
226
234
  else:
227
235
  alive = proc.is_alive
228
236
  wait = proc.join
@@ -240,11 +248,10 @@ def kill_process_with_grace_period(proc: Union[multiprocessing.Process,
240
248
  # The child process may have already been terminated.
241
249
  return
242
250
  except psutil.TimeoutExpired:
243
- # Pass to finally to force kill the process.
244
- pass
245
- finally:
246
251
  logger.debug(f'Process {proc.pid} did not terminate after '
247
252
  f'{grace_period} seconds')
253
+ # Continue to finally to force kill the process.
254
+ finally:
248
255
  # Attempt to force kill if the normal termination fails
249
256
  if not force:
250
257
  logger.debug(f'Force killing process {proc.pid}')
@@ -300,11 +307,17 @@ def run_with_retries(
300
307
  return returncode, stdout, stderr
301
308
 
302
309
 
303
- def kill_process_daemon(process_pid: int) -> None:
310
+ def kill_process_daemon(process_pid: int, use_kill_pg: bool = False) -> None:
304
311
  """Start a daemon as a safety net to kill the process.
305
312
 
306
313
  Args:
307
314
  process_pid: The PID of the process to kill.
315
+ use_kill_pg: Whether to use kill process group to kill the process. If
316
+ True, the process will use os.killpg() to kill the target process
317
+ group on UNIX system, which is more efficient than using the daemon
318
+ to refresh the process tree in the daemon. Note that both
319
+ implementations have corner cases where subprocesses might not be
320
+ killed. Refer to subprocess_daemon.py for more details.
308
321
  """
309
322
  # Get initial children list
310
323
  try:
@@ -317,12 +330,8 @@ def kill_process_daemon(process_pid: int) -> None:
317
330
  daemon_script = os.path.join(
318
331
  os.path.dirname(os.path.abspath(log_lib.__file__)),
319
332
  'subprocess_daemon.py')
320
- python_path = subprocess.check_output(constants.SKY_GET_PYTHON_PATH_CMD,
321
- shell=True,
322
- stderr=subprocess.DEVNULL,
323
- encoding='utf-8').strip()
324
333
  daemon_cmd = [
325
- python_path,
334
+ sys.executable,
326
335
  daemon_script,
327
336
  '--parent-pid',
328
337
  str(parent_pid),
@@ -335,6 +344,10 @@ def kill_process_daemon(process_pid: int) -> None:
335
344
  ','.join(map(str, initial_children)),
336
345
  ]
337
346
 
347
+ env = os.environ.copy()
348
+ if use_kill_pg:
349
+ env[subprocess_daemon.USE_KILL_PG_ENV_VAR] = '1'
350
+
338
351
  # We do not need to set `start_new_session=True` here, as the
339
352
  # daemon script will detach itself from the parent process with
340
353
  # fork to avoid being killed by parent process. See the reason we
@@ -346,6 +359,7 @@ def kill_process_daemon(process_pid: int) -> None:
346
359
  stderr=subprocess.DEVNULL,
347
360
  # Disable input
348
361
  stdin=subprocess.DEVNULL,
362
+ env=env,
349
363
  )
350
364
 
351
365
 
@@ -436,3 +450,12 @@ def slow_start_processes(processes: List[Startable],
436
450
  break
437
451
  batch_size = min(batch_size * 2, max_batch_size)
438
452
  time.sleep(delay)
453
+
454
+
455
def is_process_alive(pid: int) -> bool:
    """Report whether a process with the given PID is currently running.

    Args:
        pid: The process ID to look up.

    Returns:
        The result of ``psutil.Process(pid).is_running()``, or False when
        psutil reports that no such process exists.
    """
    try:
        return psutil.Process(pid).is_running()
    except psutil.NoSuchProcess:
        # The PID is not (or no longer) present in the process table.
        return False
sky/utils/tempstore.py ADDED
@@ -0,0 +1,70 @@
1
+ """Temporary storage context manager."""
2
+
3
+ import contextlib
4
+ import contextvars
5
+ import functools
6
+ import os
7
+ import tempfile
8
+ import typing
9
+ from typing import Any, Callable, Iterator, Optional, TypeVar
10
+
11
# Path of the temp dir owned by the innermost active `tempdir()` context, or
# None when no such context is active.
_TEMP_DIR: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar(
    'temp_store_dir', default=None)


@contextlib.contextmanager
def tempdir() -> Iterator[str]:
    """Provide a temporary directory that is visible to the whole context.

    A tempfile.TemporaryDirectory is created and its path is published in a
    context variable, so nested functions (see `mkdtemp`) can find it without
    having the path passed down explicitly.

    Calls may be nested: each call creates its own directory, and the
    previously published directory is restored when the inner call exits.

    Yields:
        The path of the newly created temporary directory.
    """
    with contextlib.ExitStack() as cleanup:
        scratch_path = cleanup.enter_context(
            tempfile.TemporaryDirectory(prefix='sky-tmp'))
        # Callbacks run in LIFO order, so the context variable is restored
        # before the directory itself is removed.
        cleanup.callback(_TEMP_DIR.reset, _TEMP_DIR.set(scratch_path))
        yield scratch_path


# Keep the function signature same as tempfile.mkdtemp.
# pylint: disable=redefined-builtin
def mkdtemp(suffix: Optional[str] = None,
            prefix: Optional[str] = None,
            dir: Optional[str] = None) -> str:
    """Create a temporary directory under the current context's temp dir.

    When a `tempdir()` context is active, the new directory is created inside
    it (or inside `dir` joined onto it, which is created if missing), so it is
    removed together with the context's directory. Without an active context
    this is equivalent to tempfile.mkdtemp.

    Args:
        suffix: Passed through to tempfile.mkdtemp.
        prefix: Passed through to tempfile.mkdtemp.
        dir: Parent directory; joined onto the context temp dir when a
            context is active.

    Returns:
        The path of the created directory.
    """
    base_dir = _TEMP_DIR.get()
    if base_dir is not None:
        if dir is None:
            dir = base_dir
        else:
            dir = os.path.join(base_dir, dir)
            os.makedirs(dir, exist_ok=True)
    return tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=dir)


F = TypeVar('F', bound=Callable[..., Any])


def with_tempdir(func: F) -> F:
    """Decorate `func` so that every call runs inside a `tempdir()` context.

    Refer to `tempdir` for more details.
    """

    @functools.wraps(func)
    def run_in_tempdir(*args, **kwargs):
        with tempdir():
            return func(*args, **kwargs)

    return typing.cast(F, run_in_tempdir)
sky/utils/timeline.py CHANGED
@@ -4,7 +4,6 @@ The timeline follows the trace event format defined here:
4
4
  https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview
5
5
  """ # pylint: disable=line-too-long
6
6
  import atexit
7
- import functools
8
7
  import json
9
8
  import os
10
9
  import threading
@@ -12,13 +11,15 @@ import time
12
11
  import traceback
13
12
  from typing import Callable, Optional, Union
14
13
 
15
- import filelock
16
-
17
14
  from sky.utils import common_utils
18
15
 
19
16
  _events = []
20
17
 
21
18
 
19
def _get_events_file_path():
    """Return the value of SKYPILOT_TIMELINE_FILE_PATH, or None if unset."""
    return os.getenv('SKYPILOT_TIMELINE_FILE_PATH')
21
+
22
+
22
23
  class Event:
23
24
  """Record an event.
24
25
 
@@ -28,6 +29,10 @@ class Event:
28
29
  """
29
30
 
30
31
  def __init__(self, name: str, message: Optional[str] = None):
32
+ self._skipped = False
33
+ if not _get_events_file_path():
34
+ self._skipped = True
35
+ return
31
36
  self._name = name
32
37
  self._message = message
33
38
  # See the module doc for the event format.
@@ -44,6 +49,8 @@ class Event:
44
49
  self._event['args'] = {'message': self._message}
45
50
 
46
51
  def begin(self):
52
+ if self._skipped:
53
+ return
47
54
  event_begin = self._event.copy()
48
55
  event_begin.update({
49
56
  'ph': 'B',
@@ -55,6 +62,8 @@ class Event:
55
62
  _events.append(event_begin)
56
63
 
57
64
  def end(self):
65
+ if self._skipped:
66
+ return
58
67
  event_end = self._event.copy()
59
68
  event_end.update({
60
69
  'ph': 'E',
@@ -76,63 +85,26 @@ def event(name_or_fn: Union[str, Callable], message: Optional[str] = None):
76
85
  return common_utils.make_decorator(Event, name_or_fn, message=message)
77
86
 
78
87
 
79
- class FileLockEvent:
80
- """Serve both as a file lock and event for the lock."""
81
-
82
- def __init__(self, lockfile: Union[str, os.PathLike], timeout: float = -1):
83
- self._lockfile = lockfile
84
- os.makedirs(os.path.dirname(os.path.abspath(self._lockfile)),
85
- exist_ok=True)
86
- self._lock = filelock.FileLock(self._lockfile, timeout)
87
- self._hold_lock_event = Event(f'[FileLock.hold]:{self._lockfile}')
88
-
89
- def acquire(self):
90
- was_locked = self._lock.is_locked
91
- with Event(f'[FileLock.acquire]:{self._lockfile}'):
92
- self._lock.acquire()
93
- if not was_locked and self._lock.is_locked:
94
- # start holding the lock after initial acquiring
95
- self._hold_lock_event.begin()
96
-
97
- def release(self):
98
- was_locked = self._lock.is_locked
99
- self._lock.release()
100
- if was_locked and not self._lock.is_locked:
101
- # stop holding the lock after initial releasing
102
- self._hold_lock_event.end()
103
-
104
- def __enter__(self):
105
- self.acquire()
106
- return self
107
-
108
- def __exit__(self, exc_type, exc_val, exc_tb):
109
- self.release()
110
-
111
- def __call__(self, f):
112
- # Make this class callable as a decorator.
113
- @functools.wraps(f)
114
- def wrapper(*args, **kwargs):
115
- with self:
116
- return f(*args, **kwargs)
117
-
118
- return wrapper
119
-
120
-
121
88
  def save_timeline():
122
- file_path = os.environ.get('SKYPILOT_TIMELINE_FILE_PATH')
123
- if not file_path:
89
+ events_file_path = _get_events_file_path()
90
+ if not events_file_path:
124
91
  return
92
+ global _events
93
+ events_to_write = _events
94
+ _events = []
125
95
  json_output = {
126
- 'traceEvents': _events,
96
+ 'traceEvents': events_to_write,
127
97
  'displayTimeUnit': 'ms',
128
98
  'otherData': {
129
- 'log_dir': os.path.dirname(os.path.abspath(file_path)),
99
+ 'log_dir': os.path.dirname(os.path.abspath(events_file_path)),
130
100
  }
131
101
  }
132
- os.makedirs(os.path.dirname(os.path.abspath(file_path)), exist_ok=True)
133
- with open(file_path, 'w', encoding='utf-8') as f:
102
+ os.makedirs(os.path.dirname(os.path.abspath(events_file_path)),
103
+ exist_ok=True)
104
+ with open(events_file_path, 'w', encoding='utf-8') as f:
134
105
  json.dump(json_output, f)
106
+ del events_to_write
135
107
 
136
108
 
137
- if os.environ.get('SKYPILOT_TIMELINE_FILE_PATH'):
109
+ if _get_events_file_path():
138
110
  atexit.register(save_timeline)
sky/utils/ux_utils.py CHANGED
@@ -1,17 +1,19 @@
1
1
  """Utility functions for UX."""
2
2
  import contextlib
3
3
  import enum
4
+ import fnmatch
4
5
  import os
5
6
  import sys
6
7
  import traceback
7
8
  import typing
8
- from typing import Callable, Optional, Union
9
+ from typing import Callable, Iterable, List, Optional, Union
9
10
 
10
11
  import colorama
11
12
 
12
13
  from sky import sky_logging
13
14
  from sky.skylet import constants
14
15
  from sky.utils import common_utils
16
+ from sky.utils import env_options
15
17
  from sky.utils import rich_console_utils
16
18
 
17
19
  if typing.TYPE_CHECKING:
@@ -25,9 +27,16 @@ BOLD = '\033[1m'
25
27
  RESET_BOLD = '\033[0m'
26
28
 
27
29
  # Log path hint in the spinner during launching
30
+ # (old, kept for backward compatibility)
28
31
  _LOG_PATH_HINT = (f'{colorama.Style.DIM}View logs: sky api logs -l '
29
32
  '{log_path}'
30
33
  f'{colorama.Style.RESET_ALL}')
34
+ # Log hint: recommend sky logs --provision <cluster_name>
35
+ _PROVISION_LOG_HINT = (
36
+ f'{colorama.Style.DIM}View logs: '
37
+ f'{BOLD}sky logs --provision {{cluster_name}}{RESET_BOLD}'
38
+ f'{colorama.Style.RESET_ALL}')
39
+ # Legacy path hint retained for local-only cases where we don't have a cluster name
31
40
  _LOG_PATH_HINT_LOCAL = (f'{colorama.Style.DIM}View logs: '
32
41
  '{log_path}'
33
42
  f'{colorama.Style.RESET_ALL}')
@@ -57,10 +66,14 @@ def print_exception_no_traceback():
57
66
  if error():
58
67
  raise ValueError('...')
59
68
  """
60
- original_tracelimit = getattr(sys, 'tracebacklimit', 1000)
61
- sys.tracebacklimit = 0
62
- yield
63
- sys.tracebacklimit = original_tracelimit
69
+ if env_options.Options.SHOW_DEBUG_INFO.get():
70
+ # When SKYPILOT_DEBUG is set, show the full traceback
71
+ yield
72
+ else:
73
+ original_tracelimit = getattr(sys, 'tracebacklimit', 1000)
74
+ sys.tracebacklimit = 0
75
+ yield
76
+ sys.tracebacklimit = original_tracelimit
64
77
 
65
78
 
66
79
  @contextlib.contextmanager
@@ -121,7 +134,10 @@ class RedirectOutputForProcess:
121
134
 
122
135
  def log_path_hint(log_path: Union[str, 'pathlib.Path'],
123
136
  is_local: bool = False) -> str:
124
- """Gets the log path hint for the given log path."""
137
+ """Gets the log path hint for the given log path.
138
+
139
+ Kept for backward compatibility when only paths are available.
140
+ """
125
141
  log_path = str(log_path)
126
142
  expanded_home = os.path.expanduser('~')
127
143
  if log_path.startswith(expanded_home):
@@ -134,6 +150,12 @@ def log_path_hint(log_path: Union[str, 'pathlib.Path'],
134
150
  return _LOG_PATH_HINT.format(log_path=log_path)
135
151
 
136
152
 
153
def provision_hint(cluster_name: Optional[str]) -> Optional[str]:
    """Build the `sky logs --provision <cluster>` hint for a cluster.

    Args:
        cluster_name: Name of the cluster, or None/empty if unknown.

    Returns:
        The formatted hint string, or None when no cluster name is given.
    """
    if cluster_name:
        return _PROVISION_LOG_HINT.format(cluster_name=cluster_name)
    return None
157
+
158
+
137
159
  def starting_message(message: str) -> str:
138
160
  """Gets the starting message for the given message."""
139
161
  # We have to reset the color before the message, because sometimes if a
@@ -145,7 +167,8 @@ def starting_message(message: str) -> str:
145
167
  def finishing_message(message: str,
146
168
  log_path: Optional[Union[str, 'pathlib.Path']] = None,
147
169
  is_local: bool = False,
148
- follow_up_message: Optional[str] = None) -> str:
170
+ follow_up_message: Optional[str] = None,
171
+ cluster_name: Optional[str] = None) -> str:
149
172
  """Gets the finishing message for the given message.
150
173
 
151
174
  Args:
@@ -161,7 +184,11 @@ def finishing_message(message: str,
161
184
  follow_up_message = follow_up_message if (follow_up_message
162
185
  is not None) else ''
163
186
  success_prefix = (f'{colorama.Style.RESET_ALL}{colorama.Fore.GREEN}✓ '
164
- f'{message}{colorama.Style.RESET_ALL}{follow_up_message}')
187
+ f'{message}{colorama.Style.RESET_ALL}{follow_up_message}'
188
+ f'{colorama.Style.RESET_ALL}')
189
+ hint = provision_hint(cluster_name)
190
+ if hint:
191
+ return f'{success_prefix} {hint}'
165
192
  if log_path is None:
166
193
  return success_prefix
167
194
  path_hint = log_path_hint(log_path, is_local)
@@ -170,13 +197,17 @@ def finishing_message(message: str,
170
197
 
171
198
  def error_message(message: str,
172
199
  log_path: Optional[Union[str, 'pathlib.Path']] = None,
173
- is_local: bool = False) -> str:
200
+ is_local: bool = False,
201
+ cluster_name: Optional[str] = None) -> str:
174
202
  """Gets the error message for the given message."""
175
203
  # We have to reset the color before the message, because sometimes if a
176
204
  # previous spinner with dimmed color overflows in a narrow terminal, the
177
205
  # color might be messed up.
178
206
  error_prefix = (f'{colorama.Style.RESET_ALL}{colorama.Fore.RED}⨯'
179
207
  f'{colorama.Style.RESET_ALL} {message}')
208
+ hint = provision_hint(cluster_name)
209
+ if hint:
210
+ return f'{error_prefix} {hint}'
180
211
  if log_path is None:
181
212
  return error_prefix
182
213
  path_hint = log_path_hint(log_path, is_local)
@@ -194,9 +225,16 @@ def retry_message(message: str) -> str:
194
225
 
195
226
  def spinner_message(message: str,
196
227
  log_path: Optional[Union[str, 'pathlib.Path']] = None,
197
- is_local: bool = False) -> str:
198
- """Gets the spinner message for the given message and log path."""
228
+ is_local: bool = False,
229
+ cluster_name: Optional[str] = None) -> str:
230
+ """Gets the spinner message for the given message and log path.
231
+
232
+ If cluster_name is provided, recommend `sky logs --provision <cluster>`.
233
+ """
199
234
  colored_spinner = f'[bold cyan]{message}[/]'
235
+ hint = provision_hint(cluster_name)
236
+ if hint:
237
+ return f'{colored_spinner} {hint}'
200
238
  if log_path is None:
201
239
  return colored_spinner
202
240
  path_hint = log_path_hint(log_path, is_local)
@@ -247,9 +285,40 @@ def command_hint_messages(hint_type: CommandHintType,
247
285
  f'{BOLD}sky jobs logs {job_id}{RESET_BOLD}'
248
286
  f'\n{INDENT_SYMBOL}To stream controller logs:\t\t'
249
287
  f'{BOLD}sky jobs logs --controller {job_id}{RESET_BOLD}'
250
- f'\n{INDENT_SYMBOL}To view all managed jobs:\t\t'
251
- f'{BOLD}sky jobs queue{RESET_BOLD}'
252
- f'\n{INDENT_LAST_SYMBOL}To view managed job dashboard:\t\t'
253
- f'{BOLD}sky jobs dashboard{RESET_BOLD}')
288
+ f'\n{INDENT_LAST_SYMBOL}To view all managed jobs:\t\t'
289
+ f'{BOLD}sky jobs queue{RESET_BOLD}')
254
290
  else:
255
291
  raise ValueError(f'Invalid hint type: {hint_type}')
292
+
293
+
294
def is_glob_pattern(pattern: str) -> bool:
    """Check whether a string contains common glob pattern wildcards.

    The characters ``*``, ``?``, ``[`` and ``]`` count as wildcards. The
    ``**`` globstar needs no separate check because it contains ``*``.

    Args:
        pattern: The string to inspect.

    Returns:
        True if at least one wildcard character is present.
    """
    return any(char in '*?[]' for char in pattern)


def get_non_matched_query(query_clusters: Iterable[str],
                          cluster_names: Iterable[str]) -> List[str]:
    """Get the queried clusters that match none of the known cluster names.

    Plain names must equal a known cluster name; glob patterns must match at
    least one known cluster name (via fnmatch).

    Args:
        query_clusters: Cluster names or glob patterns given by the user.
        cluster_names: Names of the clusters that actually exist. Any
            iterable is accepted, including one-shot iterators.

    Returns:
        The unmatched queries: unmatched plain names first, followed by
        unmatched glob patterns, each group in query order.
    """
    # Materialize once: the names are scanned once per query, so a one-shot
    # iterator would be exhausted after the first lookup and every later
    # query would be wrongly reported as unmatched.
    known_names = list(cluster_names)
    known_name_set = set(known_names)

    unmatched_names: List[str] = []
    unmatched_globs: List[str] = []
    for query in query_clusters:
        if is_glob_pattern(query):
            if not fnmatch.filter(known_names, query):
                unmatched_globs.append(query)
        elif query not in known_name_set:
            unmatched_names.append(query)
    return unmatched_names + unmatched_globs
sky/utils/validator.py CHANGED
@@ -14,9 +14,19 @@ def case_insensitive_enum(validator, enums, instance, schema):
14
14
  f'{instance!r} is not one of {enums!r}')
15
15
 
16
16
 
17
def case_sensitive_enum(validator, enums, instance, schema):
    """Schema keyword check: `instance` must be a member of `enums`.

    Membership is tested with `in`, so string comparison is case-sensitive.

    Yields:
        A jsonschema.ValidationError when `instance` is not in `enums`;
        nothing otherwise.
    """
    del validator, schema  # Unused.
    if instance in enums:
        return
    yield jsonschema.ValidationError(
        f'{instance!r} is not one of {enums!r}')
22
+
23
+
17
24
  # Move this to a function to delay initialization
18
25
  def get_schema_validator():
19
26
  """Get the schema validator class, initializing it only when needed."""
20
27
  return jsonschema.validators.extend(
21
28
  jsonschema.Draft7Validator,
22
- validators={'case_insensitive_enum': case_insensitive_enum})
29
+ validators={
30
+ 'case_insensitive_enum': case_insensitive_enum,
31
+ 'case_sensitive_enum': case_sensitive_enum
32
+ })
sky/utils/volume.py ADDED
@@ -0,0 +1,86 @@
1
+ """Volume utilities."""
2
+ import enum
3
+ import time
4
+ from typing import Any, Dict
5
+
6
+ from sky import exceptions
7
+ from sky import global_user_state
8
+ from sky import models
9
+ from sky.utils import common_utils
10
+ from sky.utils import schemas
11
+ from sky.utils import status_lib
12
+
13
+ MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB = 10
14
+
15
+
16
class VolumeAccessMode(enum.Enum):
    """Access modes a volume can be requested with.

    Each member's value is the access-mode string used for that mode.
    """
    READ_WRITE_ONCE = 'ReadWriteOnce'
    READ_WRITE_ONCE_POD = 'ReadWriteOncePod'
    READ_WRITE_MANY = 'ReadWriteMany'
    READ_ONLY_MANY = 'ReadOnlyMany'
22
+
23
+
24
class VolumeType(enum.Enum):
    """Kinds of volumes, keyed by their string identifier."""
    PVC = 'k8s-pvc'
    RUNPOD_NETWORK_VOLUME = 'runpod-network-volume'

    @classmethod
    def supported_types(cls) -> list:
        """Return the string value of every volume type, in definition order."""
        return [member.value for member in cls]
33
+
34
+
35
class VolumeMount:
    """Describes where a named volume is mounted, plus the volume's config."""

    def __init__(self, path: str, volume_name: str,
                 volume_config: models.VolumeConfig):
        # Mount path for the volume.
        self.path: str = path
        # Name under which the volume is recorded in global user state.
        self.volume_name: str = volume_name
        # Volume configuration associated with the volume.
        self.volume_config: models.VolumeConfig = volume_config

    def pre_mount(self) -> None:
        """Mark the volume as in use before it is actually mounted."""
        # TODO(aylei): for ReadWriteOnce volume, we also need to queue the
        # mount request if the target volume is already mounted to another
        # cluster. For now, we only support ReadWriteMany volume.
        attached_at = int(time.time())
        global_user_state.update_volume(self.volume_name,
                                        last_attached_at=attached_at,
                                        status=status_lib.VolumeStatus.IN_USE)

    @classmethod
    def resolve(cls, path: str, volume_name: str) -> 'VolumeMount':
        """Build a VolumeMount by looking up the volume's stored record.

        Raises:
            exceptions.VolumeNotFoundError: If no volume with the given name
                is recorded.
        """
        volume_record = global_user_state.get_volume_by_name(volume_name)
        if volume_record is None:
            raise exceptions.VolumeNotFoundError(
                f'Volume {volume_name} not found.')
        assert 'handle' in volume_record, 'Volume handle is None.'
        handle: models.VolumeConfig = volume_record['handle']
        return cls(path, volume_name, handle)

    @classmethod
    def from_yaml_config(cls, config: Dict[str, Any]) -> 'VolumeMount':
        """Build a VolumeMount from its YAML dict form.

        The dict is validated against the volume mount schema first. Note
        that the consumed keys are popped from `config`.
        """
        common_utils.validate_schema(config, schemas.get_volume_mount_schema(),
                                     'Invalid volume mount config: ')

        mount_path = config.pop('path', None)
        name = config.pop('volume_name', None)
        raw_volume_config = config.pop('volume_config', None)
        parsed_config: models.VolumeConfig = (
            models.VolumeConfig.model_validate(raw_volume_config))
        return cls(mount_path, name, parsed_config)

    def to_yaml_config(self) -> Dict[str, Any]:
        """Return the dict form accepted by `from_yaml_config`."""
        yaml_config: Dict[str, Any] = {
            'path': self.path,
            'volume_name': self.volume_name,
            'volume_config': self.volume_config.model_dump(),
        }
        return yaml_config

    def __repr__(self):
        return (f'VolumeMount('
                f'\n\tpath={self.path},'
                f'\n\tvolume_name={self.volume_name},'
                f'\n\tvolume_config={self.volume_config})')