skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,16 @@
1
+ """Utilities for handling resource handles."""
2
+ import copy
3
+ import typing
4
+
5
+
6
+ def prepare_handle_for_backwards_compatibility(
7
+ handle: typing.Any) -> typing.Any:
8
+ """Prepare a handle for backwards compatibility with older clients."""
9
+ # skylet_ssh_tunnel was causing backwards compatibility issues with older
10
+ # clients: AttributeError: Can't get attribute 'SSHTunnelInfo'
11
+ #
12
+ # But it is not needed on the client side, so we can just remove it.
13
+ if handle is not None and hasattr(handle, 'skylet_ssh_tunnel'):
14
+ handle = copy.deepcopy(handle)
15
+ handle.skylet_ssh_tunnel = None
16
+ return handle
sky/utils/status_lib.py CHANGED
@@ -54,3 +54,13 @@ class StorageStatus(enum.Enum):
54
54
 
55
55
  # Finished uploading, in terminal state
56
56
  READY = 'READY'
57
+
58
+
59
+ class VolumeStatus(enum.Enum):
60
+ """Volume status as recorded in table 'volumes'."""
61
+
62
+ # Volume is ready to be used
63
+ READY = 'READY'
64
+
65
+ # Volume is being used
66
+ IN_USE = 'IN_USE'
@@ -6,18 +6,20 @@ import random
6
6
  import resource
7
7
  import shlex
8
8
  import subprocess
9
+ import sys
9
10
  import threading
10
11
  import time
11
12
  import typing
12
- from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
13
+ from typing import (Any, Callable, Dict, List, Optional, Protocol, Set, Tuple,
14
+ Union)
13
15
 
14
16
  import colorama
15
17
 
16
18
  from sky import exceptions
17
19
  from sky import sky_logging
18
20
  from sky.adaptors import common as adaptors_common
19
- from sky.skylet import constants
20
21
  from sky.skylet import log_lib
22
+ from sky.skylet import subprocess_daemon
21
23
  from sky.utils import common_utils
22
24
  from sky.utils import timeline
23
25
  from sky.utils import ux_utils
@@ -107,7 +109,7 @@ def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
107
109
 
108
110
 
109
111
  def run_in_parallel(func: Callable,
110
- args: List[Any],
112
+ args: Union[List[Any], Set[Any]],
111
113
  num_threads: Optional[int] = None) -> List[Any]:
112
114
  """Run a function in parallel on a list of arguments.
113
115
 
@@ -128,7 +130,7 @@ def run_in_parallel(func: Callable,
128
130
  if len(args) == 0:
129
131
  return []
130
132
  if len(args) == 1:
131
- return [func(args[0])]
133
+ return [func(list(args)[0])]
132
134
 
133
135
  processes = (num_threads
134
136
  if num_threads is not None else get_parallel_threads())
@@ -208,8 +210,11 @@ def kill_children_processes(parent_pids: Optional[Union[
208
210
  kill_process_with_grace_period(child, force=force)
209
211
 
210
212
 
211
- def kill_process_with_grace_period(proc: Union[multiprocessing.Process,
212
- psutil.Process],
213
+ GenericProcess = Union[multiprocessing.Process, psutil.Process,
214
+ subprocess.Popen]
215
+
216
+
217
+ def kill_process_with_grace_period(proc: GenericProcess,
213
218
  force: bool = False,
214
219
  grace_period: int = 10) -> None:
215
220
  """Kill a process with SIGTERM and wait for it to exit.
@@ -223,6 +228,9 @@ def kill_process_with_grace_period(proc: Union[multiprocessing.Process,
223
228
  if isinstance(proc, psutil.Process):
224
229
  alive = proc.is_running
225
230
  wait = proc.wait
231
+ elif isinstance(proc, subprocess.Popen):
232
+ alive = lambda: proc.poll() is None
233
+ wait = proc.wait
226
234
  else:
227
235
  alive = proc.is_alive
228
236
  wait = proc.join
@@ -240,11 +248,10 @@ def kill_process_with_grace_period(proc: Union[multiprocessing.Process,
240
248
  # The child process may have already been terminated.
241
249
  return
242
250
  except psutil.TimeoutExpired:
243
- # Pass to finally to force kill the process.
244
- pass
245
- finally:
246
251
  logger.debug(f'Process {proc.pid} did not terminate after '
247
252
  f'{grace_period} seconds')
253
+ # Continue to finally to force kill the process.
254
+ finally:
248
255
  # Attempt to force kill if the normal termination fails
249
256
  if not force:
250
257
  logger.debug(f'Force killing process {proc.pid}')
@@ -300,11 +307,17 @@ def run_with_retries(
300
307
  return returncode, stdout, stderr
301
308
 
302
309
 
303
- def kill_process_daemon(process_pid: int) -> None:
310
+ def kill_process_daemon(process_pid: int, use_kill_pg: bool = False) -> None:
304
311
  """Start a daemon as a safety net to kill the process.
305
312
 
306
313
  Args:
307
314
  process_pid: The PID of the process to kill.
315
+ use_kill_pg: Whether to use kill process group to kill the process. If
316
+ True, the process will use os.killpg() to kill the target process
317
+ group on UNIX system, which is more efficient than using the daemon
318
+ to refresh the process tree in the daemon. Note that both
319
+ implementations have corner cases where subprocesses might not be
320
+ killed. Refer to subprocess_daemon.py for more details.
308
321
  """
309
322
  # Get initial children list
310
323
  try:
@@ -317,12 +330,8 @@ def kill_process_daemon(process_pid: int) -> None:
317
330
  daemon_script = os.path.join(
318
331
  os.path.dirname(os.path.abspath(log_lib.__file__)),
319
332
  'subprocess_daemon.py')
320
- python_path = subprocess.check_output(constants.SKY_GET_PYTHON_PATH_CMD,
321
- shell=True,
322
- stderr=subprocess.DEVNULL,
323
- encoding='utf-8').strip()
324
333
  daemon_cmd = [
325
- python_path,
334
+ sys.executable,
326
335
  daemon_script,
327
336
  '--parent-pid',
328
337
  str(parent_pid),
@@ -335,6 +344,10 @@ def kill_process_daemon(process_pid: int) -> None:
335
344
  ','.join(map(str, initial_children)),
336
345
  ]
337
346
 
347
+ env = os.environ.copy()
348
+ if use_kill_pg:
349
+ env[subprocess_daemon.USE_KILL_PG_ENV_VAR] = '1'
350
+
338
351
  # We do not need to set `start_new_session=True` here, as the
339
352
  # daemon script will detach itself from the parent process with
340
353
  # fork to avoid being killed by parent process. See the reason we
@@ -346,6 +359,7 @@ def kill_process_daemon(process_pid: int) -> None:
346
359
  stderr=subprocess.DEVNULL,
347
360
  # Disable input
348
361
  stdin=subprocess.DEVNULL,
362
+ env=env,
349
363
  )
350
364
 
351
365
 
sky/utils/tempstore.py ADDED
@@ -0,0 +1,70 @@
1
+ """Temporary storage context manager."""
2
+
3
+ import contextlib
4
+ import contextvars
5
+ import functools
6
+ import os
7
+ import tempfile
8
+ import typing
9
+ from typing import Any, Callable, Iterator, Optional, TypeVar
10
+
11
+ _TEMP_DIR: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar(
12
+ 'temp_store_dir', default=None)
13
+
14
+
15
@contextlib.contextmanager
def tempdir() -> Iterator[str]:
    """Provide a temporary directory scoped to the current context.

    A `tempfile.TemporaryDirectory` is created and its path is stored in the
    `_TEMP_DIR` context variable for the duration of the `with` body, so
    nested callees can look it up instead of receiving it as an argument.

    Calls may be nested: each call creates its own directory, and on exit the
    context variable is reset to the value it held before the call.

    Yields:
        The path of the newly created temporary directory.
    """
    scratch = tempfile.TemporaryDirectory(prefix='sky-tmp')
    with scratch as scratch_path:
        # Keep the token so the previous value (possibly an outer tempdir)
        # is restored even if the body raises.
        restore_token = _TEMP_DIR.set(scratch_path)
        try:
            yield scratch_path
        finally:
            _TEMP_DIR.reset(restore_token)
32
+
33
+
34
+ # Keep the function signature same as tempfile.mkdtemp.
35
+ # pylint: disable=redefined-builtin
36
def mkdtemp(suffix: Optional[str] = None,
            prefix: Optional[str] = None,
            dir: Optional[str] = None) -> str:
    """Create a temporary directory under the current context's temp dir.

    When a context temp dir is set (see `_TEMP_DIR`), the new directory is
    created inside it. Without one, the arguments are forwarded unchanged, so
    the call behaves exactly like `tempfile.mkdtemp`.

    Args:
        suffix: Forwarded to `tempfile.mkdtemp`.
        prefix: Forwarded to `tempfile.mkdtemp`.
        dir: Optional parent directory. With a context temp dir set, it is
            joined onto that dir and created if missing.

    Returns:
        The path of the created directory.
    """
    scoped_root = _TEMP_DIR.get()
    if scoped_root is not None:
        if dir is None:
            dir = scoped_root
        else:
            # NOTE(review): os.path.join returns `dir` unchanged when it is
            # absolute, so such a dir would not live under the context dir.
            dir = os.path.join(scoped_root, dir)
            os.makedirs(dir, exist_ok=True)
    return tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=dir)
54
+
55
+
56
+ F = TypeVar('F', bound=Callable[..., Any])
57
+
58
+
59
def with_tempdir(func: F) -> F:
    """Decorate `func` so every call runs inside a `tempdir()` context.

    Refer to `tempdir` for details on the temporary directory.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the metadata of `func` (via `functools.wraps`).
    """

    @functools.wraps(func)
    def _scoped_call(*args, **kwargs):
        with tempdir():
            outcome = func(*args, **kwargs)
        return outcome

    return typing.cast(F, _scoped_call)
@@ -0,0 +1,91 @@
1
+ """Utility functions for threads."""
2
+
3
+ import threading
4
+ from typing import Any, Dict, Generic, Optional, overload, TypeVar
5
+
6
+ from sky.utils import common_utils
7
+
8
+
9
class SafeThread(threading.Thread):
    """A thread that records an exception raised while it runs.

    Anything raised by `threading.Thread.run` is caught and stored on the
    instance rather than propagating out of the thread.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Exception caught by run(); stays None if run() did not raise.
        self._exc = None

    def run(self):
        try:
            super().run()
        except BaseException as caught:  # pylint: disable=broad-except
            self._exc = caught

    @property
    def format_exc(self) -> Optional[str]:
        """The caught exception formatted as a string, or None if none."""
        captured = self._exc
        return (None if captured is None else
                common_utils.format_exception(captured))
27
+
28
+
29
# pylint: disable=invalid-name
KeyType = TypeVar('KeyType')
ValueType = TypeVar('ValueType')


# Google style guide: Do not rely on the atomicity of built-in types.
# Our launch and down process pool will be used by multiple threads,
# therefore we need to use a thread-safe dict.
# see https://google.github.io/styleguide/pyguide.html#218-threading
class ThreadSafeDict(Generic[KeyType, ValueType]):
    """A dict wrapper whose individual operations run under a lock.

    Each method acquires the lock for the duration of that single call only;
    a sequence of calls (e.g. check-then-set) is not atomic as a whole.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Build the underlying dict from the same arguments as `dict`."""
        self._dict: Dict[KeyType, ValueType] = dict(*args, **kwargs)
        self._lock = threading.Lock()

    def __getitem__(self, key: KeyType) -> ValueType:
        with self._lock:
            return self._dict[key]

    def __setitem__(self, key: KeyType, value: ValueType) -> None:
        with self._lock:
            self._dict[key] = value

    def __delitem__(self, key: KeyType) -> None:
        with self._lock:
            del self._dict[key]

    def __len__(self) -> int:
        with self._lock:
            return len(self._dict)

    def __contains__(self, key: KeyType) -> bool:
        with self._lock:
            return key in self._dict

    def items(self) -> list:
        """Return a snapshot list of `(key, value)` pairs.

        A copy is returned instead of a live dict view: a view would be
        iterated after the lock is released, and a concurrent (or in-loop)
        insert/delete would then raise `RuntimeError`.
        """
        with self._lock:
            return list(self._dict.items())

    def values(self) -> list:
        """Return a snapshot list of the values (see `items` for why)."""
        with self._lock:
            return list(self._dict.values())

    @overload
    def get(self, key: KeyType, default: ValueType) -> ValueType:
        ...

    @overload
    def get(self,
            key: KeyType,
            default: Optional[ValueType] = None) -> Optional[ValueType]:
        ...

    def get(self,
            key: KeyType,
            default: Optional[ValueType] = None) -> Optional[ValueType]:
        """Return the value for `key`, or `default` if the key is absent."""
        with self._lock:
            return self._dict.get(key, default)

    def pop(self, key: KeyType) -> Optional[ValueType]:
        """Remove `key` and return its value, or None if the key is absent."""
        with self._lock:
            return self._dict.pop(key, None)
sky/utils/timeline.py CHANGED
@@ -4,7 +4,6 @@ The timeline follows the trace event format defined here:
4
4
  https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview
5
5
  """ # pylint: disable=line-too-long
6
6
  import atexit
7
- import functools
8
7
  import json
9
8
  import os
10
9
  import threading
@@ -12,13 +11,15 @@ import time
12
11
  import traceback
13
12
  from typing import Callable, Optional, Union
14
13
 
15
- import filelock
16
-
17
14
  from sky.utils import common_utils
18
15
 
19
16
  _events = []
20
17
 
21
18
 
19
def _get_events_file_path():
    """Return the SKYPILOT_TIMELINE_FILE_PATH env var, or None if unset."""
    return os.getenv('SKYPILOT_TIMELINE_FILE_PATH')
21
+
22
+
22
23
  class Event:
23
24
  """Record an event.
24
25
 
@@ -28,6 +29,10 @@ class Event:
28
29
  """
29
30
 
30
31
  def __init__(self, name: str, message: Optional[str] = None):
32
+ self._skipped = False
33
+ if not _get_events_file_path():
34
+ self._skipped = True
35
+ return
31
36
  self._name = name
32
37
  self._message = message
33
38
  # See the module doc for the event format.
@@ -44,6 +49,8 @@ class Event:
44
49
  self._event['args'] = {'message': self._message}
45
50
 
46
51
  def begin(self):
52
+ if self._skipped:
53
+ return
47
54
  event_begin = self._event.copy()
48
55
  event_begin.update({
49
56
  'ph': 'B',
@@ -51,10 +58,13 @@ class Event:
51
58
  })
52
59
  event_begin['args'] = {'stack': '\n'.join(traceback.format_stack())}
53
60
  if self._message is not None:
54
- event_begin['args']['message'] = self._message
61
+ event_begin['args'][
62
+ 'message'] = self._message # type: ignore[index]
55
63
  _events.append(event_begin)
56
64
 
57
65
  def end(self):
66
+ if self._skipped:
67
+ return
58
68
  event_end = self._event.copy()
59
69
  event_end.update({
60
70
  'ph': 'E',
@@ -76,63 +86,26 @@ def event(name_or_fn: Union[str, Callable], message: Optional[str] = None):
76
86
  return common_utils.make_decorator(Event, name_or_fn, message=message)
77
87
 
78
88
 
79
- class FileLockEvent:
80
- """Serve both as a file lock and event for the lock."""
81
-
82
- def __init__(self, lockfile: Union[str, os.PathLike], timeout: float = -1):
83
- self._lockfile = lockfile
84
- os.makedirs(os.path.dirname(os.path.abspath(self._lockfile)),
85
- exist_ok=True)
86
- self._lock = filelock.FileLock(self._lockfile, timeout)
87
- self._hold_lock_event = Event(f'[FileLock.hold]:{self._lockfile}')
88
-
89
- def acquire(self):
90
- was_locked = self._lock.is_locked
91
- with Event(f'[FileLock.acquire]:{self._lockfile}'):
92
- self._lock.acquire()
93
- if not was_locked and self._lock.is_locked:
94
- # start holding the lock after initial acquiring
95
- self._hold_lock_event.begin()
96
-
97
- def release(self):
98
- was_locked = self._lock.is_locked
99
- self._lock.release()
100
- if was_locked and not self._lock.is_locked:
101
- # stop holding the lock after initial releasing
102
- self._hold_lock_event.end()
103
-
104
- def __enter__(self):
105
- self.acquire()
106
- return self
107
-
108
- def __exit__(self, exc_type, exc_val, exc_tb):
109
- self.release()
110
-
111
- def __call__(self, f):
112
- # Make this class callable as a decorator.
113
- @functools.wraps(f)
114
- def wrapper(*args, **kwargs):
115
- with self:
116
- return f(*args, **kwargs)
117
-
118
- return wrapper
119
-
120
-
121
89
  def save_timeline():
122
- file_path = os.environ.get('SKYPILOT_TIMELINE_FILE_PATH')
123
- if not file_path:
90
+ events_file_path = _get_events_file_path()
91
+ if not events_file_path:
124
92
  return
93
+ global _events
94
+ events_to_write = _events
95
+ _events = []
125
96
  json_output = {
126
- 'traceEvents': _events,
97
+ 'traceEvents': events_to_write,
127
98
  'displayTimeUnit': 'ms',
128
99
  'otherData': {
129
- 'log_dir': os.path.dirname(os.path.abspath(file_path)),
100
+ 'log_dir': os.path.dirname(os.path.abspath(events_file_path)),
130
101
  }
131
102
  }
132
- os.makedirs(os.path.dirname(os.path.abspath(file_path)), exist_ok=True)
133
- with open(file_path, 'w', encoding='utf-8') as f:
103
+ os.makedirs(os.path.dirname(os.path.abspath(events_file_path)),
104
+ exist_ok=True)
105
+ with open(events_file_path, 'w', encoding='utf-8') as f:
134
106
  json.dump(json_output, f)
107
+ del events_to_write
135
108
 
136
109
 
137
- if os.environ.get('SKYPILOT_TIMELINE_FILE_PATH'):
110
+ if _get_events_file_path():
138
111
  atexit.register(save_timeline)
sky/utils/ux_utils.py CHANGED
@@ -1,17 +1,19 @@
1
1
  """Utility functions for UX."""
2
2
  import contextlib
3
3
  import enum
4
+ import fnmatch
4
5
  import os
5
6
  import sys
6
7
  import traceback
7
8
  import typing
8
- from typing import Callable, Optional, Union
9
+ from typing import Callable, Iterable, List, Optional, Union
9
10
 
10
11
  import colorama
11
12
 
12
13
  from sky import sky_logging
13
14
  from sky.skylet import constants
14
15
  from sky.utils import common_utils
16
+ from sky.utils import env_options
15
17
  from sky.utils import rich_console_utils
16
18
 
17
19
  if typing.TYPE_CHECKING:
@@ -25,9 +27,16 @@ BOLD = '\033[1m'
25
27
  RESET_BOLD = '\033[0m'
26
28
 
27
29
  # Log path hint in the spinner during launching
30
+ # (old, kept for backward compatibility)
28
31
  _LOG_PATH_HINT = (f'{colorama.Style.DIM}View logs: sky api logs -l '
29
32
  '{log_path}'
30
33
  f'{colorama.Style.RESET_ALL}')
34
+ # Log hint: recommend sky logs --provision <cluster_name>
35
+ _PROVISION_LOG_HINT = (
36
+ f'{colorama.Style.DIM}View logs: '
37
+ f'{BOLD}sky logs --provision {{cluster_name}}{RESET_BOLD}'
38
+ f'{colorama.Style.RESET_ALL}')
39
+ # Legacy path hint retained for local-only cases where we don't have a cluster name
31
40
  _LOG_PATH_HINT_LOCAL = (f'{colorama.Style.DIM}View logs: '
32
41
  '{log_path}'
33
42
  f'{colorama.Style.RESET_ALL}')
@@ -57,10 +66,14 @@ def print_exception_no_traceback():
57
66
  if error():
58
67
  raise ValueError('...')
59
68
  """
60
- original_tracelimit = getattr(sys, 'tracebacklimit', 1000)
61
- sys.tracebacklimit = 0
62
- yield
63
- sys.tracebacklimit = original_tracelimit
69
+ if env_options.Options.SHOW_DEBUG_INFO.get():
70
+ # When SKYPILOT_DEBUG is set, show the full traceback
71
+ yield
72
+ else:
73
+ original_tracelimit = getattr(sys, 'tracebacklimit', 1000)
74
+ sys.tracebacklimit = 0
75
+ yield
76
+ sys.tracebacklimit = original_tracelimit
64
77
 
65
78
 
66
79
  @contextlib.contextmanager
@@ -121,7 +134,10 @@ class RedirectOutputForProcess:
121
134
 
122
135
  def log_path_hint(log_path: Union[str, 'pathlib.Path'],
123
136
  is_local: bool = False) -> str:
124
- """Gets the log path hint for the given log path."""
137
+ """Gets the log path hint for the given log path.
138
+
139
+ Kept for backward compatibility when only paths are available.
140
+ """
125
141
  log_path = str(log_path)
126
142
  expanded_home = os.path.expanduser('~')
127
143
  if log_path.startswith(expanded_home):
@@ -134,6 +150,12 @@ def log_path_hint(log_path: Union[str, 'pathlib.Path'],
134
150
  return _LOG_PATH_HINT.format(log_path=log_path)
135
151
 
136
152
 
153
def provision_hint(cluster_name: Optional[str]) -> Optional[str]:
    """Gets the provision-log hint for the given cluster name.

    Returns None when `cluster_name` is None or empty; otherwise returns
    `_PROVISION_LOG_HINT` formatted with the cluster name.
    """
    if cluster_name:
        return _PROVISION_LOG_HINT.format(cluster_name=cluster_name)
    return None
157
+
158
+
137
159
  def starting_message(message: str) -> str:
138
160
  """Gets the starting message for the given message."""
139
161
  # We have to reset the color before the message, because sometimes if a
@@ -145,7 +167,8 @@ def starting_message(message: str) -> str:
145
167
  def finishing_message(message: str,
146
168
  log_path: Optional[Union[str, 'pathlib.Path']] = None,
147
169
  is_local: bool = False,
148
- follow_up_message: Optional[str] = None) -> str:
170
+ follow_up_message: Optional[str] = None,
171
+ cluster_name: Optional[str] = None) -> str:
149
172
  """Gets the finishing message for the given message.
150
173
 
151
174
  Args:
@@ -161,7 +184,11 @@ def finishing_message(message: str,
161
184
  follow_up_message = follow_up_message if (follow_up_message
162
185
  is not None) else ''
163
186
  success_prefix = (f'{colorama.Style.RESET_ALL}{colorama.Fore.GREEN}✓ '
164
- f'{message}{colorama.Style.RESET_ALL}{follow_up_message}')
187
+ f'{message}{colorama.Style.RESET_ALL}{follow_up_message}'
188
+ f'{colorama.Style.RESET_ALL}')
189
+ hint = provision_hint(cluster_name)
190
+ if hint:
191
+ return f'{success_prefix} {hint}'
165
192
  if log_path is None:
166
193
  return success_prefix
167
194
  path_hint = log_path_hint(log_path, is_local)
@@ -170,13 +197,17 @@ def finishing_message(message: str,
170
197
 
171
198
  def error_message(message: str,
172
199
  log_path: Optional[Union[str, 'pathlib.Path']] = None,
173
- is_local: bool = False) -> str:
200
+ is_local: bool = False,
201
+ cluster_name: Optional[str] = None) -> str:
174
202
  """Gets the error message for the given message."""
175
203
  # We have to reset the color before the message, because sometimes if a
176
204
  # previous spinner with dimmed color overflows in a narrow terminal, the
177
205
  # color might be messed up.
178
206
  error_prefix = (f'{colorama.Style.RESET_ALL}{colorama.Fore.RED}⨯'
179
207
  f'{colorama.Style.RESET_ALL} {message}')
208
+ hint = provision_hint(cluster_name)
209
+ if hint:
210
+ return f'{error_prefix} {hint}'
180
211
  if log_path is None:
181
212
  return error_prefix
182
213
  path_hint = log_path_hint(log_path, is_local)
@@ -194,9 +225,16 @@ def retry_message(message: str) -> str:
194
225
 
195
226
  def spinner_message(message: str,
196
227
  log_path: Optional[Union[str, 'pathlib.Path']] = None,
197
- is_local: bool = False) -> str:
198
- """Gets the spinner message for the given message and log path."""
228
+ is_local: bool = False,
229
+ cluster_name: Optional[str] = None) -> str:
230
+ """Gets the spinner message for the given message and log path.
231
+
232
+ If cluster_name is provided, recommend `sky logs --provision <cluster>`.
233
+ """
199
234
  colored_spinner = f'[bold cyan]{message}[/]'
235
+ hint = provision_hint(cluster_name)
236
+ if hint:
237
+ return f'{colored_spinner} {hint}'
200
238
  if log_path is None:
201
239
  return colored_spinner
202
240
  path_hint = log_path_hint(log_path, is_local)
@@ -247,9 +285,40 @@ def command_hint_messages(hint_type: CommandHintType,
247
285
  f'{BOLD}sky jobs logs {job_id}{RESET_BOLD}'
248
286
  f'\n{INDENT_SYMBOL}To stream controller logs:\t\t'
249
287
  f'{BOLD}sky jobs logs --controller {job_id}{RESET_BOLD}'
250
- f'\n{INDENT_SYMBOL}To view all managed jobs:\t\t'
251
- f'{BOLD}sky jobs queue{RESET_BOLD}'
252
- f'\n{INDENT_LAST_SYMBOL}To view managed job dashboard:\t\t'
253
- f'{BOLD}sky jobs dashboard{RESET_BOLD}')
288
+ f'\n{INDENT_LAST_SYMBOL}To view all managed jobs:\t\t'
289
+ f'{BOLD}sky jobs queue{RESET_BOLD}')
254
290
  else:
255
291
  raise ValueError(f'Invalid hint type: {hint_type}')
292
+
293
+
294
def is_glob_pattern(pattern: str) -> bool:
    """Checks if a string contains common glob pattern wildcards.

    The wildcard characters recognized are `*`, `?`, `[` and `]`. A globstar
    (`**`) needs no separate check because it contains `*`.
    """
    glob_chars = frozenset('*?[]')
    return any(char in glob_chars for char in pattern)


def get_non_matched_query(query_clusters: Iterable[str],
                          cluster_names: Iterable[str]) -> List[str]:
    """Gets the query clusters that match none of the cluster names.

    A query containing glob wildcards (see `is_glob_pattern`) is matched
    against the names with `fnmatch`; any other query must equal a name
    exactly.

    Args:
        query_clusters: The requested cluster names or glob patterns.
        cluster_names: The names to match against. Any iterable is accepted,
            including a one-shot iterator or generator.

    Returns:
        The unmatched queries: exact-name queries first, then glob queries,
        each group in the order it appeared in `query_clusters`.
    """
    # Materialize once: the names are scanned once per query below, and a
    # one-shot iterator would be exhausted by the first scan, making every
    # later query look unmatched.
    names = list(cluster_names)
    name_set = set(names)

    exact_misses: List[str] = []
    glob_misses: List[str] = []
    for query in query_clusters:
        if is_glob_pattern(query):
            if not fnmatch.filter(names, query):
                glob_misses.append(query)
        elif query not in name_set:
            exact_misses.append(query)
    return exact_misses + glob_misses
sky/utils/validator.py CHANGED
@@ -14,9 +14,19 @@ def case_insensitive_enum(validator, enums, instance, schema):
14
14
  f'{instance!r} is not one of {enums!r}')
15
15
 
16
16
 
17
def case_sensitive_enum(validator, enums, instance, schema):
    """Yield a validation error unless `instance` is one of `enums`.

    Membership is tested with `in`, so the match is exact (case-sensitive
    for strings).
    """
    del validator, schema  # Unused.
    if instance in enums:
        return
    yield jsonschema.ValidationError(f'{instance!r} is not one of {enums!r}')
22
+
23
+
17
24
  # Move this to a function to delay initialization
18
25
  def get_schema_validator():
19
26
  """Get the schema validator class, initializing it only when needed."""
20
27
  return jsonschema.validators.extend(
21
28
  jsonschema.Draft7Validator,
22
- validators={'case_insensitive_enum': case_insensitive_enum})
29
+ validators={
30
+ 'case_insensitive_enum': case_insensitive_enum,
31
+ 'case_sensitive_enum': case_sensitive_enum
32
+ })