skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/metrics/utils.py ADDED
@@ -0,0 +1,453 @@
1
+ """Utilities for processing GPU metrics from Kubernetes clusters."""
2
+ import contextlib
3
+ import functools
4
+ import os
5
+ import re
6
+ import select
7
+ import subprocess
8
+ import time
9
+ from typing import List, Optional, Tuple
10
+
11
+ import httpx
12
+ import prometheus_client as prom
13
+
14
+ from sky import sky_logging
15
+ from sky.skylet import constants
16
+ from sky.utils import common_utils
17
+ from sky.utils import context_utils
18
+
19
# Seconds to wait in each poll() call on the kubectl port-forward output, and
# the maximum number of bytes read from that output per iteration.
_SELECT_TIMEOUT = 1
_SELECT_BUFFER_SIZE = 4096

_KB = 2**10
_MB = 2**20
# Bucket upper bounds, in bytes, for the memory histograms below.
_MEM_BUCKETS = [
    _KB,
    256 * _KB,
    512 * _KB,
    _MB,
    2 * _MB,
    4 * _MB,
    8 * _MB,
    16 * _MB,
    32 * _MB,
    64 * _MB,
    128 * _MB,
    256 * _MB,
    float('inf'),
]

# Bucket upper bounds, in seconds, shared by every duration/latency histogram
# below. Defined once so the histograms cannot drift apart from each other.
_DURATION_BUCKETS = (
    0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25, 0.35, 0.5,
    0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5, 5, 7.5, 10.0, 12.5,
    15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0, 55.0, 60.0, 80.0,
    # 120s through 1000s in 20-second steps.
    *(float(seconds) for seconds in range(120, 1001, 20)),
    float('inf'),
)

logger = sky_logging.init_logger(__name__)

# Whether the metrics are enabled, cannot be changed at runtime.
METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
                                 'false').lower() == 'true'

# Time spent processing a piece of code, refer to time_it().
SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
    'sky_apiserver_code_duration_seconds',
    'Time spent processing code',
    ['name', 'group'],
    buckets=_DURATION_BUCKETS,
)

# Total number of API server requests, grouped by path, method, and status.
SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
    'sky_apiserver_requests_total',
    'Total number of API server requests',
    ['path', 'method', 'status'],
)

# Time spent processing API server requests, grouped by path, method, and
# status.
SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
    'sky_apiserver_request_duration_seconds',
    'Time spent processing API server requests',
    ['path', 'method', 'status'],
    buckets=_DURATION_BUCKETS,
)

# Scheduling delay of the server event loop, labelled by process id.
SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
    'sky_apiserver_event_loop_lag_seconds',
    'Scheduling delay of the server event loop',
    ['pid'],
    buckets=_DURATION_BUCKETS,
)

# Number of websocket connections, labelled by process id.
SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
    'sky_apiserver_websocket_connections',
    'Number of websocket connections',
    ['pid'],
    multiprocess_mode='livesum',
)

# Number of closed websockets, labelled by process id and close reason.
SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
    'sky_apiserver_websocket_closed_total',
    'Number of websocket closed',
    ['pid', 'reason'],
)

# The number of execution starts in each worker process, we do not record
# histogram here as the duration has been measured in
# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
# Recording histogram WITH worker label will cause high cardinality.
SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
    'sky_apiserver_process_execution_start_total',
    'Total number of execution starts in each worker process',
    ['request', 'pid'],
)

SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
    'sky_apiserver_process_peak_rss',
    'Peak RSS we saw in each process in last 30 seconds',
    ['pid', 'type'],
)

SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
    'sky_apiserver_process_cpu_total',
    'Total CPU times a worker process has been running',
    ['pid', 'type', 'mode'],
)

SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
    'sky_apiserver_request_memory_usage_bytes',
    'Peak memory usage of requests', ['name'],
    buckets=_MEM_BUCKETS)

SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
    'sky_apiserver_request_rss_incr_bytes',
    'RSS increment after requests', ['name'],
    buckets=_MEM_BUCKETS)

SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS = prom.Histogram(
    'sky_apiserver_websocket_ssh_latency_seconds',
    # Adjacent literals are concatenated, so every fragment but the last must
    # end with a space to keep the words of the help text separated.
    ('Time taken for ssh message to go from client to API server and back '
     'to the client. This does not include: latency to reach the pod, '
     'overhead from sending through the k8s port-forward tunnel, or '
     'ssh server lag on the destination pod.'),
    ['pid'],
    buckets=_DURATION_BUCKETS,
)

SKY_APISERVER_LONG_EXECUTORS = prom.Gauge(
    'sky_apiserver_long_executors',
    'Total number of long-running request executors in the API server',
)

SKY_APISERVER_SHORT_EXECUTORS = prom.Gauge(
    'sky_apiserver_short_executors',
    'Total number of short-running request executors in the API server',
)
173
+
174
+
175
@contextlib.contextmanager
def time_it(name: str, group: str = 'default'):
    """Context manager to measure and record code execution duration.

    When metrics are disabled the wrapped code runs untouched. Otherwise the
    elapsed time of the ``with`` body is observed on
    SKY_APISERVER_CODE_DURATION_SECONDS, even if the body raises.

    Args:
        name: Value of the ``name`` label for the observation.
        group: Value of the ``group`` label for the observation.
    """
    if not METRICS_ENABLED:
        yield
        return

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or turned negative) by wall-clock adjustments during the body.
    start_time = time.perf_counter()
    try:
        yield
    finally:
        duration = time.perf_counter() - start_time
        SKY_APISERVER_CODE_DURATION_SECONDS.labels(
            name=name, group=group).observe(duration)
188
+
189
+
190
def time_me(func):
    """Decorator that records how long each call of ``func`` takes.

    The observation is made through time_it() under the ``function`` group,
    named ``<module>/<function name>``. With metrics disabled the call is
    passed straight through.
    """

    @functools.wraps(func)
    def _timed_call(*args, **kwargs):
        if METRICS_ENABLED:
            label = f'{func.__module__}/{func.__name__}'
            with time_it(label, group='function'):
                return func(*args, **kwargs)
        return func(*args, **kwargs)

    return _timed_call
202
+
203
+
204
def time_me_async(func):
    """Decorator that records how long each await of async ``func`` takes.

    The observation is made through time_it() under the ``function`` group,
    named ``<module>/<function name>``. With metrics disabled the coroutine
    is awaited directly.
    """

    @functools.wraps(func)
    async def _timed_call(*args, **kwargs):
        if METRICS_ENABLED:
            label = f'{func.__module__}/{func.__name__}'
            with time_it(label, group='function'):
                return await func(*args, **kwargs)
        return await func(*args, **kwargs)

    return _timed_call
216
+
217
+
218
def start_svc_port_forward(context: str, namespace: str, service: str,
                           service_port: int) -> Tuple[subprocess.Popen, int]:
    """Starts a port forward to a service in a Kubernetes cluster.

    Launches ``kubectl port-forward`` as a child process and waits for it to
    print the ``Forwarding from 127.0.0.1:<port>`` line, from which the
    locally assigned port is parsed.

    Args:
        context: Kubernetes context name
        namespace: Namespace where the service is located
        service: Service name to port forward to
        service_port: Port on the service to forward to

    Returns:
        Tuple of (subprocess.Popen process, local_port assigned). The process
        is still running; the caller is expected to stop it (see
        stop_svc_port_forward).

    Raises:
        RuntimeError: If the kubectl process exits with a non-zero code, or
            if no local port could be parsed from its output before the
            process ended or the start timeout elapsed.
    """
    start_port_forward_timeout = 10  # 10 second timeout
    terminate_port_forward_timeout = 5  # 5 second timeout

    # Use ':service_port' to let kubectl choose the local port
    cmd = [
        'kubectl', '--context', context, '-n', namespace, 'port-forward',
        f'service/{service}', f':{service_port}'
    ]

    # Run kubectl with the current environment, defaulting KUBECONFIG to
    # ~/.kube/config when the variable is not set.
    env = os.environ.copy()
    if 'KUBECONFIG' not in env:
        env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')

    port_forward_process = None
    port_forward_exit = False
    local_port = None
    poller = None
    fd = None

    try:
        # start the port forward process
        # stderr is merged into stdout so a single pipe carries all output.
        port_forward_process = subprocess.Popen(cmd,
                                                stdout=subprocess.PIPE,
                                                stderr=subprocess.STDOUT,
                                                text=True,
                                                env=env)

        # Use poll() instead of select() to avoid FD_SETSIZE limit
        poller = select.poll()
        assert port_forward_process.stdout is not None
        fd = port_forward_process.stdout.fileno()
        poller.register(fd, select.POLLIN)

        start_time = time.time()
        # Accumulates everything read so far, so the pattern is still found
        # when the line arrives split across several reads.
        buffer = ''
        # wait for the port forward to start and extract the local port
        while time.time() - start_time < start_port_forward_timeout:
            if port_forward_process.poll() is not None:
                # port forward process has terminated
                # A zero exit code is not flagged here; it falls through to
                # the "failed to extract local port" error below.
                if port_forward_process.returncode != 0:
                    port_forward_exit = True
                break

            # Wait up to 1000ms for data to be available without blocking
            # poll() takes timeout in milliseconds
            events = poller.poll(_SELECT_TIMEOUT * 1000)

            if events:
                # Read available bytes from the FD without blocking
                # The read goes through the raw descriptor rather than the
                # text-mode stdout object, hence the manual decode.
                raw = os.read(fd, _SELECT_BUFFER_SIZE)
                chunk = raw.decode(errors='ignore')
                buffer += chunk
                match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
                if match:
                    local_port = int(match.group(1))
                    break

            # sleep for 100ms to avoid busy-waiting
            time.sleep(0.1)
    except BaseException:  # pylint: disable=broad-exception-caught
        # Do not leave a kubectl child behind on any failure, including
        # interrupts; the original exception is re-raised afterwards.
        if port_forward_process:
            stop_svc_port_forward(port_forward_process,
                                  timeout=terminate_port_forward_timeout)
        raise
    finally:
        if poller is not None and fd is not None:
            try:
                poller.unregister(fd)
            except (OSError, ValueError):
                # FD may already be unregistered or invalid
                pass
    if port_forward_exit:
        raise RuntimeError(f'Port forward failed for service {service} in '
                           f'namespace {namespace} on context {context}')
    if local_port is None:
        try:
            if port_forward_process:
                stop_svc_port_forward(port_forward_process,
                                      timeout=terminate_port_forward_timeout)
        finally:
            # Raised from ``finally`` so callers always see this RuntimeError,
            # even when stopping the process itself raised.
            raise RuntimeError(
                f'Failed to extract local port for service {service} in '
                f'namespace {namespace} on context {context}')

    return port_forward_process, local_port
316
+
317
+
318
def stop_svc_port_forward(port_forward_process: subprocess.Popen,
                          timeout: int = 5) -> None:
    """Stops a port forward to a service in a Kubernetes cluster.

    The process is asked to terminate first; if it has not exited within
    ``timeout`` seconds it is killed and then reaped.

    Args:
        port_forward_process: The subprocess.Popen process to terminate
        timeout: Seconds to wait after terminate() before resorting to kill()
    """
    proc = port_forward_process
    proc.terminate()
    try:
        proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        # Graceful shutdown took too long: force it and collect the exit code.
        proc.kill()
        proc.wait()
330
+
331
+
332
async def send_metrics_request_with_port_forward(
        context: str,
        namespace: str,
        service: str,
        service_port: int,
        endpoint_path: str = '/federate',
        match_patterns: Optional[List[str]] = None,
        timeout: float = 30.0) -> str:
    """Fetches metrics text from a service reached through a port forward.

    A port forward to the service is opened in a worker thread, a GET request
    is sent to ``endpoint_path`` on the forwarded local port, and the port
    forward is stopped again whether or not the request succeeded.

    Args:
        context: Kubernetes context name
        namespace: Namespace where the service is located
        service: Service name to port forward to
        service_port: Port on the service to forward to
        endpoint_path: Path to append to the localhost endpoint (e.g.,
            '/federate')
        match_patterns: Metric patterns sent as repeated ``match[]`` query
            parameters; no query parameters are sent when empty or None
        timeout: Request timeout in seconds

    Returns:
        Response text containing the metrics

    Raises:
        Exception: Any error from starting the port forward or from the HTTP
            request (including a non-success status) is logged and re-raised.
    """
    forwarder = None
    try:
        # Start port forward
        forwarder, local_port = await context_utils.to_thread(
            start_svc_port_forward, context, namespace, service, service_port)

        url = f'http://localhost:{local_port}{endpoint_path}'

        # Only pass query parameters when there are patterns to match.
        request_kwargs = {}
        if match_patterns:
            request_kwargs['params'] = [
                ('match[]', pattern) for pattern in match_patterns
            ]

        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(url, **request_kwargs)
            response.raise_for_status()
            return response.text

    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.error(f'Failed to send metrics request with port forward: '
                     f'{common_utils.format_exception(e)}')
        raise
    finally:
        # Always clean up port forward
        if forwarder is not None:
            await context_utils.to_thread(stop_svc_port_forward, forwarder)
386
+
387
+
388
async def add_cluster_name_label(metrics_text: str, context: str) -> str:
    """Adds a ``cluster`` label to each sample line of a metrics payload.

    Comment lines (starting with ``#``) and blank lines are kept as-is.
    Samples that already carry a label set get the new label prepended to it;
    samples without a label set get one created.

    Args:
        metrics_text: The text containing the metrics
        context: The cluster name, used as the value of the ``cluster`` label

    Returns:
        The metrics text with the label added, lines joined by newlines.
    """
    # Escape the characters that are special inside a quoted label value
    # (backslash, double quote, line feed) so that an unusual context name
    # cannot produce a malformed line.
    escaped_context = (context.replace('\\', '\\\\').replace('"', '\\"').replace(
        '\n', '\\n'))
    cluster_label = f'cluster="{escaped_context}"'

    modified_lines = []
    for line in metrics_text.strip().split('\n'):
        stripped = line.strip()
        # keep comment lines and empty lines as-is
        if not stripped or stripped.startswith('#'):
            modified_lines.append(line)
            continue

        brace_start = line.find('{')
        brace_end = line.find('}')
        if brace_start != -1 and brace_end != -1:
            # metric line with a label set: put the cluster label first
            metric_name = line[:brace_start]
            existing_labels = line[brace_start + 1:brace_end]
            rest_of_line = line[brace_end + 1:]

            if existing_labels:
                new_labels = f'{cluster_label},{existing_labels}'
            else:
                new_labels = cluster_label

            modified_lines.append(
                f'{metric_name}{{{new_labels}}}{rest_of_line}')
            continue

        if brace_start == -1 and brace_end == -1:
            # metric line without a label set: "<name> <value> [timestamp]"
            parts = line.split(None, 1)
            if len(parts) == 2:
                metric_name, rest_of_line = parts
                modified_lines.append(
                    f'{metric_name}{{{cluster_label}}} {rest_of_line}')
                continue

        # keep anything unrecognized as-is
        modified_lines.append(line)

    return '\n'.join(modified_lines)
422
+
423
+
424
async def get_metrics_for_context(context: str) -> str:
    """Get GPU metrics for a single Kubernetes context.

    The metrics are requested from the '/federate' endpoint of the
    'skypilot-prometheus-server' service in the 'skypilot' namespace, and the
    context is then added to every metric line as the cluster label.

    Args:
        context: Kubernetes context name

    Returns:
        metrics_text: String containing the metrics

    Raises:
        Exception: If metrics collection fails for any reason
    """
    # Query both DCGM metrics and kube_pod_labels metrics
    # This ensures the dashboard can perform joins to filter by skypilot cluster
    node_and_gpu_selector = (
        '{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}'  # pylint: disable=line-too-long
    )
    selectors = [
        node_and_gpu_selector,
        'kube_pod_labels',
        'node_cpu_seconds_total{mode="idle"}',
    ]

    # TODO(rohan): don't hardcode the namespace and service name
    raw_metrics = await send_metrics_request_with_port_forward(
        context=context,
        namespace='skypilot',
        service='skypilot-prometheus-server',
        service_port=80,
        endpoint_path='/federate',
        match_patterns=selectors)

    # add cluster name as a label to each metric line
    return await add_cluster_name_label(raw_metrics, context)
sky/models.py CHANGED
@@ -2,19 +2,57 @@
2
2
 
3
3
  import collections
4
4
  import dataclasses
5
- from typing import Any, Dict, Optional
5
+ import getpass
6
+ import os
7
+ from typing import Any, ClassVar, Dict, Optional
8
+
9
+ import pydantic
10
+
11
+ from sky.skylet import constants
12
+ from sky.utils import common_utils
6
13
 
7
14
 
8
15
@dataclasses.dataclass
class User:
    """Dataclass to store user information."""
    # User hash; stored stripped and lower-cased by __init__.
    id: str
    # Display name of the user
    name: Optional[str] = None
    password: Optional[str] = None
    created_at: Optional[int] = None

    def __init__(
            self,
            id: str,  # pylint: disable=redefined-builtin
            name: Optional[str] = None,
            password: Optional[str] = None,
            created_at: Optional[int] = None):
        # Normalize the id once here so every later use sees the same form.
        normalized_id = id.strip().lower()
        self.id = normalized_id
        self.name = name
        self.password = password
        self.created_at = created_at

    def to_dict(self) -> Dict[str, Any]:
        """Returns the user's id and name as a dictionary."""
        return dict(id=self.id, name=self.name)

    def to_env_vars(self) -> Dict[str, Any]:
        """Returns the user's id and name keyed by their env var names."""
        env_vars: Dict[str, Any] = {}
        env_vars[constants.USER_ID_ENV_VAR] = self.id
        env_vars[constants.USER_ENV_VAR] = self.name
        return env_vars

    @classmethod
    def get_current_user(cls) -> 'User':
        """Returns the current user."""
        name_env_var = constants.USER_ENV_VAR
        # The OS login name is looked up first and used as the fallback when
        # the env var is not set.
        login_name = getpass.getuser()
        return User(id=common_utils.get_user_hash(),
                    name=os.getenv(name_env_var, login_name))

    def is_service_account(self) -> bool:
        """Check if the user is a service account."""
        lowered_id = self.id.lower()
        return lowered_id.startswith('sa-')
55
+
18
56
 
19
57
  RealtimeGpuAvailability = collections.namedtuple(
20
58
  'RealtimeGpuAvailability', ['gpu', 'counts', 'capacity', 'available'])
@@ -28,6 +66,8 @@ class KubernetesNodeInfo:
28
66
  # Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
29
67
  total: Dict[str, int]
30
68
  free: Dict[str, int]
69
+ # IP address of the node (external IP preferred, fallback to internal IP)
70
+ ip_address: Optional[str] = None
31
71
 
32
72
 
33
73
  @dataclasses.dataclass
@@ -56,3 +96,40 @@ class KubernetesNodesInfo:
56
96
  },
57
97
  hint=data['hint'],
58
98
  )
99
+
100
+
101
class VolumeConfig(pydantic.BaseModel):
    """Configuration for creating a volume.

    Instances stamp the current schema version into their pickled state so
    that state written by an older version can be upgraded when it is loaded.
    """
    # If any fields changed, increment the version. For backward compatibility,
    # modify the __setstate__ method to handle the old version.
    _VERSION: ClassVar[int] = 1

    # Version recorded alongside the pickled state (see __getstate__).
    _version: int
    name: str
    type: str
    cloud: str
    region: Optional[str]
    zone: Optional[str]
    name_on_cloud: str
    size: Optional[str]
    config: Dict[str, Any] = {}
    labels: Optional[Dict[str, str]] = None
    id_on_cloud: Optional[str] = None

    def __getstate__(self) -> Dict[str, Any]:
        """Returns the base model's pickle state tagged with _VERSION."""
        # NOTE(review): the layout of `state` is whatever pydantic's
        # BaseModel.__getstate__ returns; confirm it before relying on where
        # the '_version' key ends up.
        state = super().__getstate__()
        state['_version'] = self._VERSION
        return state

    def __setstate__(self, state: Dict[str, Any]) -> None:
        """Set state from pickled state, for backward compatibility."""
        super().__setstate__(state)
        # State without a recorded version is treated as version -1.
        version = state.pop('_version', None)
        if version is None:
            version = -1

        # Unversioned state gets id_on_cloud defaulted to None.
        if version < 0:
            state['id_on_cloud'] = None

        state['_version'] = self._VERSION
        # NOTE(review): this copies every key of `state` into the instance
        # __dict__ after the base class has already restored it; verify this
        # is intended given the structure pydantic uses for `state`.
        self.__dict__.update(state)