skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/metrics/utils.py ADDED
@@ -0,0 +1,443 @@
1
+ """Utilities for processing GPU metrics from Kubernetes clusters."""
2
+ import contextlib
3
+ import functools
4
+ import os
5
+ import re
6
+ import select
7
+ import subprocess
8
+ import time
9
+ from typing import List, Optional, Tuple
10
+
11
+ import httpx
12
+ import prometheus_client as prom
13
+
14
+ from sky import sky_logging
15
+ from sky.skylet import constants
16
+ from sky.utils import common_utils
17
+ from sky.utils import context_utils
18
+
19
+ # Presumably tuning constants for select()-based reads (this module imports
+ # `select`); they are not referenced in the part of the file visible here.
+ # NOTE(review): assumed units are seconds for the timeout and bytes for the
+ # buffer size -- confirm against the call sites.
+ _SELECT_TIMEOUT = 1
20
+ _SELECT_BUFFER_SIZE = 4096
21
+
22
+ # Size units: 2**10 (1 KiB) and 2**20 (1 MiB).
+ _KB = 2**10
23
+ _MB = 2**20
24
+ # Histogram bucket boundaries for memory sizes, from 1 KiB up to 256 MiB plus
+ # a final +Inf bucket. Passed as `buckets=` to
+ # SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES and
+ # SKY_APISERVER_REQUEST_RSS_INCR_BYTES (both metric names end in `_bytes`).
+ _MEM_BUCKETS = [
25
+ _KB,
26
+ 256 * _KB,
27
+ 512 * _KB,
28
+ _MB,
29
+ 2 * _MB,
30
+ 4 * _MB,
31
+ 8 * _MB,
32
+ 16 * _MB,
33
+ 32 * _MB,
34
+ 64 * _MB,
35
+ 128 * _MB,
36
+ 256 * _MB,
37
+ float('inf'),
38
+ ]
39
+
40
+ # Module-level logger, created through sky_logging and named after this module.
+ logger = sky_logging.init_logger(__name__)
41
+
42
+ # Whether the metrics are enabled; cannot be changed at runtime because the
+ # environment variable is read once, when this module is imported. Only the
+ # value 'true' (compared case-insensitively) enables metrics; when the
+ # variable is unset the default 'false' is used.
43
+ METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
44
+ 'false').lower() == 'true'
45
+
46
+ # Time spent processing a piece of code, refer to time_it().
+ # Histogram labelled by `name` and `group`.
47
+ SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
48
+ 'sky_apiserver_code_duration_seconds',
49
+ 'Time spent processing code',
50
+ ['name', 'group'],
51
+ # Bucket upper bounds in seconds: 1 ms up to 1000 s, plus +Inf. The same
+ # tuple is repeated verbatim by the other duration histograms in this
+ # file, so keep them in sync when editing.
+ # NOTE(review): the sequence jumps 60 -> 80 -> 120 (no 100 s boundary)
+ # -- confirm this is intentional.
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
52
+ 0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
53
+ 5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
54
+ 50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
55
+ 240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
56
+ 420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
57
+ 600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
58
+ 780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
59
+ 960.0, 980.0, 1000.0, float('inf')),
60
+ )
61
+
62
+ # Total number of API server requests, grouped by path, method, and status.
+ # Counter; it uses the same label set as
+ # SKY_APISERVER_REQUEST_DURATION_SECONDS. The code that increments it is not
+ # in the part of the file visible here.
63
+ SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
64
+ 'sky_apiserver_requests_total',
65
+ 'Total number of API server requests',
66
+ ['path', 'method', 'status'],
67
+ )
68
+
69
+ # Time spent processing API server requests, grouped by path, method, and
70
+ # status.
+ # Histogram with the same label set as SKY_APISERVER_REQUESTS_TOTAL.
71
+ SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
72
+ 'sky_apiserver_request_duration_seconds',
73
+ 'Time spent processing API server requests',
74
+ ['path', 'method', 'status'],
75
+ # Bucket upper bounds in seconds: 1 ms up to 1000 s, plus +Inf. The same
+ # tuple is repeated verbatim by the other duration histograms in this
+ # file, so keep them in sync when editing.
+ # NOTE(review): the sequence jumps 60 -> 80 -> 120 (no 100 s boundary)
+ # -- confirm this is intentional.
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
76
+ 0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
77
+ 5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
78
+ 50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
79
+ 240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
80
+ 420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
81
+ 600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
82
+ 780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
83
+ 960.0, 980.0, 1000.0, float('inf')),
84
+ )
85
+
86
+ # Histogram of the scheduling delay of the server event loop (per its help
+ # text), labelled by `pid`. How the lag is sampled is not visible in this part
+ # of the file.
+ SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
87
+ 'sky_apiserver_event_loop_lag_seconds',
88
+ 'Scheduling delay of the server event loop',
89
+ ['pid'],
90
+ # Bucket upper bounds in seconds: 1 ms up to 1000 s, plus +Inf. The same
+ # tuple is repeated verbatim by the other duration histograms in this
+ # file, so keep them in sync when editing.
+ # NOTE(review): the sequence jumps 60 -> 80 -> 120 (no 100 s boundary)
+ # -- confirm this is intentional.
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
91
+ 0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
92
+ 5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
93
+ 50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
94
+ 240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
95
+ 420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
96
+ 600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
97
+ 780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
98
+ 960.0, 980.0, 1000.0, float('inf')),
99
+ )
100
+
101
+ # Gauge for the number of websocket connections (per its help text), labelled
+ # by `pid`.
+ SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
102
+ 'sky_apiserver_websocket_connections',
103
+ 'Number of websocket connections',
104
+ ['pid'],
105
+ # prometheus_client multiprocess aggregation mode for this gauge; see the
+ # library's multiprocess documentation for the exact 'livesum' semantics.
+ multiprocess_mode='livesum',
106
+ )
107
+
108
+ # Counter of closed websockets (per its help text), labelled by `pid` and
+ # `reason`. The set of `reason` values is not visible in this part of the
+ # file.
+ SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
109
+ 'sky_apiserver_websocket_closed_total',
110
+ 'Number of websocket closed',
111
+ ['pid', 'reason'],
112
+ )
113
+
114
+ # The number of execution starts in each worker process. We do not record a
115
+ # histogram here because the duration is already measured in
116
+ # SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
117
+ # Recording a histogram WITH the worker label would cause high cardinality.
+ # Counter labelled by `request` and `pid`.
118
+ SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
119
+ 'sky_apiserver_process_execution_start_total',
120
+ 'Total number of execution starts in each worker process',
121
+ ['request', 'pid'],
122
+ )
123
+
124
+ SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
125
+ 'sky_apiserver_process_peak_rss',
126
+ 'Peak RSS we saw in each process in last 30 seconds',
127
+ ['pid', 'type'],
128
+ )
129
+
130
+ SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
131
+ 'sky_apiserver_process_cpu_total',
132
+ 'Total CPU times a worker process has been running',
133
+ ['pid', 'type', 'mode'],
134
+ )
135
+
136
# Per-request memory histograms, keyed by request name. Both share the
# module-level _MEM_BUCKETS bucket bounds.
SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
    name='sky_apiserver_request_memory_usage_bytes',
    documentation='Peak memory usage of requests',
    labelnames=['name'],
    buckets=_MEM_BUCKETS,
)

SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
    name='sky_apiserver_request_rss_incr_bytes',
    documentation='RSS increment after requests',
    labelnames=['name'],
    buckets=_MEM_BUCKETS,
)
145
+
146
# Round-trip latency of SSH messages proxied over the API server websocket,
# labelled by the reporting process id. Bucket bounds are in seconds.
SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS = prom.Histogram(
    'sky_apiserver_websocket_ssh_latency_seconds',
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment except the last must end with a space.
    ('Time taken for ssh message to go from client to API server and back '
     'to the client. This does not include: latency to reach the pod, '
     'overhead from sending through the k8s port-forward tunnel, or '
     'ssh server lag on the destination pod.'),
    ['pid'],
    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
             960.0, 980.0, 1000.0, float('inf')),
)
163
+
164
+
165
@contextlib.contextmanager
def time_it(name: str, group: str = 'default'):
    """Context manager to measure and record code execution duration.

    When METRICS_ENABLED is false the wrapped code runs with no timing at
    all. Otherwise the elapsed time of the ``with`` body is observed on
    SKY_APISERVER_CODE_DURATION_SECONDS, even if the body raises.

    Args:
        name: Value for the histogram's ``name`` label.
        group: Value for the histogram's ``group`` label.
    """
    if not METRICS_ENABLED:
        yield
        return

    # perf_counter() is monotonic, so the measured duration cannot go
    # negative or jump when the system wall clock is adjusted.
    start_time = time.perf_counter()
    try:
        yield
    finally:
        duration = time.perf_counter() - start_time
        SKY_APISERVER_CODE_DURATION_SECONDS.labels(
            name=name, group=group).observe(duration)
178
+
179
+
180
def time_me(func):
    """Decorator that records how long each call to ``func`` takes.

    With metrics disabled the call is passed straight through. Otherwise
    it runs inside ``time_it`` under the name ``<module>/<function>`` and
    the group ``function``.
    """

    @functools.wraps(func)
    def timed(*args, **kwargs):
        if METRICS_ENABLED:
            label = f'{func.__module__}/{func.__name__}'
            with time_it(label, group='function'):
                return func(*args, **kwargs)
        return func(*args, **kwargs)

    return timed
192
+
193
+
194
def time_me_async(func):
    """Decorator that records how long each await of async ``func`` takes.

    With metrics disabled the coroutine is awaited directly. Otherwise it
    is awaited inside ``time_it`` under the name ``<module>/<function>``
    and the group ``function``.
    """

    @functools.wraps(func)
    async def timed_coroutine(*args, **kwargs):
        if METRICS_ENABLED:
            label = f'{func.__module__}/{func.__name__}'
            with time_it(label, group='function'):
                return await func(*args, **kwargs)
        return await func(*args, **kwargs)

    return timed_coroutine
206
+
207
+
208
def start_svc_port_forward(context: str, namespace: str, service: str,
                           service_port: int) -> Tuple[subprocess.Popen, int]:
    """Starts a port forward to a service in a Kubernetes cluster.

    Launches ``kubectl port-forward`` with an unspecified local port and
    reads the process output until kubectl reports the local port it
    chose. On success the process is returned still running; it is not
    stopped here, so the caller has to stop it (e.g. with
    ``stop_svc_port_forward``).

    Args:
        context: Kubernetes context name
        namespace: Namespace where the service is located
        service: Service name to port forward to
        service_port: Port on the service to forward to
    Returns:
        Tuple of (subprocess.Popen process, local_port assigned)
    Raises:
        RuntimeError: If the kubectl process exits with a non-zero code
            before reporting a port, or if no local port could be
            extracted from its output (timeout, or exit without a
            matching line).
    """
    start_port_forward_timeout = 10  # 10 second timeout
    terminate_port_forward_timeout = 5  # 5 second timeout

    # Use ':service_port' to let kubectl choose the local port
    cmd = [
        'kubectl', '--context', context, '-n', namespace, 'port-forward',
        f'service/{service}', f':{service_port}'
    ]

    # Run kubectl with the caller's environment, defaulting KUBECONFIG to the
    # standard location when it is not set.
    env = os.environ.copy()
    if 'KUBECONFIG' not in env:
        env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')

    # Initialised up front so the except/finally clauses can tell how far
    # the setup got.
    port_forward_process = None
    port_forward_exit = False
    local_port = None
    poller = None
    fd = None

    try:
        # start the port forward process
        # stderr is merged into stdout so one pipe carries all output.
        port_forward_process = subprocess.Popen(cmd,
                                                stdout=subprocess.PIPE,
                                                stderr=subprocess.STDOUT,
                                                text=True,
                                                env=env)

        # Use poll() instead of select() to avoid FD_SETSIZE limit
        poller = select.poll()
        assert port_forward_process.stdout is not None
        fd = port_forward_process.stdout.fileno()
        poller.register(fd, select.POLLIN)

        start_time = time.time()
        # Accumulates everything read so far; the port pattern is searched in
        # the whole buffer because a line may arrive split across reads.
        buffer = ''
        # wait for the port forward to start and extract the local port
        while time.time() - start_time < start_port_forward_timeout:
            if port_forward_process.poll() is not None:
                # port forward process has terminated
                if port_forward_process.returncode != 0:
                    port_forward_exit = True
                break

            # Wait for data to be available without blocking indefinitely.
            # poll() takes timeout in milliseconds; _SELECT_TIMEOUT is
            # presumably in seconds (defined outside this block — confirm).
            events = poller.poll(_SELECT_TIMEOUT * 1000)

            if events:
                # Read available bytes from the FD without blocking
                raw = os.read(fd, _SELECT_BUFFER_SIZE)
                chunk = raw.decode(errors='ignore')
                buffer += chunk
                match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
                if match:
                    local_port = int(match.group(1))
                    break

            # sleep for 100ms to avoid busy-waiting
            time.sleep(0.1)
    except BaseException:  # pylint: disable=broad-exception-caught
        # Any failure (including KeyboardInterrupt) must not leave a stray
        # kubectl process behind.
        if port_forward_process:
            stop_svc_port_forward(port_forward_process,
                                  timeout=terminate_port_forward_timeout)
        raise
    finally:
        if poller is not None and fd is not None:
            try:
                poller.unregister(fd)
            except (OSError, ValueError):
                # FD may already be unregistered or invalid
                pass
    if port_forward_exit:
        # The process already exited on its own, so nothing needs stopping.
        raise RuntimeError(f'Port forward failed for service {service} in '
                           f'namespace {namespace} on context {context}')
    if local_port is None:
        # No port was reported: stop the process, then always raise, even if
        # stopping it fails.
        try:
            if port_forward_process:
                stop_svc_port_forward(port_forward_process,
                                      timeout=terminate_port_forward_timeout)
        finally:
            raise RuntimeError(
                f'Failed to extract local port for service {service} in '
                f'namespace {namespace} on context {context}')

    return port_forward_process, local_port
306
+
307
+
308
def stop_svc_port_forward(port_forward_process: subprocess.Popen,
                          timeout: int = 5) -> None:
    """Stops a port forward to a service in a Kubernetes cluster.

    Asks the process to terminate and waits for it to exit; if it is still
    running once ``timeout`` elapses it is killed and reaped.

    Args:
        port_forward_process: The subprocess.Popen process to terminate
        timeout: Seconds to wait after terminate() before killing
    """
    port_forward_process.terminate()
    try:
        port_forward_process.wait(timeout=timeout)
        return
    except subprocess.TimeoutExpired:
        # Graceful shutdown took too long; fall through to a hard kill.
        pass
    port_forward_process.kill()
    port_forward_process.wait()
320
+
321
+
322
async def send_metrics_request_with_port_forward(
        context: str,
        namespace: str,
        service: str,
        service_port: int,
        endpoint_path: str = '/federate',
        match_patterns: Optional[List[str]] = None,
        timeout: float = 30.0) -> str:
    """Fetches metrics text from a service reached through a port forward.

    Opens a kubectl port forward to the service, issues a single GET on
    the forwarded local port, and tears the port forward down again
    regardless of the outcome.

    Args:
        context: Kubernetes context name
        namespace: Namespace where the service is located
        service: Service name to port forward to
        service_port: Port on the service to forward to
        endpoint_path: Path to append to the localhost endpoint (e.g.,
            '/federate')
        match_patterns: List of metric patterns to match (for federate
            endpoint); each is sent as a separate ``match[]`` parameter
        timeout: Request timeout in seconds
    Returns:
        Response text containing the metrics
    Raises:
        RuntimeError: If the port forward cannot be started
        Exception: Any error from the HTTP request (including a non-2xx
            status) is logged and re-raised unchanged
    """
    forwarder = None
    try:
        # The port forward helpers block, so run them off the event loop.
        forwarder, forwarded_port = await context_utils.to_thread(
            start_svc_port_forward, context, namespace, service, service_port)

        url = f'http://localhost:{forwarded_port}{endpoint_path}'

        # For the federate endpoint, one match[] parameter per pattern;
        # no query string at all when no patterns were given.
        query = None
        if match_patterns:
            query = [('match[]', selector) for selector in match_patterns]

        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(url, params=query)
            response.raise_for_status()
            return response.text

    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.error(f'Failed to send metrics request with port forward: '
                     f'{common_utils.format_exception(e)}')
        raise
    finally:
        # Always clean up port forward
        if forwarder:
            await context_utils.to_thread(stop_svc_port_forward, forwarder)
376
+
377
+
378
async def add_cluster_name_label(metrics_text: str, context: str) -> str:
    """Adds a ``cluster`` label to each labelled metric line.

    The label is inserted as the first label of every sample line that
    already has a ``{...}`` label set. Comment lines, blank lines and
    sample lines without a label set are returned unchanged.

    Args:
        metrics_text: The text containing the metrics (Prometheus text
            exposition format)
        context: The cluster name, used as the value of the ``cluster``
            label
    Returns:
        The metrics text with the label added, lines joined by newlines.
    """
    # Label values in the exposition format must escape backslash, double
    # quote and newline; without this a context name containing any of them
    # would produce output that scrapers cannot parse.
    escaped_context = (context.replace('\\', '\\\\').replace(
        '"', '\\"').replace('\n', '\\n'))
    cluster_label = f'cluster="{escaped_context}"'

    modified_lines = []
    for line in metrics_text.strip().split('\n'):
        # keep comment lines and empty lines as-is
        if line.startswith('#') or not line.strip():
            modified_lines.append(line)
            continue

        # if line is a metric line with labels, add cluster label
        brace_start = line.find('{')
        brace_end = line.find('}')
        # Require '{' to come before '}' so a malformed line is passed
        # through untouched instead of being rewritten into garbage.
        if 0 <= brace_start < brace_end:
            metric_name = line[:brace_start]
            existing_labels = line[brace_start + 1:brace_end]
            rest_of_line = line[brace_end + 1:]

            if existing_labels:
                new_labels = f'{cluster_label},{existing_labels}'
            else:
                new_labels = cluster_label

            modified_lines.append(
                f'{metric_name}{{{new_labels}}}{rest_of_line}')
        else:
            # keep other lines as-is
            modified_lines.append(line)

    return '\n'.join(modified_lines)
412
+
413
+
414
async def get_metrics_for_context(context: str) -> str:
    """Collect federated metrics for a single Kubernetes context.

    Args:
        context: Kubernetes context name
    Returns:
        metrics_text: String containing the metrics, with a cluster label
            added to each labelled line
    Raises:
        Exception: If metrics collection fails for any reason
    """
    # Besides the DCGM and node memory series, kube_pod_labels is requested
    # so the dashboard can perform joins to filter by skypilot cluster.
    selectors = [
        '{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}',  # pylint: disable=line-too-long
        'kube_pod_labels',
        'node_cpu_seconds_total{mode="idle"}'
    ]

    # TODO(rohan): don't hardcode the namespace and service name
    raw_metrics = await send_metrics_request_with_port_forward(
        context=context,
        namespace='skypilot',
        service='skypilot-prometheus-server',
        service_port=80,
        endpoint_path='/federate',
        match_patterns=selectors)

    # add cluster name as a label to each metric line
    return await add_cluster_name_label(raw_metrics, context)
sky/models.py CHANGED
@@ -2,19 +2,57 @@
2
2
 
3
3
  import collections
4
4
  import dataclasses
5
- from typing import Any, Dict, Optional
5
+ import getpass
6
+ import os
7
+ from typing import Any, ClassVar, Dict, Optional
8
+
9
+ import pydantic
10
+
11
+ from sky.skylet import constants
12
+ from sky.utils import common_utils
6
13
 
7
14
 
8
15
@dataclasses.dataclass
class User:
    """Dataclass to store user information.

    The hand-written ``__init__`` below is kept by ``dataclasses.dataclass``
    (an explicitly defined ``__init__`` is not replaced); it normalizes
    ``id`` by stripping whitespace and lower-casing it.
    """
    # User hash
    id: str
    # Display name of the user
    name: Optional[str] = None
    password: Optional[str] = None
    created_at: Optional[int] = None

    def __init__(
            self,
            id: str,  # pylint: disable=redefined-builtin
            name: Optional[str] = None,
            password: Optional[str] = None,
            created_at: Optional[int] = None):
        self.id = id.strip().lower()
        self.name = name
        self.password = password
        self.created_at = created_at

    def to_dict(self) -> Dict[str, Any]:
        """Returns the id and name only; password and created_at are omitted."""
        return {'id': self.id, 'name': self.name}

    def to_env_vars(self) -> Dict[str, Any]:
        """Returns the user id and name keyed by their environment variables."""
        return {
            constants.USER_ID_ENV_VAR: self.id,
            constants.USER_ENV_VAR: self.name,
        }

    @classmethod
    def get_current_user(cls) -> 'User':
        """Returns the current user.

        The name comes from the ``constants.USER_ENV_VAR`` environment
        variable when it is set, and from the OS login name otherwise.
        """
        user_name = os.getenv(constants.USER_ENV_VAR)
        if user_name is None:
            # Only consult the OS when the variable is unset:
            # getpass.getuser() can raise when the current UID has no
            # passwd entry, which must not break the case where the name
            # was supplied explicitly.
            user_name = getpass.getuser()
        user_hash = common_utils.get_user_hash()
        return cls(id=user_hash, name=user_name)

    def is_service_account(self) -> bool:
        """Check if the user is a service account."""
        return self.id.lower().startswith('sa-')
55
+
18
56
 
19
57
  RealtimeGpuAvailability = collections.namedtuple(
20
58
  'RealtimeGpuAvailability', ['gpu', 'counts', 'capacity', 'available'])
@@ -28,6 +66,8 @@ class KubernetesNodeInfo:
28
66
  # Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
29
67
  total: Dict[str, int]
30
68
  free: Dict[str, int]
69
+ # IP address of the node (external IP preferred, fallback to internal IP)
70
+ ip_address: Optional[str] = None
31
71
 
32
72
 
33
73
  @dataclasses.dataclass
@@ -56,3 +96,40 @@ class KubernetesNodesInfo:
56
96
  },
57
97
  hint=data['hint'],
58
98
  )
99
+
100
+
101
class VolumeConfig(pydantic.BaseModel):
    """Configuration for creating a volume.

    Instances carry a schema version in their pickled state so that
    ``__setstate__`` can migrate state written by older versions.
    """
    # If any fields changed, increment the version. For backward compatibility,
    # modify the __setstate__ method to handle the old version.
    _VERSION: ClassVar[int] = 1

    # Schema version of this instance's state; written by __getstate__ and
    # __setstate__ below.
    _version: int
    name: str
    type: str
    cloud: str
    region: Optional[str]
    zone: Optional[str]
    name_on_cloud: str
    size: Optional[str]
    config: Dict[str, Any] = {}
    labels: Optional[Dict[str, str]] = None
    id_on_cloud: Optional[str] = None

    def __getstate__(self) -> Dict[str, Any]:
        """Returns the base-class pickle state tagged with the current version."""
        state = super().__getstate__()
        state['_version'] = self._VERSION
        return state

    def __setstate__(self, state: Dict[str, Any]) -> None:
        """Set state from pickled state, for backward compatibility."""
        # Let the base class restore its own state first.
        # NOTE(review): the layout of `state` is whatever the pydantic base
        # class produces in __getstate__ — confirm against the pydantic
        # version in use before changing the migration below.
        super().__setstate__(state)
        # State with no '_version' entry is treated as version -1.
        version = state.pop('_version', None)
        if version is None:
            version = -1

        if version < 0:
            # Such state gets an explicit id_on_cloud of None.
            state['id_on_cloud'] = None

        # Stamp the current version and merge every entry of `state` into the
        # instance __dict__.
        state['_version'] = self._VERSION
        self.__dict__.update(state)