skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/server/metrics.py CHANGED
@@ -1,74 +1,33 @@
1
1
  """Instrumentation for the API server."""
2
2
 
3
- import contextlib
4
- import functools
3
+ import asyncio
4
+ import multiprocessing
5
5
  import os
6
+ import threading
6
7
  import time
8
+ from typing import List
7
9
 
8
10
  import fastapi
9
11
  from prometheus_client import generate_latest
10
12
  from prometheus_client import multiprocess
11
13
  import prometheus_client as prom
14
+ import psutil
12
15
  import starlette.middleware.base
13
16
  import uvicorn
14
17
 
18
+ from sky import core
15
19
  from sky import sky_logging
16
- from sky.skylet import constants
17
-
18
- # Whether the metrics are enabled, cannot be changed at runtime.
19
- METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
20
- 'false').lower() == 'true'
20
+ from sky.metrics import utils as metrics_utils
21
21
 
22
22
  logger = sky_logging.init_logger(__name__)
23
23
 
24
- # Total number of API server requests, grouped by path, method, and status.
25
- SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
26
- 'sky_apiserver_requests_total',
27
- 'Total number of API server requests',
28
- ['path', 'method', 'status'],
29
- )
30
-
31
- # Time spent processing API server requests, grouped by path, method, and
32
- # status.
33
- SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
34
- 'sky_apiserver_request_duration_seconds',
35
- 'Time spent processing API server requests',
36
- ['path', 'method', 'status'],
37
- buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
38
- 60.0, 120.0, float('inf')),
39
- )
40
-
41
- # Time spent processing requests in executor.
42
- SKY_APISERVER_REQUEST_EXECUTION_DURATION_SECONDS = prom.Histogram(
43
- 'sky_apiserver_request_execution_duration_seconds',
44
- 'Time spent executing requests in executor',
45
- ['request', 'worker'],
46
- buckets=(0.5, 1, 2.5, 5.0, 10.0, 15.0, 25.0, 40.0, 60.0, 90.0, 120.0, 180.0,
47
- float('inf')),
48
- )
49
-
50
- # Time spent processing a piece of code, refer to time_it().
51
- SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
52
- 'sky_apiserver_code_duration_seconds',
53
- 'Time spent processing code',
54
- ['name', 'group'],
55
- buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0,
56
- 60.0, 120.0, float('inf')),
57
- )
58
-
59
- SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
60
- 'sky_apiserver_event_loop_lag_seconds',
61
- 'Scheduling delay of the server event loop',
62
- ['pid'],
63
- buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 20.0,
64
- 60.0, float('inf')),
65
- )
66
-
67
24
  metrics_app = fastapi.FastAPI()
68
25
 
69
26
 
27
+ # Serve /metrics in dedicated thread to avoid blocking the event loop
28
+ # of metrics server.
70
29
  @metrics_app.get('/metrics')
71
- async def metrics() -> fastapi.Response:
30
+ def metrics() -> fastapi.Response:
72
31
  """Expose aggregated Prometheus metrics from all worker processes."""
73
32
  if os.environ.get('PROMETHEUS_MULTIPROC_DIR'):
74
33
  # In multiprocess mode, we need to collect metrics from all processes.
@@ -82,6 +41,42 @@ async def metrics() -> fastapi.Response:
82
41
  headers={'Cache-Control': 'no-cache'})
83
42
 
84
43
 
44
+ @metrics_app.get('/gpu-metrics')
45
+ async def gpu_metrics() -> fastapi.Response:
46
+ """Gets the GPU metrics from multiple external k8s clusters"""
47
+ contexts = core.get_all_contexts()
48
+ all_metrics: List[str] = []
49
+ successful_contexts = 0
50
+
51
+ tasks = [
52
+ asyncio.create_task(metrics_utils.get_metrics_for_context(context))
53
+ for context in contexts
54
+ if context != 'in-cluster'
55
+ ]
56
+
57
+ results = await asyncio.gather(*tasks, return_exceptions=True)
58
+
59
+ for i, result in enumerate(results):
60
+ if isinstance(result, Exception):
61
+ logger.error(
62
+ f'Failed to get metrics for context {contexts[i]}: {result}')
63
+ elif isinstance(result, BaseException):
64
+ # Avoid changing behavior for non-Exception BaseExceptions
65
+ # like KeyboardInterrupt/SystemExit: re-raise them.
66
+ raise result
67
+ else:
68
+ metrics_text = result
69
+ all_metrics.append(metrics_text)
70
+ successful_contexts += 1
71
+
72
+ combined_metrics = '\n\n'.join(all_metrics)
73
+
74
+ # Return as plain text for Prometheus compatibility
75
+ return fastapi.Response(
76
+ content=combined_metrics,
77
+ media_type='text/plain; version=0.0.4; charset=utf-8')
78
+
79
+
85
80
  def build_metrics_server(host: str, port: int) -> uvicorn.Server:
86
81
  metrics_config = uvicorn.Config(
87
82
  'sky.server.metrics:metrics_app',
@@ -125,56 +120,41 @@ class PrometheusMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
125
120
  status_code_group = '5xx'
126
121
  raise
127
122
  finally:
128
- SKY_APISERVER_REQUESTS_TOTAL.labels(path=path,
129
- method=method,
130
- status=status_code_group).inc()
123
+ metrics_utils.SKY_APISERVER_REQUESTS_TOTAL.labels(
124
+ path=path, method=method, status=status_code_group).inc()
131
125
  if not streaming:
132
126
  duration = time.time() - start_time
133
- SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
127
+ metrics_utils.SKY_APISERVER_REQUEST_DURATION_SECONDS.labels(
134
128
  path=path, method=method,
135
129
  status=status_code_group).observe(duration)
136
130
 
137
131
  return response
138
132
 
139
133
 
140
- @contextlib.contextmanager
141
- def time_it(name: str, group: str = 'default'):
142
- """Context manager to measure and record code execution duration."""
143
- if not METRICS_ENABLED:
144
- yield
145
- else:
146
- start_time = time.time()
147
- try:
148
- yield
149
- finally:
150
- duration = time.time() - start_time
151
- SKY_APISERVER_CODE_DURATION_SECONDS.labels(
152
- name=name, group=group).observe(duration)
153
-
154
-
155
- def time_me(func):
156
- """Measure the duration of decorated function."""
157
-
158
- @functools.wraps(func)
159
- def wrapper(*args, **kwargs):
160
- if not METRICS_ENABLED:
161
- return func(*args, **kwargs)
162
- name = f'{func.__module__}/{func.__name__}'
163
- with time_it(name, group='function'):
164
- return func(*args, **kwargs)
165
-
166
- return wrapper
167
-
168
-
169
- def time_me_async(func):
170
- """Measure the duration of decorated async function."""
171
-
172
- @functools.wraps(func)
173
- async def async_wrapper(*args, **kwargs):
174
- if not METRICS_ENABLED:
175
- return await func(*args, **kwargs)
176
- name = f'{func.__module__}/{func.__name__}'
177
- with time_it(name, group='function'):
178
- return await func(*args, **kwargs)
179
-
180
- return async_wrapper
134
+ peak_rss_bytes = 0
135
+
136
+
137
+ def process_monitor(process_type: str, stop: threading.Event):
138
+ pid = multiprocessing.current_process().pid
139
+ proc = psutil.Process(pid)
140
+ last_bucket_end = time.time()
141
+ bucket_peak = 0
142
+ global peak_rss_bytes
143
+ while not stop.is_set():
144
+ if time.time() - last_bucket_end >= 30:
145
+ # Reset peak RSS for the next time bucket.
146
+ last_bucket_end = time.time()
147
+ bucket_peak = 0
148
+ peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
149
+ metrics_utils.SKY_APISERVER_PROCESS_PEAK_RSS.labels(
150
+ pid=pid, type=process_type).set(peak_rss_bytes)
151
+ ctimes = proc.cpu_times()
152
+ metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
153
+ type=process_type,
154
+ mode='user').set(
155
+ ctimes.user)
156
+ metrics_utils.SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
157
+ type=process_type,
158
+ mode='system').set(
159
+ ctimes.system)
160
+ time.sleep(1)
sky/server/middleware_utils.py ADDED
@@ -0,0 +1,166 @@
1
+ """Utilities for building middlewares."""
2
+ import enum
3
+ import http
4
+ from typing import Type
5
+
6
+ import fastapi
7
+ import starlette.middleware.base
8
+ import starlette.types
9
+
10
+ from sky import sky_logging
11
+
12
+ logger = sky_logging.init_logger(__name__)
13
+
14
+
15
+ class WebSocketDecision(enum.Enum):
16
+ ACCEPT = 'accept'
17
+ UNAUTHORIZED = 'unauthorized'
18
+ FORBIDDEN = 'forbidden'
19
+ ERROR = 'error'
20
+
21
+
22
+ def websocket_aware(
23
+ middleware_cls: Type[starlette.middleware.base.BaseHTTPMiddleware]):
24
+ """Decorator to adapt BaseHTTPMiddleware to handle WebSockets.
25
+
26
+ It assembles an HTTP-style request like the HTTP upgrade request during
27
+ websocket handshake and then delegates it to the real HTTP middleware.
28
+ The websocket connection will be rejected if the HTTP middleware returns
29
+ a 4xx or 5xx status code.
30
+
31
+ Note: for websocket connection, the mutation made by the underlying HTTP
32
+ middleware on the request and response will be discarded.
33
+ """
34
+
35
+ class WebSocketAwareMiddleware:
36
+ """WebSocket-aware middleware wrapper."""
37
+
38
+ def __init__(self, app: starlette.types.ASGIApp, *args, **kwargs):
39
+ self.app = app
40
+ self.middleware = middleware_cls(app, *args, **kwargs)
41
+
42
+ async def __call__(self, scope: starlette.types.Scope,
43
+ receive: starlette.types.Receive,
44
+ send: starlette.types.Send):
45
+ scope_type = scope.get('type')
46
+ if scope_type == 'websocket':
47
+ await self._handle_websocket(scope, receive, send)
48
+ else:
49
+ # Delegate other scopes to the underlying HTTP middleware.
50
+ await self.middleware(scope, receive, send)
51
+
52
+ async def dispatch(
53
+ self, request: fastapi.Request,
54
+ call_next: starlette.middleware.base.RequestResponseEndpoint):
55
+ """Implement dispatch method to keep compatibility."""
56
+ return await self.middleware.dispatch(request, call_next)
57
+
58
+ async def _handle_websocket(self, scope: starlette.types.Scope,
59
+ receive: starlette.types.Receive,
60
+ send: starlette.types.Send):
61
+ """Handle websocket connection by delegating to HTTP middleware."""
62
+ decision = await self._run_websocket_dispatch(scope)
63
+ if decision == WebSocketDecision.ACCEPT:
64
+ await self.app(scope, receive, send)
65
+ elif decision == WebSocketDecision.UNAUTHORIZED:
66
+ await send({
67
+ 'type': 'websocket.close',
68
+ 'code': 4401,
69
+ 'reason': 'Unauthorized',
70
+ })
71
+ elif decision == WebSocketDecision.FORBIDDEN:
72
+ await send({
73
+ 'type': 'websocket.close',
74
+ 'code': 4403,
75
+ 'reason': 'Forbidden',
76
+ })
77
+ else:
78
+ await send({
79
+ 'type': 'websocket.close',
80
+ 'code': 1011,
81
+ 'reason': 'Internal Server Error',
82
+ })
83
+
84
+ async def _run_websocket_dispatch(
85
+ self, scope: starlette.types.Scope) -> WebSocketDecision:
86
+ http_scope = self._build_http_scope(scope)
87
+ http_receive = self._http_receive_adapter()
88
+ request = fastapi.Request(http_scope, receive=http_receive)
89
+ call_next_called = False
90
+ stub_response = fastapi.Response(status_code=http.HTTPStatus.OK)
91
+
92
+ async def call_next(req):
93
+ del req
94
+ # Capture whether call_next() is called in the underlying
95
+ # HTTP middleware to determine if we can proceed with current
96
+ # websocket connection.
97
+ nonlocal call_next_called
98
+ call_next_called = True
99
+ return stub_response
100
+
101
+ try:
102
+ response = await self.dispatch(request, call_next)
103
+ except Exception as e: # pylint: disable=broad-except
104
+ logger.error('Exception occurred in middleware dispatch for '
105
+ f'WebSocket scope: {e}')
106
+ return WebSocketDecision.ERROR
107
+
108
+ if response is None:
109
+ response = stub_response
110
+
111
+ status_code = response.status_code
112
+
113
+ if call_next_called and 200 <= status_code < 400:
114
+ return WebSocketDecision.ACCEPT
115
+ if status_code == http.HTTPStatus.UNAUTHORIZED:
116
+ return WebSocketDecision.UNAUTHORIZED
117
+ if status_code == http.HTTPStatus.FORBIDDEN:
118
+ return WebSocketDecision.FORBIDDEN
119
+ return WebSocketDecision.ERROR
120
+
121
+ @staticmethod
122
+ def _build_http_scope(
123
+ scope: starlette.types.Scope) -> starlette.types.Scope:
124
+ state = scope.setdefault('state', {})
125
+ scheme = scope.get('scheme', 'ws')
126
+ if scheme == 'ws':
127
+ http_scheme = 'http'
128
+ elif scheme == 'wss':
129
+ http_scheme = 'https'
130
+ else:
131
+ http_scheme = scheme
132
+ http_scope = dict(scope)
133
+ http_scope['type'] = 'http'
134
+ http_scope['scheme'] = http_scheme
135
+ http_scope['method'] = 'GET'
136
+ http_scope['http_version'] = scope.get('http_version', '1.1')
137
+ http_scope['state'] = state
138
+ return http_scope
139
+
140
+ @staticmethod
141
+ def _http_receive_adapter() -> starlette.types.Receive:
142
+ """Adapter that mimics the sequence produced by Starlette for an HTTP
143
+ request: a single http.request event followed by a http.disconnect
144
+ """
145
+ sent = False
146
+
147
+ async def receive():
148
+ nonlocal sent
149
+ if not sent:
150
+ sent = True
151
+ return {
152
+ 'type': 'http.request',
153
+ 'body': b'',
154
+ 'more_body': False,
155
+ }
156
+ return {
157
+ 'type': 'http.disconnect',
158
+ }
159
+
160
+ return receive
161
+
162
+ WebSocketAwareMiddleware.__name__ = middleware_cls.__name__
163
+ WebSocketAwareMiddleware.__qualname__ = middleware_cls.__qualname__
164
+ WebSocketAwareMiddleware.__module__ = middleware_cls.__module__
165
+ WebSocketAwareMiddleware.__doc__ = middleware_cls.__doc__
166
+ return WebSocketAwareMiddleware