skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/logs/agent.py CHANGED
@@ -34,23 +34,50 @@ class FluentbitAgent(LoggingAgent):
34
34
  def get_setup_command(self,
35
35
  cluster_name: resources_utils.ClusterName) -> str:
36
36
  install_cmd = (
37
- 'if ! command -v fluent-bit >/dev/null 2>&1; then '
38
- 'sudo apt-get install -y gnupg; '
39
37
  # pylint: disable=line-too-long
40
- 'curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh; '
38
+ 'if ! command -v fluent-bit >/dev/null 2>&1 && [ ! -f /opt/fluent-bit/bin/fluent-bit ]; then '
39
+ 'sudo apt-get update; sudo apt-get install -y gnupg; '
40
+ # pylint: disable=line-too-long
41
+ 'sudo sh -c \'curl -L https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
42
+ # pylint: disable=line-too-long
43
+ 'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
44
+ # pylint: disable=line-too-long
45
+ 'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
46
+ # pylint: disable=line-too-long
47
+ 'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
48
+ 'sudo apt-get update; '
49
+ 'sudo apt-get install -y fluent-bit; '
41
50
  'fi')
42
51
  cfg = self.fluentbit_config(cluster_name)
43
52
  cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
44
53
  config_cmd = (f'mkdir -p {constants.LOGGING_CONFIG_DIR} && '
45
54
  f'echo {shlex.quote(cfg)} > {cfg_path}')
55
+ kill_prior_cmd = (
56
+ 'if [ -f "/tmp/fluentbit.pid" ]; then '
57
+ # pylint: disable=line-too-long
58
+ 'echo "Killing prior fluent-bit process $(cat /tmp/fluentbit.pid)"; '
59
+ 'kill "$(cat /tmp/fluentbit.pid)" || true; '
60
+ 'fi')
46
61
  start_cmd = ('nohup $(command -v fluent-bit || '
47
62
  'echo "/opt/fluent-bit/bin/fluent-bit") '
48
- f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 &')
49
- return f'set -e; {install_cmd}; {config_cmd}; {start_cmd}'
63
+ f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 & '
64
+ 'echo $! > /tmp/fluentbit.pid')
65
+ return ('set -e; '
66
+ f'{install_cmd}; '
67
+ f'{config_cmd}; '
68
+ f'{kill_prior_cmd}; '
69
+ f'{start_cmd}')
50
70
 
51
71
  def fluentbit_config(self,
52
72
  cluster_name: resources_utils.ClusterName) -> str:
53
73
  cfg_dict = {
74
+ 'parsers': [{
75
+ 'name': 'sky-ray-parser',
76
+ 'format': 'regex',
77
+ # pylint: disable=line-too-long
78
+ 'regex': r'(?:\x1b\[[\d;]+m)?\((?<worker_name>[^,]+)(?:,\s*rank=(?<rank>\d+))?(?:,\s*pid=(?<pid>\d+))(?:,\s*ip=(?<ip>[\d.]+))?\)(?:\x1b\[[\d;]+m)?\s*(?<log_line>.*)',
79
+ 'types': 'rank:integer pid:integer',
80
+ }],
54
81
  'pipeline': {
55
82
  'inputs': [{
56
83
  'name': 'tail',
@@ -62,6 +89,14 @@ class FluentbitAgent(LoggingAgent):
62
89
  # right after the job completion.
63
90
  'refresh_interval': 1,
64
91
  }],
92
+ 'filters': [{
93
+ 'name': 'parser',
94
+ 'match': '*',
95
+ 'key_name': 'log',
96
+ 'parser': 'sky-ray-parser',
97
+ 'preserve_key': 'on', # preserve field for backwards compat
98
+ 'reserve_data': 'on',
99
+ }],
65
100
  'outputs': [self.fluentbit_output_config(cluster_name)],
66
101
  }
67
102
  }
sky/logs/aws.py CHANGED
@@ -5,7 +5,6 @@ from typing import Any, Dict, Optional
5
5
  import pydantic
6
6
 
7
7
  from sky.logs.agent import FluentbitAgent
8
- from sky.skylet import constants
9
8
  from sky.utils import resources_utils
10
9
  from sky.utils import yaml_utils
11
10
 
@@ -176,6 +175,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
176
175
  Returns:
177
176
  The Fluent Bit configuration as a YAML string.
178
177
  """
178
+ cfg_dict = yaml_utils.read_yaml_str(
179
+ super().fluentbit_config(cluster_name))
179
180
  display_name = cluster_name.display_name
180
181
  unique_name = cluster_name.name_on_cloud
181
182
  # Build tags for the log stream
@@ -197,24 +198,13 @@ class CloudwatchLoggingAgent(FluentbitAgent):
197
198
  'value': value
198
199
  })
199
200
 
200
- cfg_dict = {
201
- 'pipeline': {
202
- 'inputs': [{
203
- 'name': 'tail',
204
- 'path': f'{constants.SKY_LOGS_DIRECTORY}/*/*.log',
205
- 'path_key': 'log_path',
206
- # Shorten the refresh interval from 60s to 1s since every
207
- # job creates a new log file and we must be responsive
208
- # for this: the VM might be autodown within a minute
209
- # right after the job completion.
210
- 'refresh_interval': 1,
211
- 'processors': {
212
- 'logs': log_processors,
213
- }
214
- }],
215
- 'outputs': [self.fluentbit_output_config(cluster_name)],
216
- }
217
- }
201
+ # Add log processors to config
202
+ processors_config = cfg_dict['pipeline']['inputs'][0].get(
203
+ 'processors', {})
204
+ processors_logs_config = processors_config.get('logs', [])
205
+ processors_logs_config.extend(log_processors)
206
+ processors_config['logs'] = processors_logs_config
207
+ cfg_dict['pipeline']['inputs'][0]['processors'] = processors_config
218
208
 
219
209
  return yaml_utils.dump_yaml_str(cfg_dict)
220
210
 
sky/metrics/utils.py CHANGED
@@ -1,11 +1,218 @@
1
1
  """Utilities for processing GPU metrics from Kubernetes clusters."""
2
+ import contextlib
3
+ import functools
2
4
  import os
3
5
  import re
6
+ import select
4
7
  import subprocess
5
8
  import time
6
9
  from typing import List, Optional, Tuple
7
10
 
8
11
  import httpx
12
+ import prometheus_client as prom
13
+
14
+ from sky import sky_logging
15
+ from sky.skylet import constants
16
+ from sky.utils import common_utils
17
+ from sky.utils import context_utils
18
+
19
+ _SELECT_TIMEOUT = 1
20
+ _SELECT_BUFFER_SIZE = 4096
21
+
22
+ _KB = 2**10
23
+ _MB = 2**20
24
+ _MEM_BUCKETS = [
25
+ _KB,
26
+ 256 * _KB,
27
+ 512 * _KB,
28
+ _MB,
29
+ 2 * _MB,
30
+ 4 * _MB,
31
+ 8 * _MB,
32
+ 16 * _MB,
33
+ 32 * _MB,
34
+ 64 * _MB,
35
+ 128 * _MB,
36
+ 256 * _MB,
37
+ float('inf'),
38
+ ]
39
+
40
+ logger = sky_logging.init_logger(__name__)
41
+
42
+ # Whether the metrics are enabled, cannot be changed at runtime.
43
+ METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
44
+ 'false').lower() == 'true'
45
+
46
+ # Time spent processing a piece of code, refer to time_it().
47
+ SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
48
+ 'sky_apiserver_code_duration_seconds',
49
+ 'Time spent processing code',
50
+ ['name', 'group'],
51
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
52
+ 0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
53
+ 5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
54
+ 50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
55
+ 240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
56
+ 420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
57
+ 600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
58
+ 780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
59
+ 960.0, 980.0, 1000.0, float('inf')),
60
+ )
61
+
62
+ # Total number of API server requests, grouped by path, method, and status.
63
+ SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
64
+ 'sky_apiserver_requests_total',
65
+ 'Total number of API server requests',
66
+ ['path', 'method', 'status'],
67
+ )
68
+
69
# Bucket boundaries (in seconds) shared by the latency/duration histograms
# below. Fine-grained below 5s, 20s-wide steps from 120s up to 1000s.
_LATENCY_BUCKETS_SECONDS = (
    0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
    0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
    5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
    50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
    240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
    420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
    600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
    780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
    960.0, 980.0, 1000.0, float('inf'),
)

# Time spent processing API server requests, grouped by path, method, and
# status.
SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
    'sky_apiserver_request_duration_seconds',
    'Time spent processing API server requests',
    ['path', 'method', 'status'],
    buckets=_LATENCY_BUCKETS_SECONDS,
)

# Scheduling delay of the server event loop, labelled by process id.
SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
    'sky_apiserver_event_loop_lag_seconds',
    'Scheduling delay of the server event loop',
    ['pid'],
    buckets=_LATENCY_BUCKETS_SECONDS,
)

# Number of websocket connections, labelled by process id.
# NOTE(review): 'livesum' presumably aggregates over live processes only, as
# in prometheus_client's multiprocess mode -- confirm `prom` is that library.
SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
    'sky_apiserver_websocket_connections',
    'Number of websocket connections',
    ['pid'],
    multiprocess_mode='livesum',
)

# Count of closed websockets, labelled by process id and close reason.
SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
    'sky_apiserver_websocket_closed_total',
    'Number of websocket closed',
    ['pid', 'reason'],
)

# The number of execution starts in each worker process, we do not record
# histogram here as the duration has been measured in
# SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
# Recording histogram WITH worker label will cause high cardinality.
SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
    'sky_apiserver_process_execution_start_total',
    'Total number of execution starts in each worker process',
    ['request', 'pid'],
)

# Peak RSS per process, labelled by process id and type.
SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
    'sky_apiserver_process_peak_rss',
    'Peak RSS we saw in each process in last 30 seconds',
    ['pid', 'type'],
)

# CPU time per process, labelled by process id, type and mode.
SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
    'sky_apiserver_process_cpu_total',
    'Total CPU times a worker process has been running',
    ['pid', 'type', 'mode'],
)

# Peak memory usage of requests, labelled by request name.
SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
    'sky_apiserver_request_memory_usage_bytes',
    'Peak memory usage of requests', ['name'],
    buckets=_MEM_BUCKETS)

# RSS increment after requests, labelled by request name.
SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
    'sky_apiserver_request_rss_incr_bytes',
    'RSS increment after requests', ['name'],
    buckets=_MEM_BUCKETS)

# Round-trip latency of ssh messages over the websocket, labelled by process
# id. The adjacent string literals are concatenated, so each fragment must
# end with a space to keep the words of the help text separated.
SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS = prom.Histogram(
    'sky_apiserver_websocket_ssh_latency_seconds',
    ('Time taken for ssh message to go from client to API server and back '
     'to the client. This does not include: latency to reach the pod, '
     'overhead from sending through the k8s port-forward tunnel, or '
     'ssh server lag on the destination pod.'),
    ['pid'],
    buckets=_LATENCY_BUCKETS_SECONDS,
)

# Number of long-running request executors in the API server (no labels).
SKY_APISERVER_LONG_EXECUTORS = prom.Gauge(
    'sky_apiserver_long_executors',
    'Total number of long-running request executors in the API server',
)

# Number of short-running request executors in the API server (no labels).
SKY_APISERVER_SHORT_EXECUTORS = prom.Gauge(
    'sky_apiserver_short_executors',
    'Total number of short-running request executors in the API server',
)
173
+
174
+
175
@contextlib.contextmanager
def time_it(name: str, group: str = 'default'):
    """Context manager to measure and record code execution duration.

    When METRICS_ENABLED is falsy the wrapped code runs without any timing.
    Otherwise the elapsed time of the ``with`` body is observed on
    SKY_APISERVER_CODE_DURATION_SECONDS, including when the body raises.

    Args:
        name: Value of the ``name`` label attached to the observation.
        group: Value of the ``group`` label attached to the observation.
    """
    if not METRICS_ENABLED:
        yield
        return

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or go negative) when the system wall clock is adjusted mid-measurement.
    start_time = time.perf_counter()
    try:
        yield
    finally:
        duration = time.perf_counter() - start_time
        SKY_APISERVER_CODE_DURATION_SECONDS.labels(
            name=name, group=group).observe(duration)
188
+
189
+
190
def time_me(func):
    """Decorator that records how long each call of ``func`` takes.

    The duration is recorded through ``time_it`` under the name
    ``<module>/<function name>`` in the 'function' group. When
    METRICS_ENABLED is falsy the function is called directly.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if METRICS_ENABLED:
            label = f'{func.__module__}/{func.__name__}'
            with time_it(label, group='function'):
                result = func(*args, **kwargs)
            return result
        # Metrics disabled: plain pass-through.
        return func(*args, **kwargs)

    return wrapper
202
+
203
+
204
def time_me_async(func):
    """Decorator that records how long each await of async ``func`` takes.

    The duration is recorded through ``time_it`` under the name
    ``<module>/<function name>`` in the 'function' group. When
    METRICS_ENABLED is falsy the coroutine is awaited directly.
    """

    @functools.wraps(func)
    async def async_wrapper(*args, **kwargs):
        if METRICS_ENABLED:
            label = f'{func.__module__}/{func.__name__}'
            with time_it(label, group='function'):
                result = await func(*args, **kwargs)
            return result
        # Metrics disabled: plain pass-through.
        return await func(*args, **kwargs)

    return async_wrapper
9
216
 
10
217
 
11
218
  def start_svc_port_forward(context: str, namespace: str, service: str,
@@ -34,46 +241,72 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
34
241
  if 'KUBECONFIG' not in env:
35
242
  env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')
36
243
 
37
- # start the port forward process
38
- port_forward_process = subprocess.Popen(cmd,
39
- stdout=subprocess.PIPE,
40
- stderr=subprocess.STDOUT,
41
- text=True,
42
- env=env)
43
-
244
+ port_forward_process = None
245
+ port_forward_exit = False
44
246
  local_port = None
45
- start_time = time.time()
46
-
47
- # wait for the port forward to start and extract the local port
48
- while time.time() - start_time < start_port_forward_timeout:
49
- if port_forward_process.poll() is not None:
50
- # port forward process has terminated
51
- if port_forward_process.returncode != 0:
52
- raise RuntimeError(
53
- f'Port forward failed for service {service} in namespace '
54
- f'{namespace} on context {context}')
55
- break
56
-
57
- # read output line by line to find the local port
58
- if port_forward_process.stdout:
59
- line = port_forward_process.stdout.readline()
60
- if line:
61
- # look for 'Forwarding from 127.0.0.1:XXXXX -> service_port'
62
- match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', line)
247
+ poller = None
248
+ fd = None
249
+
250
+ try:
251
+ # start the port forward process
252
+ port_forward_process = subprocess.Popen(cmd,
253
+ stdout=subprocess.PIPE,
254
+ stderr=subprocess.STDOUT,
255
+ text=True,
256
+ env=env)
257
+
258
+ # Use poll() instead of select() to avoid FD_SETSIZE limit
259
+ poller = select.poll()
260
+ assert port_forward_process.stdout is not None
261
+ fd = port_forward_process.stdout.fileno()
262
+ poller.register(fd, select.POLLIN)
263
+
264
+ start_time = time.time()
265
+ buffer = ''
266
+ # wait for the port forward to start and extract the local port
267
+ while time.time() - start_time < start_port_forward_timeout:
268
+ if port_forward_process.poll() is not None:
269
+ # port forward process has terminated
270
+ if port_forward_process.returncode != 0:
271
+ port_forward_exit = True
272
+ break
273
+
274
+ # Wait up to 1000ms for data to be available without blocking
275
+ # poll() takes timeout in milliseconds
276
+ events = poller.poll(_SELECT_TIMEOUT * 1000)
277
+
278
+ if events:
279
+ # Read available bytes from the FD without blocking
280
+ raw = os.read(fd, _SELECT_BUFFER_SIZE)
281
+ chunk = raw.decode(errors='ignore')
282
+ buffer += chunk
283
+ match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
63
284
  if match:
64
285
  local_port = int(match.group(1))
65
286
  break
66
287
 
67
- # sleep for 100ms to avoid busy-waiting
68
- time.sleep(0.1)
69
-
288
+ # sleep for 100ms to avoid busy-waiting
289
+ time.sleep(0.1)
290
+ except BaseException: # pylint: disable=broad-exception-caught
291
+ if port_forward_process:
292
+ stop_svc_port_forward(port_forward_process,
293
+ timeout=terminate_port_forward_timeout)
294
+ raise
295
+ finally:
296
+ if poller is not None and fd is not None:
297
+ try:
298
+ poller.unregister(fd)
299
+ except (OSError, ValueError):
300
+ # FD may already be unregistered or invalid
301
+ pass
302
+ if port_forward_exit:
303
+ raise RuntimeError(f'Port forward failed for service {service} in '
304
+ f'namespace {namespace} on context {context}')
70
305
  if local_port is None:
71
306
  try:
72
- port_forward_process.terminate()
73
- port_forward_process.wait(timeout=terminate_port_forward_timeout)
74
- except subprocess.TimeoutExpired:
75
- port_forward_process.kill()
76
- port_forward_process.wait()
307
+ if port_forward_process:
308
+ stop_svc_port_forward(port_forward_process,
309
+ timeout=terminate_port_forward_timeout)
77
310
  finally:
78
311
  raise RuntimeError(
79
312
  f'Failed to extract local port for service {service} in '
@@ -82,14 +315,15 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
82
315
  return port_forward_process, local_port
83
316
 
84
317
 
85
- def stop_svc_port_forward(port_forward_process: subprocess.Popen) -> None:
318
+ def stop_svc_port_forward(port_forward_process: subprocess.Popen,
319
+ timeout: int = 5) -> None:
86
320
  """Stops a port forward to a service in a Kubernetes cluster.
87
321
  Args:
88
322
  port_forward_process: The subprocess.Popen process to terminate
89
323
  """
90
324
  try:
91
325
  port_forward_process.terminate()
92
- port_forward_process.wait(timeout=5)
326
+ port_forward_process.wait(timeout=timeout)
93
327
  except subprocess.TimeoutExpired:
94
328
  port_forward_process.kill()
95
329
  port_forward_process.wait()
@@ -122,8 +356,8 @@ async def send_metrics_request_with_port_forward(
122
356
  port_forward_process = None
123
357
  try:
124
358
  # Start port forward
125
- port_forward_process, local_port = start_svc_port_forward(
126
- context, namespace, service, service_port)
359
+ port_forward_process, local_port = await context_utils.to_thread(
360
+ start_svc_port_forward, context, namespace, service, service_port)
127
361
 
128
362
  # Build endpoint URL
129
363
  endpoint = f'http://localhost:{local_port}{endpoint_path}'
@@ -140,10 +374,15 @@ async def send_metrics_request_with_port_forward(
140
374
  response.raise_for_status()
141
375
  return response.text
142
376
 
377
+ except Exception as e: # pylint: disable=broad-exception-caught
378
+ logger.error(f'Failed to send metrics request with port forward: '
379
+ f'{common_utils.format_exception(e)}')
380
+ raise
143
381
  finally:
144
382
  # Always clean up port forward
145
383
  if port_forward_process:
146
- stop_svc_port_forward(port_forward_process)
384
+ await context_utils.to_thread(stop_svc_port_forward,
385
+ port_forward_process)
147
386
 
148
387
 
149
388
  async def add_cluster_name_label(metrics_text: str, context: str) -> str:
@@ -193,7 +432,11 @@ async def get_metrics_for_context(context: str) -> str:
193
432
  """
194
433
  # Query both DCGM metrics and kube_pod_labels metrics
195
434
  # This ensures the dashboard can perform joins to filter by skypilot cluster
196
- match_patterns = ['{__name__=~"DCGM_.*"}', 'kube_pod_labels']
435
+ match_patterns = [
436
+ '{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}', # pylint: disable=line-too-long
437
+ 'kube_pod_labels',
438
+ 'node_cpu_seconds_total{mode="idle"}'
439
+ ]
197
440
 
198
441
  # TODO(rohan): don't hardcode the namespace and service name
199
442
  metrics_text = await send_metrics_request_with_port_forward(
sky/models.py CHANGED
@@ -68,6 +68,8 @@ class KubernetesNodeInfo:
68
68
  free: Dict[str, int]
69
69
  # IP address of the node (external IP preferred, fallback to internal IP)
70
70
  ip_address: Optional[str] = None
71
+ # Whether the node is ready (all conditions are satisfied)
72
+ is_ready: bool = True
71
73
 
72
74
 
73
75
  @dataclasses.dataclass
sky/optimizer.py CHANGED
@@ -781,7 +781,7 @@ class Optimizer:
781
781
  def _instance_type_str(resources: 'resources_lib.Resources') -> str:
782
782
  instance_type = resources.instance_type
783
783
  assert instance_type is not None, 'Instance type must be specified'
784
- if isinstance(resources.cloud, clouds.Kubernetes):
784
+ if isinstance(resources.cloud, (clouds.Kubernetes, clouds.Slurm)):
785
785
  instance_type = '-'
786
786
  if resources.use_spot:
787
787
  instance_type = ''
@@ -865,11 +865,12 @@ class Optimizer:
865
865
  'use_spot': resources.use_spot
866
866
  }
867
867
 
868
- # Handle special case for Kubernetes and SSH clouds
869
- if isinstance(resources.cloud, clouds.Kubernetes):
868
+ # Handle special case for Kubernetes, SSH, and SLURM clouds
869
+ if isinstance(resources.cloud, (clouds.Kubernetes, clouds.Slurm)):
870
870
  # Region for Kubernetes-like clouds (SSH, Kubernetes) is the
871
- # context name, i.e. different Kubernetes clusters. We add
872
- # region to the key to show all the Kubernetes clusters in the
871
+ # context name, i.e. different Kubernetes clusters.
872
+ # Region for SLURM is the cluster name.
873
+ # We add region to the key to show all the clusters in the
873
874
  # optimizer table for better UX.
874
875
 
875
876
  if resources.cloud.__class__.__name__ == 'SSH':
@@ -1019,7 +1020,7 @@ class Optimizer:
1019
1020
  if res.instance_type is not None
1020
1021
  ])
1021
1022
  candidate_str = resources_utils.format_resource(
1022
- best_resources, simplify=True)
1023
+ best_resources, simplified_only=True)[0]
1023
1024
 
1024
1025
  logger.info(
1025
1026
  f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
sky/provision/__init__.py CHANGED
@@ -24,8 +24,12 @@ from sky.provision import kubernetes
24
24
  from sky.provision import lambda_cloud
25
25
  from sky.provision import nebius
26
26
  from sky.provision import oci
27
+ from sky.provision import primeintellect
27
28
  from sky.provision import runpod
28
29
  from sky.provision import scp
30
+ from sky.provision import seeweb
31
+ from sky.provision import shadeform
32
+ from sky.provision import slurm
29
33
  from sky.provision import ssh
30
34
  from sky.provision import vast
31
35
  from sky.provision import vsphere
@@ -77,6 +81,7 @@ def query_instances(
77
81
  cluster_name_on_cloud: str,
78
82
  provider_config: Optional[Dict[str, Any]] = None,
79
83
  non_terminated_only: bool = True,
84
+ retry_if_missing: bool = False,
80
85
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
81
86
  """Query instances.
82
87
 
@@ -85,6 +90,11 @@ def query_instances(
85
90
 
86
91
  A None status means the instance is marked as "terminated"
87
92
  or "terminating".
93
+
94
+ Args:
95
+ retry_if_missing: Whether to retry the call to the cloud api if the
96
+ cluster is not found when querying the live status on the cloud.
97
+ NOTE: This is currently only used on kubernetes.
88
98
  """
89
99
  raise NotImplementedError
90
100
 
@@ -140,7 +150,34 @@ def get_volume_usedby(
140
150
 
141
151
 
142
152
  @_route_to_cloud_impl
143
- def run_instances(provider_name: str, region: str, cluster_name_on_cloud: str,
153
def get_all_volumes_usedby(
    provider_name: str, configs: List[models.VolumeConfig]
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Get the usedby of all the given volumes.

    This generic entry point only raises NotImplementedError.
    NOTE(review): presumably the decorator applied to this function routes
    the call to a provider-specific implementation -- confirm.

    Args:
        provider_name: Name of the provider the volumes belong to.
        configs: Configs of the volumes to look up.

    Returns:
        usedby_pods: List of dictionaries, each containing the config keys for
                     a volume and a key containing pods using the volume.
                     These may include pods not created by SkyPilot.
        usedby_clusters: List of dictionaries, each containing the config keys
                         for a volume and a key containing clusters using
                         the volume.
        NOTE(review): the return annotation is a tuple of two dicts while the
        text above describes lists of dictionaries -- confirm the actual
        shape against the provider implementations.
    """
    raise NotImplementedError
167
+
168
+
169
@_route_to_cloud_impl
def map_all_volumes_usedby(
        provider_name: str, used_by_pods: Dict[str, Any],
        used_by_clusters: Dict[str, Any],
        config: models.VolumeConfig) -> Tuple[List[str], List[str]]:
    """Map the usedby resources of a volume.

    This generic entry point only raises NotImplementedError.
    NOTE(review): presumably @_route_to_cloud_impl routes the call to a
    provider-specific implementation -- confirm.

    Args:
        provider_name: Name of the provider the volume belongs to.
        used_by_pods: Mapping with pod usage information.
            NOTE(review): presumably the first value returned by
            get_all_volumes_usedby -- confirm against callers.
        used_by_clusters: Mapping with cluster usage information.
            NOTE(review): presumably the second value returned by
            get_all_volumes_usedby -- confirm against callers.
        config: Config of the single volume to resolve.

    Returns:
        A tuple of two lists of strings.
        NOTE(review): presumably (pod names, cluster names) using the
        volume -- confirm against the provider implementations.
    """
    raise NotImplementedError
176
+
177
+
178
+ @_route_to_cloud_impl
179
+ def run_instances(provider_name: str, region: str, cluster_name: str,
180
+ cluster_name_on_cloud: str,
144
181
  config: common.ProvisionConfig) -> common.ProvisionRecord:
145
182
  """Start instances with bootstrapped configuration."""
146
183
  raise NotImplementedError