skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/logs/agent.py CHANGED
@@ -34,23 +34,50 @@ class FluentbitAgent(LoggingAgent):
34
34
  def get_setup_command(self,
35
35
  cluster_name: resources_utils.ClusterName) -> str:
36
36
  install_cmd = (
37
- 'if ! command -v fluent-bit >/dev/null 2>&1; then '
38
- 'sudo apt-get install -y gnupg; '
39
37
  # pylint: disable=line-too-long
40
- 'curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh; '
38
+ 'if ! command -v fluent-bit >/dev/null 2>&1 && [ ! -f /opt/fluent-bit/bin/fluent-bit ]; then '
39
+ 'sudo apt-get update; sudo apt-get install -y gnupg; '
40
+ # pylint: disable=line-too-long
41
+ 'sudo sh -c \'curl -L https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
42
+ # pylint: disable=line-too-long
43
+ 'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
44
+ # pylint: disable=line-too-long
45
+ 'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
46
+ # pylint: disable=line-too-long
47
+ 'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
48
+ 'sudo apt-get update; '
49
+ 'sudo apt-get install -y fluent-bit; '
41
50
  'fi')
42
51
  cfg = self.fluentbit_config(cluster_name)
43
52
  cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
44
53
  config_cmd = (f'mkdir -p {constants.LOGGING_CONFIG_DIR} && '
45
54
  f'echo {shlex.quote(cfg)} > {cfg_path}')
55
+ kill_prior_cmd = (
56
+ 'if [ -f "/tmp/fluentbit.pid" ]; then '
57
+ # pylint: disable=line-too-long
58
+ 'echo "Killing prior fluent-bit process $(cat /tmp/fluentbit.pid)"; '
59
+ 'kill "$(cat /tmp/fluentbit.pid)" || true; '
60
+ 'fi')
46
61
  start_cmd = ('nohup $(command -v fluent-bit || '
47
62
  'echo "/opt/fluent-bit/bin/fluent-bit") '
48
- f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 &')
49
- return f'set -e; {install_cmd}; {config_cmd}; {start_cmd}'
63
+ f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 & '
64
+ 'echo $! > /tmp/fluentbit.pid')
65
+ return ('set -e; '
66
+ f'{install_cmd}; '
67
+ f'{config_cmd}; '
68
+ f'{kill_prior_cmd}; '
69
+ f'{start_cmd}')
50
70
 
51
71
  def fluentbit_config(self,
52
72
  cluster_name: resources_utils.ClusterName) -> str:
53
73
  cfg_dict = {
74
+ 'parsers': [{
75
+ 'name': 'sky-ray-parser',
76
+ 'format': 'regex',
77
+ # pylint: disable=line-too-long
78
+ 'regex': r'(?:\x1b\[[\d;]+m)?\((?<worker_name>[^,]+)(?:,\s*rank=(?<rank>\d+))?(?:,\s*pid=(?<pid>\d+))(?:,\s*ip=(?<ip>[\d.]+))?\)(?:\x1b\[[\d;]+m)?\s*(?<log_line>.*)',
79
+ 'types': 'rank:integer pid:integer',
80
+ }],
54
81
  'pipeline': {
55
82
  'inputs': [{
56
83
  'name': 'tail',
@@ -62,6 +89,14 @@ class FluentbitAgent(LoggingAgent):
62
89
  # right after the job completion.
63
90
  'refresh_interval': 1,
64
91
  }],
92
+ 'filters': [{
93
+ 'name': 'parser',
94
+ 'match': '*',
95
+ 'key_name': 'log',
96
+ 'parser': 'sky-ray-parser',
97
+ 'preserve_key': 'on', # preserve field for backwards compat
98
+ 'reserve_data': 'on',
99
+ }],
65
100
  'outputs': [self.fluentbit_output_config(cluster_name)],
66
101
  }
67
102
  }
sky/logs/aws.py CHANGED
@@ -5,7 +5,6 @@ from typing import Any, Dict, Optional
5
5
  import pydantic
6
6
 
7
7
  from sky.logs.agent import FluentbitAgent
8
- from sky.skylet import constants
9
8
  from sky.utils import resources_utils
10
9
  from sky.utils import yaml_utils
11
10
 
@@ -176,6 +175,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
176
175
  Returns:
177
176
  The Fluent Bit configuration as a YAML string.
178
177
  """
178
+ cfg_dict = yaml_utils.read_yaml_str(
179
+ super().fluentbit_config(cluster_name))
179
180
  display_name = cluster_name.display_name
180
181
  unique_name = cluster_name.name_on_cloud
181
182
  # Build tags for the log stream
@@ -197,24 +198,13 @@ class CloudwatchLoggingAgent(FluentbitAgent):
197
198
  'value': value
198
199
  })
199
200
 
200
- cfg_dict = {
201
- 'pipeline': {
202
- 'inputs': [{
203
- 'name': 'tail',
204
- 'path': f'{constants.SKY_LOGS_DIRECTORY}/*/*.log',
205
- 'path_key': 'log_path',
206
- # Shorten the refresh interval from 60s to 1s since every
207
- # job creates a new log file and we must be responsive
208
- # for this: the VM might be autodown within a minute
209
- # right after the job completion.
210
- 'refresh_interval': 1,
211
- 'processors': {
212
- 'logs': log_processors,
213
- }
214
- }],
215
- 'outputs': [self.fluentbit_output_config(cluster_name)],
216
- }
217
- }
201
+ # Add log processors to config
202
+ processors_config = cfg_dict['pipeline']['inputs'][0].get(
203
+ 'processors', {})
204
+ processors_logs_config = processors_config.get('logs', [])
205
+ processors_logs_config.extend(log_processors)
206
+ processors_config['logs'] = processors_logs_config
207
+ cfg_dict['pipeline']['inputs'][0]['processors'] = processors_config
218
208
 
219
209
  return yaml_utils.dump_yaml_str(cfg_dict)
220
210
 
sky/metrics/utils.py CHANGED
@@ -1,11 +1,218 @@
1
1
  """Utilities for processing GPU metrics from Kubernetes clusters."""
2
+ import contextlib
3
+ import functools
2
4
  import os
3
5
  import re
6
+ import select
4
7
  import subprocess
5
8
  import time
6
9
  from typing import List, Optional, Tuple
7
10
 
8
11
  import httpx
12
+ import prometheus_client as prom
13
+
14
+ from sky import sky_logging
15
+ from sky.skylet import constants
16
+ from sky.utils import common_utils
17
+ from sky.utils import context_utils
18
+
19
+ _SELECT_TIMEOUT = 1
20
+ _SELECT_BUFFER_SIZE = 4096
21
+
22
+ _KB = 2**10
23
+ _MB = 2**20
24
+ _MEM_BUCKETS = [
25
+ _KB,
26
+ 256 * _KB,
27
+ 512 * _KB,
28
+ _MB,
29
+ 2 * _MB,
30
+ 4 * _MB,
31
+ 8 * _MB,
32
+ 16 * _MB,
33
+ 32 * _MB,
34
+ 64 * _MB,
35
+ 128 * _MB,
36
+ 256 * _MB,
37
+ float('inf'),
38
+ ]
39
+
40
+ logger = sky_logging.init_logger(__name__)
41
+
42
+ # Whether the metrics are enabled, cannot be changed at runtime.
43
+ METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
44
+ 'false').lower() == 'true'
45
+
46
+ # Time spent processing a piece of code, refer to time_it().
47
+ SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
48
+ 'sky_apiserver_code_duration_seconds',
49
+ 'Time spent processing code',
50
+ ['name', 'group'],
51
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
52
+ 0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
53
+ 5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
54
+ 50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
55
+ 240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
56
+ 420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
57
+ 600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
58
+ 780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
59
+ 960.0, 980.0, 1000.0, float('inf')),
60
+ )
61
+
62
+ # Total number of API server requests, grouped by path, method, and status.
63
+ SKY_APISERVER_REQUESTS_TOTAL = prom.Counter(
64
+ 'sky_apiserver_requests_total',
65
+ 'Total number of API server requests',
66
+ ['path', 'method', 'status'],
67
+ )
68
+
69
+ # Time spent processing API server requests, grouped by path, method, and
70
+ # status.
71
+ SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
72
+ 'sky_apiserver_request_duration_seconds',
73
+ 'Time spent processing API server requests',
74
+ ['path', 'method', 'status'],
75
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
76
+ 0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
77
+ 5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
78
+ 50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
79
+ 240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
80
+ 420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
81
+ 600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
82
+ 780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
83
+ 960.0, 980.0, 1000.0, float('inf')),
84
+ )
85
+
86
+ SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
87
+ 'sky_apiserver_event_loop_lag_seconds',
88
+ 'Scheduling delay of the server event loop',
89
+ ['pid'],
90
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
91
+ 0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
92
+ 5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
93
+ 50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
94
+ 240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
95
+ 420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
96
+ 600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
97
+ 780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
98
+ 960.0, 980.0, 1000.0, float('inf')),
99
+ )
100
+
101
+ SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
102
+ 'sky_apiserver_websocket_connections',
103
+ 'Number of websocket connections',
104
+ ['pid'],
105
+ multiprocess_mode='livesum',
106
+ )
107
+
108
+ SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
109
+ 'sky_apiserver_websocket_closed_total',
110
+ 'Number of websocket closed',
111
+ ['pid', 'reason'],
112
+ )
113
+
114
+ # The number of execution starts in each worker process, we do not record
115
+ # histogram here as the duration has been measured in
116
+ # SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
117
+ # Recording histogram WITH worker label will cause high cardinality.
118
+ SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
119
+ 'sky_apiserver_process_execution_start_total',
120
+ 'Total number of execution starts in each worker process',
121
+ ['request', 'pid'],
122
+ )
123
+
124
+ SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
125
+ 'sky_apiserver_process_peak_rss',
126
+ 'Peak RSS we saw in each process in last 30 seconds',
127
+ ['pid', 'type'],
128
+ )
129
+
130
+ SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
131
+ 'sky_apiserver_process_cpu_total',
132
+ 'Total CPU times a worker process has been running',
133
+ ['pid', 'type', 'mode'],
134
+ )
135
+
136
+ SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
137
+ 'sky_apiserver_request_memory_usage_bytes',
138
+ 'Peak memory usage of requests', ['name'],
139
+ buckets=_MEM_BUCKETS)
140
+
141
+ SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
142
+ 'sky_apiserver_request_rss_incr_bytes',
143
+ 'RSS increment after requests', ['name'],
144
+ buckets=_MEM_BUCKETS)
145
+
146
+ SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS = prom.Histogram(
147
+ 'sky_apiserver_websocket_ssh_latency_seconds',
148
+ ('Time taken for ssh message to go from client to API server and back'
149
+ 'to the client. This does not include: latency to reach the pod, '
150
+ 'overhead from sending through the k8s port-forward tunnel, or '
151
+ 'ssh server lag on the destination pod.'),
152
+ ['pid'],
153
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
154
+ 0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
155
+ 5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
156
+ 50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
157
+ 240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
158
+ 420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
159
+ 600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
160
+ 780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
161
+ 960.0, 980.0, 1000.0, float('inf')),
162
+ )
163
+
164
+ SKY_APISERVER_LONG_EXECUTORS = prom.Gauge(
165
+ 'sky_apiserver_long_executors',
166
+ 'Total number of long-running request executors in the API server',
167
+ )
168
+
169
+ SKY_APISERVER_SHORT_EXECUTORS = prom.Gauge(
170
+ 'sky_apiserver_short_executors',
171
+ 'Total number of short-running request executors in the API server',
172
+ )
173
+
174
+
175
+ @contextlib.contextmanager
176
+ def time_it(name: str, group: str = 'default'):
177
+ """Context manager to measure and record code execution duration."""
178
+ if not METRICS_ENABLED:
179
+ yield
180
+ else:
181
+ start_time = time.time()
182
+ try:
183
+ yield
184
+ finally:
185
+ duration = time.time() - start_time
186
+ SKY_APISERVER_CODE_DURATION_SECONDS.labels(
187
+ name=name, group=group).observe(duration)
188
+
189
+
190
+ def time_me(func):
191
+ """Measure the duration of decorated function."""
192
+
193
+ @functools.wraps(func)
194
+ def wrapper(*args, **kwargs):
195
+ if not METRICS_ENABLED:
196
+ return func(*args, **kwargs)
197
+ name = f'{func.__module__}/{func.__name__}'
198
+ with time_it(name, group='function'):
199
+ return func(*args, **kwargs)
200
+
201
+ return wrapper
202
+
203
+
204
+ def time_me_async(func):
205
+ """Measure the duration of decorated async function."""
206
+
207
+ @functools.wraps(func)
208
+ async def async_wrapper(*args, **kwargs):
209
+ if not METRICS_ENABLED:
210
+ return await func(*args, **kwargs)
211
+ name = f'{func.__module__}/{func.__name__}'
212
+ with time_it(name, group='function'):
213
+ return await func(*args, **kwargs)
214
+
215
+ return async_wrapper
9
216
 
10
217
 
11
218
  def start_svc_port_forward(context: str, namespace: str, service: str,
@@ -34,46 +241,72 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
34
241
  if 'KUBECONFIG' not in env:
35
242
  env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')
36
243
 
37
- # start the port forward process
38
- port_forward_process = subprocess.Popen(cmd,
39
- stdout=subprocess.PIPE,
40
- stderr=subprocess.STDOUT,
41
- text=True,
42
- env=env)
43
-
244
+ port_forward_process = None
245
+ port_forward_exit = False
44
246
  local_port = None
45
- start_time = time.time()
46
-
47
- # wait for the port forward to start and extract the local port
48
- while time.time() - start_time < start_port_forward_timeout:
49
- if port_forward_process.poll() is not None:
50
- # port forward process has terminated
51
- if port_forward_process.returncode != 0:
52
- raise RuntimeError(
53
- f'Port forward failed for service {service} in namespace '
54
- f'{namespace} on context {context}')
55
- break
56
-
57
- # read output line by line to find the local port
58
- if port_forward_process.stdout:
59
- line = port_forward_process.stdout.readline()
60
- if line:
61
- # look for 'Forwarding from 127.0.0.1:XXXXX -> service_port'
62
- match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', line)
247
+ poller = None
248
+ fd = None
249
+
250
+ try:
251
+ # start the port forward process
252
+ port_forward_process = subprocess.Popen(cmd,
253
+ stdout=subprocess.PIPE,
254
+ stderr=subprocess.STDOUT,
255
+ text=True,
256
+ env=env)
257
+
258
+ # Use poll() instead of select() to avoid FD_SETSIZE limit
259
+ poller = select.poll()
260
+ assert port_forward_process.stdout is not None
261
+ fd = port_forward_process.stdout.fileno()
262
+ poller.register(fd, select.POLLIN)
263
+
264
+ start_time = time.time()
265
+ buffer = ''
266
+ # wait for the port forward to start and extract the local port
267
+ while time.time() - start_time < start_port_forward_timeout:
268
+ if port_forward_process.poll() is not None:
269
+ # port forward process has terminated
270
+ if port_forward_process.returncode != 0:
271
+ port_forward_exit = True
272
+ break
273
+
274
+ # Wait up to 1000ms for data to be available without blocking
275
+ # poll() takes timeout in milliseconds
276
+ events = poller.poll(_SELECT_TIMEOUT * 1000)
277
+
278
+ if events:
279
+ # Read available bytes from the FD without blocking
280
+ raw = os.read(fd, _SELECT_BUFFER_SIZE)
281
+ chunk = raw.decode(errors='ignore')
282
+ buffer += chunk
283
+ match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
63
284
  if match:
64
285
  local_port = int(match.group(1))
65
286
  break
66
287
 
67
- # sleep for 100ms to avoid busy-waiting
68
- time.sleep(0.1)
69
-
288
+ # sleep for 100ms to avoid busy-waiting
289
+ time.sleep(0.1)
290
+ except BaseException: # pylint: disable=broad-exception-caught
291
+ if port_forward_process:
292
+ stop_svc_port_forward(port_forward_process,
293
+ timeout=terminate_port_forward_timeout)
294
+ raise
295
+ finally:
296
+ if poller is not None and fd is not None:
297
+ try:
298
+ poller.unregister(fd)
299
+ except (OSError, ValueError):
300
+ # FD may already be unregistered or invalid
301
+ pass
302
+ if port_forward_exit:
303
+ raise RuntimeError(f'Port forward failed for service {service} in '
304
+ f'namespace {namespace} on context {context}')
70
305
  if local_port is None:
71
306
  try:
72
- port_forward_process.terminate()
73
- port_forward_process.wait(timeout=terminate_port_forward_timeout)
74
- except subprocess.TimeoutExpired:
75
- port_forward_process.kill()
76
- port_forward_process.wait()
307
+ if port_forward_process:
308
+ stop_svc_port_forward(port_forward_process,
309
+ timeout=terminate_port_forward_timeout)
77
310
  finally:
78
311
  raise RuntimeError(
79
312
  f'Failed to extract local port for service {service} in '
@@ -82,14 +315,15 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
82
315
  return port_forward_process, local_port
83
316
 
84
317
 
85
- def stop_svc_port_forward(port_forward_process: subprocess.Popen) -> None:
318
+ def stop_svc_port_forward(port_forward_process: subprocess.Popen,
319
+ timeout: int = 5) -> None:
86
320
  """Stops a port forward to a service in a Kubernetes cluster.
87
321
  Args:
88
322
  port_forward_process: The subprocess.Popen process to terminate
89
323
  """
90
324
  try:
91
325
  port_forward_process.terminate()
92
- port_forward_process.wait(timeout=5)
326
+ port_forward_process.wait(timeout=timeout)
93
327
  except subprocess.TimeoutExpired:
94
328
  port_forward_process.kill()
95
329
  port_forward_process.wait()
@@ -122,8 +356,8 @@ async def send_metrics_request_with_port_forward(
122
356
  port_forward_process = None
123
357
  try:
124
358
  # Start port forward
125
- port_forward_process, local_port = start_svc_port_forward(
126
- context, namespace, service, service_port)
359
+ port_forward_process, local_port = await context_utils.to_thread(
360
+ start_svc_port_forward, context, namespace, service, service_port)
127
361
 
128
362
  # Build endpoint URL
129
363
  endpoint = f'http://localhost:{local_port}{endpoint_path}'
@@ -140,10 +374,15 @@ async def send_metrics_request_with_port_forward(
140
374
  response.raise_for_status()
141
375
  return response.text
142
376
 
377
+ except Exception as e: # pylint: disable=broad-exception-caught
378
+ logger.error(f'Failed to send metrics request with port forward: '
379
+ f'{common_utils.format_exception(e)}')
380
+ raise
143
381
  finally:
144
382
  # Always clean up port forward
145
383
  if port_forward_process:
146
- stop_svc_port_forward(port_forward_process)
384
+ await context_utils.to_thread(stop_svc_port_forward,
385
+ port_forward_process)
147
386
 
148
387
 
149
388
  async def add_cluster_name_label(metrics_text: str, context: str) -> str:
@@ -193,7 +432,11 @@ async def get_metrics_for_context(context: str) -> str:
193
432
  """
194
433
  # Query both DCGM metrics and kube_pod_labels metrics
195
434
  # This ensures the dashboard can perform joins to filter by skypilot cluster
196
- match_patterns = ['{__name__=~"DCGM_.*"}', 'kube_pod_labels']
435
+ match_patterns = [
436
+ '{__name__=~"node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|DCGM_.*"}', # pylint: disable=line-too-long
437
+ 'kube_pod_labels',
438
+ 'node_cpu_seconds_total{mode="idle"}'
439
+ ]
197
440
 
198
441
  # TODO(rohan): don't hardcode the namespace and service name
199
442
  metrics_text = await send_metrics_request_with_port_forward(
sky/optimizer.py CHANGED
@@ -1019,7 +1019,7 @@ class Optimizer:
1019
1019
  if res.instance_type is not None
1020
1020
  ])
1021
1021
  candidate_str = resources_utils.format_resource(
1022
- best_resources, simplify=True)
1022
+ best_resources, simplified_only=True)[0]
1023
1023
 
1024
1024
  logger.info(
1025
1025
  f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
sky/provision/__init__.py CHANGED
@@ -24,8 +24,11 @@ from sky.provision import kubernetes
24
24
  from sky.provision import lambda_cloud
25
25
  from sky.provision import nebius
26
26
  from sky.provision import oci
27
+ from sky.provision import primeintellect
27
28
  from sky.provision import runpod
28
29
  from sky.provision import scp
30
+ from sky.provision import seeweb
31
+ from sky.provision import shadeform
29
32
  from sky.provision import ssh
30
33
  from sky.provision import vast
31
34
  from sky.provision import vsphere
@@ -77,6 +80,7 @@ def query_instances(
77
80
  cluster_name_on_cloud: str,
78
81
  provider_config: Optional[Dict[str, Any]] = None,
79
82
  non_terminated_only: bool = True,
83
+ retry_if_missing: bool = False,
80
84
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
81
85
  """Query instances.
82
86
 
@@ -85,6 +89,11 @@ def query_instances(
85
89
 
86
90
  A None status means the instance is marked as "terminated"
87
91
  or "terminating".
92
+
93
+ Args:
94
+ retry_if_missing: Whether to retry the call to the cloud api if the
95
+ cluster is not found when querying the live status on the cloud.
96
+ NOTE: This is currently only used on kubernetes.
88
97
  """
89
98
  raise NotImplementedError
90
99
 
@@ -140,7 +149,34 @@ def get_volume_usedby(
140
149
 
141
150
 
142
151
  @_route_to_cloud_impl
143
- def run_instances(provider_name: str, region: str, cluster_name_on_cloud: str,
152
+ def get_all_volumes_usedby(
153
+ provider_name: str, configs: List[models.VolumeConfig]
154
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
155
+ """Get the usedby of a volume.
156
+
157
+ Returns:
158
+ usedby_pods: List of dictionaries, each containing the config keys for
159
+ a volume and a key containing pods using the volume.
160
+ These may include pods not created by SkyPilot.
161
+ usedby_clusters: List of dictionaries, each containing the config keys
162
+ for a volume and a key containing clusters using
163
+ the volume.
164
+ """
165
+ raise NotImplementedError
166
+
167
+
168
+ @_route_to_cloud_impl
169
+ def map_all_volumes_usedby(
170
+ provider_name: str, used_by_pods: Dict[str, Any],
171
+ used_by_clusters: Dict[str, Any],
172
+ config: models.VolumeConfig) -> Tuple[List[str], List[str]]:
173
+ """Map the usedby resources of a volume."""
174
+ raise NotImplementedError
175
+
176
+
177
+ @_route_to_cloud_impl
178
+ def run_instances(provider_name: str, region: str, cluster_name: str,
179
+ cluster_name_on_cloud: str,
144
180
  config: common.ProvisionConfig) -> common.ProvisionRecord:
145
181
  """Start instances with bootstrapped configuration."""
146
182
  raise NotImplementedError
@@ -305,7 +305,10 @@ def _get_route_tables(ec2: 'mypy_boto3_ec2.ServiceResource',
305
305
  Returns:
306
306
  A list of route tables associated with the options VPC and region
307
307
  """
308
- filters = [{'Name': 'association.main', 'Values': [str(main).lower()]}]
308
+ filters: List['ec2_type_defs.FilterTypeDef'] = [{
309
+ 'Name': 'association.main',
310
+ 'Values': [str(main).lower()],
311
+ }]
309
312
  if vpc_id is not None:
310
313
  filters.append({'Name': 'vpc-id', 'Values': [vpc_id]})
311
314
  logger.debug(
@@ -406,10 +409,26 @@ def _usable_subnets(
406
409
  s for s in candidate_subnets if s.vpc_id == vpc_id_of_sg
407
410
  ]
408
411
 
412
+ if not candidate_subnets:
413
+ _skypilot_log_error_and_exit_for_failover(
414
+ 'No candidate subnets found in specified VPC '
415
+ f'{vpc_id_of_sg}.')
416
+
409
417
  available_subnets = [
410
418
  s for s in candidate_subnets if s.state == 'available'
411
419
  ]
412
420
 
421
+ if not available_subnets:
422
+ _skypilot_log_error_and_exit_for_failover(
423
+ 'All candidate subnets are pending in specified VPC '
424
+ f'{vpc_id_of_sg}.')
425
+
426
+ if len(candidate_subnets) > len(available_subnets):
427
+ num_pruned = len(candidate_subnets) - len(available_subnets)
428
+ logger.debug(
429
+ f'{num_pruned} candidate subnets pruned since they are not '
430
+ 'available.')
431
+
413
432
  if use_internal_ips:
414
433
  # Get private subnets.
415
434
  #
@@ -421,6 +440,10 @@ def _usable_subnets(
421
440
  if not _is_subnet_public(ec2, s.subnet_id, vpc_id_of_sg) and
422
441
  not s.map_public_ip_on_launch
423
442
  ]
443
+ if not subnets:
444
+ _skypilot_log_error_and_exit_for_failover(
445
+ 'The use_internal_ips option is set to True, but all '
446
+ 'candidate subnets are public.')
424
447
  else:
425
448
  # Get public subnets.
426
449
  #
@@ -436,6 +459,10 @@ def _usable_subnets(
436
459
  s for s in available_subnets
437
460
  if _is_subnet_public(ec2, s.subnet_id, vpc_id_of_sg)
438
461
  ]
462
+ if not subnets:
463
+ _skypilot_log_error_and_exit_for_failover(
464
+ 'All candidate subnets are private, did you mean to '
465
+ 'set use_internal_ips to True?')
439
466
 
440
467
  subnets = sorted(
441
468
  subnets,
@@ -449,18 +476,7 @@ def _usable_subnets(
449
476
  'Failed to fetch available subnets from AWS.')
450
477
  raise exc
451
478
 
452
- if not subnets:
453
- vpc_msg = (f'Does a default VPC exist in region '
454
- f'{ec2.meta.client.meta.region_name}? ') if (
455
- vpc_id_of_sg is None) else ''
456
- _skypilot_log_error_and_exit_for_failover(
457
- f'No usable subnets found. {vpc_msg}'
458
- 'Try manually creating an instance in your specified region to '
459
- 'populate the list of subnets and try again. '
460
- 'Note that the subnet must map public IPs '
461
- 'on instance launch unless you set `use_internal_ips: true` in '
462
- 'the `provider` config.')
463
- elif _are_user_subnets_pruned(subnets):
479
+ if _are_user_subnets_pruned(subnets):
464
480
  _skypilot_log_error_and_exit_for_failover(
465
481
  f'The specified subnets are not '
466
482
  f'usable: {_get_pruned_subnets(subnets)}')
@@ -579,6 +595,11 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
579
595
  # not want SkyPilot to use.
580
596
  if vpc_id_of_sg is None:
581
597
  all_subnets = [s for s in all_subnets if s.vpc.is_default]
598
+ if not all_subnets:
599
+ _skypilot_log_error_and_exit_for_failover(
600
+ f'The default VPC in {region} either does not exist or '
601
+ 'has no subnets.')
602
+
582
603
  subnets, vpc_id = _usable_subnets(
583
604
  ec2,
584
605
  user_specified_subnets=None,