skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,379 @@
1
+ #!/bin/bash
2
+ # ssh-tunnel.sh - SSH tunnel script for Kubernetes API access
3
+ # Used as kubectl exec credential plugin to establish SSH tunnel on demand.
4
+ # Returns a valid credential format for kubectl with expiration. The expiration
5
+ # is calculated based on the TTL argument and is required to force kubectl to
6
+ # check the tunnel status frequently.
7
+
8
+ # Usage: ssh-tunnel.sh --host HOST [--user USER] [--use-ssh-config] [--ssh-key KEY] [--context CONTEXT] [--port PORT] [--ttl SECONDS]
9
+
10
+ # Default time-to-live for credential in seconds
11
+ # This forces kubectl to check the tunnel status frequently
12
+ TTL_SECONDS=30
13
+
14
+ # Parse arguments
15
+ USE_SSH_CONFIG=0
16
+ SSH_KEY=""
17
+ CONTEXT=""
18
+ HOST=""
19
+ USER=""
20
+ PORT=6443 # Default port if not specified
21
+
22
# Append a timestamped line to the tunnel debug log.
#
# Arguments:
#   $1 - message text
# Globals:
#   LOG_FILE - destination file (per the original note this is
#              ~/.sky/ssh_node_pools_info/$CONTEXT-tunnel.log; it is assigned
#              outside this function)
debug_log() {
    # Prefix the message with the current date, then append it as one line.
    printf '%s\n' "$(date): $1" >> "$LOG_FILE"
}
27
+
28
# Print the credential expiration time: now + TTL_SECONDS, in UTC, formatted
# as YYYY-MM-DDTHH:MM:SSZ.
#
# Globals:
#   TTL_SECONDS - lifetime of the credential in seconds
# Outputs:
#   The timestamp on stdout.
generate_expiration_timestamp() {
    local stamp_format="+%Y-%m-%dT%H:%M:%SZ"

    # BSD/macOS date understands -v+<N>S; when that invocation fails
    # (e.g. GNU date on Linux), fall back to the -d "+<N> seconds" form.
    if ! date -u -v+${TTL_SECONDS}S "$stamp_format" 2>/dev/null; then
        date -u -d "+${TTL_SECONDS} seconds" "$stamp_format"
    fi
}
33
+
34
# Acquire the tunnel-startup lock.
#
# Globals:
#   LOCK_FILE - path of the lock file (assigned outside this function)
# Returns:
#   0 if this process now holds the lock,
#   1 if another process is already holding it.
#
# Two mechanisms are used:
#   * flock(1), when available: a non-blocking lock on file descriptor 9,
#     which stays open (and therefore locked) until release_lock closes it.
#   * Otherwise a PID file: the lock file contains the PID of the holder.
#     A lock whose PID is empty or no longer alive is treated as stale.
acquire_lock() {
    local lock_pid

    if command -v flock >/dev/null 2>&1; then
        # Use flock for better locking
        exec 9>"$LOCK_FILE"
        if ! flock -n 9; then
            debug_log "Another process is starting the tunnel, waiting briefly"
            return 1
        fi
        return 0
    fi

    debug_log "flock command not available, using alternative lock mechanism"

    # Simple file-based locking
    if [ -f "$LOCK_FILE" ]; then
        lock_pid=$(cat "$LOCK_FILE" 2>/dev/null)
        if [ -n "$lock_pid" ] && kill -0 "$lock_pid" 2>/dev/null; then
            debug_log "Another process ($lock_pid) is starting the tunnel, waiting briefly"
            return 1
        fi
        # Stale lock file
        debug_log "Removing stale lock file"
        rm -f "$LOCK_FILE"
    fi

    # Create our lock atomically. With noclobber the redirection refuses to
    # overwrite an existing file, so if another process created the lock
    # between the check above and this point, we fail instead of both
    # processes believing they hold the lock. The subshell keeps noclobber
    # from leaking into the rest of the script; $$ still expands to the PID
    # of the main shell inside it.
    if ! ( set -o noclobber; echo "$$" > "$LOCK_FILE" ) 2>/dev/null; then
        debug_log "Another process is starting the tunnel, waiting briefly"
        return 1
    fi
    return 0
}
64
+
65
# Release the lock taken by acquire_lock, using whichever mechanism matches
# the availability of flock(1), then record the release in the debug log.
#
# Globals:
#   LOCK_FILE - path of the lock file (only removed in the non-flock case)
release_lock() {
    if ! command -v flock >/dev/null 2>&1; then
        # Simple lock: the lock is the file itself, so delete it.
        rm -f "$LOCK_FILE"
    else
        # flock: closing file descriptor 9 drops the lock.
        exec 9>&-
    fi
    debug_log "Lock released"
}
76
+
77
# Build the tunnel command line into the global array SSH_CMD.
#
# Globals read:
#   USE_SSH_CONFIG - 1: connect to "$HOST" only, with no explicit user, key
#                    or host-key options
#   SSH_KEY        - identity file, added as "-i KEY" when non-empty
#   USER, HOST     - destination, used as "$USER@$HOST" when USE_SSH_CONFIG != 1
#   PORT           - local port forwarded to 127.0.0.1:6443 on the remote side
# Globals written:
#   SSH_CMD        - argv array, starting with "autossh -M 0" when autossh is
#                    installed and with plain "ssh" otherwise
generate_ssh_command() {
    local -a launcher
    local -a keepalive_opts=(
        "-o" "ServerAliveInterval=30"
        "-o" "ServerAliveCountMax=3"
        "-o" "ExitOnForwardFailure=yes"
    )
    local -a forward_args=("-L" "$PORT:127.0.0.1:6443" "-N")

    # Prefer autossh; fall back to regular ssh (with a logged warning).
    if command -v autossh >/dev/null 2>&1; then
        launcher=("autossh" "-M" "0")
    else
        debug_log "WARNING: autossh is not installed but recommended for reliable SSH tunnels"
        debug_log "Install autossh: brew install autossh (macOS), apt-get install autossh (Ubuntu/Debian)"
        launcher=("ssh")
    fi

    # With USE_SSH_CONFIG=1 only the bare host is passed.
    if [[ $USE_SSH_CONFIG -eq 1 ]]; then
        SSH_CMD=("${launcher[@]}" "${keepalive_opts[@]}" "${forward_args[@]}" "$HOST")
        return
    fi

    # Explicit connection: skip host-key prompts and offer only the given key.
    SSH_CMD=(
        "${launcher[@]}"
        "-o" "StrictHostKeyChecking=no"
        "-o" "IdentitiesOnly=yes"
        "${keepalive_opts[@]}"
        "${forward_args[@]}"
    )

    # Add SSH key if provided
    if [[ -n "$SSH_KEY" ]]; then
        SSH_CMD+=("-i" "$SSH_KEY")
    fi

    # Add user@host
    SSH_CMD+=("$USER@$HOST")
}
115
+
116
+ # Function to read certificate files if they exist
117
read_certificate_data() {
    # Prints "<client cert PEM>:<client key PEM>" on stdout for the current
    # context. A side is empty when its file under $TUNNEL_DIR does not exist.
    # A file with no BEGIN marker is rewritten in place, wrapped in BEGIN/END
    # lines, before its contents are returned.
    local cert_path="$TUNNEL_DIR/$CONTEXT-cert.pem"
    local key_path="$TUNNEL_DIR/$CONTEXT-key.pem"
    local cert_pem=""
    local key_pem=""
    local first_line
    local last_line

    if [[ -f "$cert_path" ]]; then
        # The file is taken verbatim; it is expected to be PEM already.
        cert_pem=$(cat "$cert_path")
        debug_log "Found client certificate data for context $CONTEXT"

        # Log the first and last lines so the PEM framing can be checked.
        first_line=$(head -1 "$cert_path")
        last_line=$(tail -1 "$cert_path")
        debug_log "Certificate starts with: $first_line"
        debug_log "Certificate ends with: $last_line"

        if ! grep -q "BEGIN CERTIFICATE" "$cert_path"; then
            debug_log "WARNING: Certificate file may not be in proper PEM format"
            # No BEGIN marker: wrap the existing content in both markers.
            {
                echo "-----BEGIN CERTIFICATE-----"
                cat "$cert_path"
                echo "-----END CERTIFICATE-----"
            } > "$cert_path.fixed"
            mv "$cert_path.fixed" "$cert_path"
            cert_pem=$(cat "$cert_path")
            debug_log "Fixed certificate format by adding BEGIN/END markers"
        elif ! grep -q "END CERTIFICATE" "$cert_path"; then
            # BEGIN present but END missing: warn only, the file is left alone.
            debug_log "WARNING: Certificate file may not be in proper PEM format"
        fi
    fi

    if [[ -f "$key_path" ]]; then
        # The file is taken verbatim; it is expected to be PEM already.
        key_pem=$(cat "$key_path")
        debug_log "Found client key data for context $CONTEXT"

        # Log the first and last lines so the PEM framing can be checked.
        first_line=$(head -1 "$key_path")
        last_line=$(tail -1 "$key_path")
        debug_log "Key starts with: $first_line"
        debug_log "Key ends with: $last_line"

        if ! grep -q "BEGIN" "$key_path"; then
            debug_log "WARNING: Key file may not be in proper PEM format"
            # No BEGIN marker: wrap the existing content in both markers.
            {
                echo "-----BEGIN PRIVATE KEY-----"
                cat "$key_path"
                echo "-----END PRIVATE KEY-----"
            } > "$key_path.fixed"
            mv "$key_path.fixed" "$key_path"
            key_pem=$(cat "$key_path")
            debug_log "Fixed key format by adding BEGIN/END markers"
        elif ! grep -q "END" "$key_path"; then
            # BEGIN present but END missing: warn only, the file is left alone.
            debug_log "WARNING: Key file may not be in proper PEM format"
        fi
    fi

    printf '%s:%s\n' "$cert_pem" "$key_pem"
}
177
+
178
+ # Function to generate credentials JSON
179
generate_credentials_json() {
    # Prints a client.authentication.k8s.io/v1beta1 ExecCredential JSON
    # document on stdout.
    #   - If both a client certificate and a client key are available, they
    #     are embedded as clientCertificateData / clientKeyData.
    #   - Otherwise a fixed placeholder token is emitted.
    # Exits the script with status 1 if certificate data is present but jq is
    # not installed.
    #
    # Declarations are kept separate from the command substitutions so that
    # `local` does not mask their exit status.
    local expiration_time cert_bundle client_cert_data client_key_data
    expiration_time=$(generate_expiration_timestamp)
    cert_bundle=$(read_certificate_data)

    # read_certificate_data prints "<cert>:<key>".
    # NOTE(review): the cert is cut at the LAST ':' and the key at the FIRST
    # ':', so the two halves only agree when neither PEM blob itself contains
    # a colon (e.g. no "Proc-Type:" / "Bag Attributes" headers).
    client_cert_data=${cert_bundle%:*}
    client_key_data=${cert_bundle#*:}

    if [[ -z "$client_cert_data" || -z "$client_key_data" ]]; then
        # Fallback to token-based credential for tunnel-only authentication
        echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"k8s-ssh-tunnel-token\",\"expirationTimestamp\":\"$expiration_time\"}}"
        return 0
    fi

    # Debug the certificate data
    debug_log "Certificate data length: $(echo -n "$client_cert_data" | wc -c) bytes"
    debug_log "Key data length: $(echo -n "$client_key_data" | wc -c) bytes"

    # jq is required to turn the PEM blobs into valid JSON strings.
    if ! command -v jq &>/dev/null; then
        echo "jq is not installed. Please install jq to use this script." >&2
        exit 1
    fi
    debug_log "Using jq for JSON formatting"

    # Encode each PEM blob as a JSON string. The data is fed to jq on stdin
    # (printf is a shell builtin), so the private key never appears in a
    # process argument list and is never written to a temporary file.
    local cert_json key_json
    cert_json=$(printf '%s' "$client_cert_data" | jq -R -s .)
    key_json=$(printf '%s' "$client_key_data" | jq -R -s .)

    # Output the JSON
    printf '{\n  "apiVersion": "client.authentication.k8s.io/v1beta1",\n  "kind": "ExecCredential",\n  "status": {\n    "clientCertificateData": %s,\n    "clientKeyData": %s,\n    "expirationTimestamp": "%s"\n  }\n}\n' \
        "$cert_json" "$key_json" "$expiration_time"
}
226
+
227
# Parse command-line options into the script's global settings.
#   --use-ssh-config      flag, sets USE_SSH_CONFIG=1
#   --ssh-key <path>      sets SSH_KEY
#   --context <name>      sets CONTEXT
#   --port <port>         sets PORT
#   --host <host>         sets HOST
#   --user <user>         sets USER
#   --ttl <seconds>       sets TTL_SECONDS
# Any other argument is an error.
while [[ $# -gt 0 ]]; do
    case $1 in
        --use-ssh-config)
            USE_SSH_CONFIG=1
            shift
            ;;
        --ssh-key|--context|--port|--host|--user|--ttl)
            # A value-taking option given as the last argument would make
            # `shift 2` fail without shifting anything, so this loop would
            # never terminate. Reject it explicitly instead.
            if [[ $# -lt 2 ]]; then
                echo "Error: $1 requires a value" >&2
                exit 1
            fi
            case $1 in
                --ssh-key) SSH_KEY="$2" ;;
                --context) CONTEXT="$2" ;;
                --port)    PORT="$2" ;;
                --host)    HOST="$2" ;;
                --user)    USER="$2" ;;
                --ttl)     TTL_SECONDS="$2" ;;
            esac
            shift 2
            ;;
        *)
            echo "Unknown parameter: $1" >&2
            exit 1
            ;;
    esac
done
263
+
264
# Validate required parameters: --host is the only option checked here.
if [[ -z "$HOST" ]]; then
    echo "Error: --host parameter is required" >&2
    exit 1
fi

# Setup directories: all per-context state (PID, log, lock, and the
# certificate files read by read_certificate_data) lives under TUNNEL_DIR.
TUNNEL_DIR="$HOME/.sky/ssh_node_pools_info"
mkdir -p "$TUNNEL_DIR"

# Get context name for PID file; fall back to "default" when --context
# was not given.
if [[ -z "$CONTEXT" ]]; then
    CONTEXT="default"
fi

PID_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.pid"
LOG_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.log"
LOCK_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.lock"

debug_log "Starting ssh-tunnel.sh for context $CONTEXT, host $HOST, port $PORT"
debug_log "SSH Config: $USE_SSH_CONFIG, User: $USER, TTL: ${TTL_SECONDS}s"

# Check if specified port is already in use (tunnel may be running).
# Whatever the PID file says, an open port is treated as a usable tunnel:
# the checks below only affect what is logged.
if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
    debug_log "Port $PORT already in use, checking if it's our tunnel"

    # Check if there's a PID file and if that process is running
    if [[ -f "$PID_FILE" ]]; then
        OLD_PID=$(cat "$PID_FILE")
        # kill -0 sends no signal; it only tests that the process exists.
        if kill -0 "$OLD_PID" 2>/dev/null; then
            debug_log "Tunnel appears to be running with PID $OLD_PID"
        else
            debug_log "PID file exists but process $OLD_PID is not running"
        fi
    else
        debug_log "Port $PORT is in use but no PID file exists"
    fi

    # Return valid credential format for kubectl with expiration
    generate_credentials_json
    exit 0
fi

# Try to acquire the lock. If that fails, poll the port for up to
# 10 x 0.2s in case a tunnel is being brought up elsewhere.
# NOTE(review): if the port never opens, execution continues below even
# though acquire_lock failed, and release_lock is still called later;
# confirm this is intended.
if ! acquire_lock; then
    # Wait briefly for the tunnel to be established
    for i in {1..10}; do
        if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
            debug_log "Tunnel is now active"

            # Return valid credential format for kubectl with expiration
            generate_credentials_json
            exit 0
        fi
        sleep 0.2
    done
    debug_log "Waited for tunnel but port $PORT still not available"
fi

# Check if we have a PID file with running process. Reaching this point
# means the port is not open, so any recorded process is considered stale.
if [[ -f "$PID_FILE" ]]; then
    OLD_PID=$(cat "$PID_FILE")
    if kill -0 "$OLD_PID" 2>/dev/null; then
        # Process exists but port isn't open - something's wrong, kill it
        kill "$OLD_PID" 2>/dev/null
        debug_log "Killed stale tunnel process $OLD_PID"
    else
        debug_log "PID file exists but process $OLD_PID is not running anymore"
    fi
    # Remove the stale PID file
    rm -f "$PID_FILE"
fi

# Generate the SSH command (populates the SSH_CMD array)
generate_ssh_command

debug_log "Starting SSH tunnel: ${SSH_CMD[*]}"

# Start the tunnel as a background job, appending its stdout/stderr to the
# log file; the port is polled below to see whether it came up.
"${SSH_CMD[@]}" >> "$LOG_FILE" 2>&1 &
TUNNEL_PID=$!

# Save PID
echo $TUNNEL_PID > "$PID_FILE"
debug_log "Tunnel started with PID $TUNNEL_PID"

# Wait for tunnel to establish: up to 20 probes, 0.2s apart.
tunnel_up=0
for i in {1..20}; do
    if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
        debug_log "Tunnel established successfully on port $PORT"
        tunnel_up=1
        break
    fi
    sleep 0.2
done

# Clean up lock file
release_lock

# Check if the tunnel process is still running
if ! kill -0 $TUNNEL_PID 2>/dev/null; then
    debug_log "ERROR: Tunnel process exited unexpectedly! Check logs for details"
    if [[ -f "$PID_FILE" ]]; then
        rm -f "$PID_FILE"
    fi
    # Return error in case of tunnel failure
    echo "Failed to establish SSH tunnel. See $TUNNEL_DIR/$CONTEXT-tunnel.log for details." >&2
    exit 1
elif [[ $tunnel_up -eq 0 ]]; then
    # The process is alive but the port never answered: only a warning is
    # logged and credentials are still returned below.
    debug_log "WARNING: Tunnel process is running but port $PORT is not responding"
fi

# Return valid credential format with certificates if available
generate_credentials_json
exit 0
@@ -0,0 +1,221 @@
1
+ """Utility functions for managing SSH node pools."""
2
+ import os
3
+ import re
4
+ import subprocess
5
+ from typing import Any, Callable, Dict, List, Optional
6
+ import uuid
7
+
8
+ import yaml
9
+
10
+ from sky.utils import ux_utils
11
+
12
+ DEFAULT_SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
13
+ RED = '\033[0;31m'
14
+ NC = '\033[0m' # No color
15
+
16
+
17
def check_host_in_ssh_config(hostname: str) -> bool:
    """Return True iff a specific `Host` stanza applies to *hostname*.

    Runs ``ssh -vvG <hostname> -o ConnectTimeout=0`` and scans the combined
    output for OpenSSH debug lines of the form::

        debug1: ~/.ssh/config line 42: Applying options for <pattern>

    ``-G`` makes ssh print the effective configuration instead of
    connecting, and ``-vv`` makes it report which stanzas were applied. No
    config file is opened or parsed by this function itself.

    Args:
        hostname: The alias/IP/FQDN to test.

    Returns:
        True if at least one applied stanza has a pattern other than the
        bare catch-all ``*``; False if only ``Host *`` (or nothing) applied.
        Wildcard patterns such as ``*.example.com`` count as specific.
    """
    # Debug output goes to stderr, so it is merged into stdout.
    proc = subprocess.run(
        ['ssh', '-vvG', hostname, '-o', 'ConnectTimeout=0'],
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        check=False,  # we only want the text, not to raise
    )

    # Accept any applied pattern except one that is exactly "*". The negative
    # lookahead rejects only the bare catch-all; a leading-"*" test would
    # also reject specific patterns like "*.example.com".
    pattern = re.compile(
        r'^debug\d+: .*Applying options for (?!\*\s*$)\S.*$', re.MULTILINE)

    return bool(pattern.search(proc.stdout))
57
+
58
+
59
class UniqueKeySafeLoader(yaml.SafeLoader):
    """Custom YAML loader that raises an error if there are duplicate keys."""

    def construct_mapping(self, node, deep=False):
        """Build a mapping, rejecting keys that occur more than once.

        Args:
            node: The YAML node being constructed.
            deep: Passed through to ``construct_object`` and to the parent
                implementation.

        Returns:
            Whatever ``yaml.SafeLoader.construct_mapping`` returns for the
            node.

        Raises:
            yaml.constructor.ConstructorError: With ``note`` set to a
                user-facing message when the same key appears twice in this
                mapping.
        """
        # Anything that is not a mapping node is left to the parent class,
        # which reports it as a ConstructorError.
        if isinstance(node, yaml.MappingNode):
            seen_keys = set()
            for key_node, _ in node.value:
                key = self.construct_object(key_node, deep=deep)
                try:
                    is_duplicate = key in seen_keys
                except TypeError:
                    # Unhashable key (e.g. a sequence used as a key). Skip the
                    # duplicate check so the parent class can report it as a
                    # ConstructorError instead of a raw TypeError escaping.
                    continue
                if is_duplicate:
                    raise yaml.constructor.ConstructorError(
                        note=(f'Duplicate key found: {key!r}.\n'
                              'Please remove one of them from the YAML file.'))
                seen_keys.add(key)
        return super().construct_mapping(node, deep)
72
+
73
+
74
def load_ssh_targets(file_path: str) -> Dict[str, Any]:
    """Load SSH targets from YAML file.

    The file is parsed with ``UniqueKeySafeLoader``, so duplicate keys are
    rejected.

    Args:
        file_path: Path of the SSH Node Pools YAML file.

    Returns:
        The parsed YAML document, returned as-is (its shape is not
        validated here).

    Raises:
        ValueError: If the file does not exist, cannot be read, or cannot be
            parsed.
    """
    if not os.path.exists(file_path):
        with ux_utils.print_exception_no_traceback():
            raise ValueError(f'SSH Node Pools file not found: {file_path}')

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            targets = yaml.load(f, Loader=UniqueKeySafeLoader)
        return targets
    except yaml.constructor.ConstructorError as e:
        # `note` is only set by the duplicate-key check. ConstructorErrors
        # raised by PyYAML itself leave it as None, so fall back to the
        # generic message rather than raising ValueError(None).
        message = e.note or f'Error loading SSH Node Pools file: {e}'
        with ux_utils.print_exception_no_traceback():
            raise ValueError(message) from e
    except (yaml.YAMLError, OSError) as e:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(f'Error loading SSH Node Pools file: {e}') from e
90
+
91
+
92
def get_cluster_config(
        targets: Dict[str, Any],
        cluster_name: Optional[str] = None,
        file_path: str = DEFAULT_SSH_NODE_POOLS_PATH) -> Dict[str, Any]:
    """Select one cluster's configuration, or all of them.

    Args:
        targets: Mapping of cluster name to cluster configuration.
        cluster_name: Cluster to select. When falsy, every cluster is
            returned.
        file_path: Path used only in error messages.

    Returns:
        ``targets`` itself when no cluster name is given, otherwise a
        single-entry dict ``{cluster_name: targets[cluster_name]}``.

    Raises:
        ValueError: If ``targets`` is empty, or ``cluster_name`` is given but
            is not a key of ``targets``.
    """
    if not targets:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                f'No clusters defined in SSH Node Pools file {file_path}')

    # No specific cluster requested: hand back everything.
    if not cluster_name:
        return targets

    if cluster_name not in targets:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(f'Cluster {cluster_name!r} not found in '
                             f'SSH Node Pools file {file_path}')

    selected = targets[cluster_name]
    return {cluster_name: selected}
111
+
112
+
113
def prepare_hosts_info(
    cluster_name: str,
    cluster_config: Dict[str, Any],
    upload_ssh_key_func: Optional[Callable[[str, str], str]] = None
) -> List[Dict[str, str]]:
    """Prepare list of hosts with resolved user, identity_file, and password.

    Each entry of ``cluster_config['hosts']`` is either a string (an IP or an
    SSH config hostname) or a dict with an ``ip`` field and optional
    ``user`` / ``identity_file`` / ``password`` overrides. Cluster-level
    values are used as defaults. For hosts that match a stanza in the SSH
    client config, ``user`` and ``identity_file`` are left empty.

    Args:
        cluster_name: The name of the cluster.
        cluster_config: The configuration for the cluster.
        upload_ssh_key_func: A function to upload the SSH key to the remote
            server and wait for the key to be uploaded. This function will take
            the key name and the local key file path as input, and return the
            path for the remote SSH key file on the API server. This function
            will only be set in `sky ssh up -f` mode, and if this function is
            set, any ssh config will not be allowed as we don't support
            uploading any ssh config to the API server.

    Returns:
        A list of dicts with the keys ``ip``, ``user``, ``identity_file``,
        ``password`` and ``use_ssh_config``. Dict hosts without an ``ip``
        field are skipped with a printed warning.

    Raises:
        ValueError: If no hosts are defined, an identity file is missing, or
            an SSH config hostname is used together with
            ``upload_ssh_key_func``.
    """
    if not cluster_config.get('hosts'):
        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                f'No hosts defined in cluster {cluster_name} configuration')

    # Get cluster-level defaults
    cluster_user = cluster_config.get('user', '')
    cluster_identity_file = os.path.expanduser(
        cluster_config.get('identity_file', ''))
    cluster_password = cluster_config.get('password', '')

    # Check if cluster identity file exists
    if cluster_identity_file and not os.path.isfile(cluster_identity_file):
        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                f'SSH Identity File Missing: {cluster_identity_file}')

    use_cluster_config_msg = (f'Cluster {cluster_name} uses SSH config '
                              'for hostname {host}, which is not '
                              'supported by the -f flag. Please use a '
                              'dict with `ip` field instead.')

    def _maybe_hardcode_identity_file(i: int, identity_file: str) -> str:
        """Upload the key when an upload function is set; else pass through."""
        if upload_ssh_key_func is None:
            return identity_file
        if not os.path.exists(os.path.expanduser(identity_file)):
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    f'Identity file {identity_file} does not exist.')
        # Host index plus a short random suffix keeps key names distinct.
        key_name = f'{cluster_name}-{i}-{str(uuid.uuid4())[:4]}'
        key_file_on_api_server = upload_ssh_key_func(key_name, identity_file)
        return key_file_on_api_server

    hosts_info = []
    for i, host in enumerate(cluster_config['hosts']):
        # Host can be a string (IP or SSH config hostname) or a dict
        if isinstance(host, str):
            # Check if this is an SSH config hostname
            is_ssh_config_host = check_host_in_ssh_config(host)
            if upload_ssh_key_func is not None and is_ssh_config_host:
                with ux_utils.print_exception_no_traceback():
                    raise ValueError(use_cluster_config_msg.format(host=host))

            hosts_info.append({
                'ip': host,
                'user': '' if is_ssh_config_host else cluster_user,
                'identity_file': '' if is_ssh_config_host else
                                 _maybe_hardcode_identity_file(
                                     i, cluster_identity_file),
                'password': cluster_password,
                'use_ssh_config': is_ssh_config_host
            })
        else:
            # It's a dict with potential overrides
            if 'ip' not in host:
                print(f'{RED}Warning: Host missing \'ip\' field, '
                      f'skipping: {host}{NC}')
                continue

            # Check if this is an SSH config hostname
            is_ssh_config_host = check_host_in_ssh_config(host['ip'])
            if upload_ssh_key_func is not None and is_ssh_config_host:
                with ux_utils.print_exception_no_traceback():
                    # Report the hostname itself, not the whole host dict.
                    raise ValueError(
                        use_cluster_config_msg.format(host=host['ip']))

            # Use host-specific values or fall back to cluster defaults
            host_user = '' if is_ssh_config_host else host.get(
                'user', cluster_user)
            host_identity_file = '' if is_ssh_config_host else (
                _maybe_hardcode_identity_file(
                    i, host.get('identity_file', cluster_identity_file)))
            host_identity_file = os.path.expanduser(host_identity_file)
            host_password = host.get('password', cluster_password)

            if host_identity_file and not os.path.isfile(host_identity_file):
                with ux_utils.print_exception_no_traceback():
                    raise ValueError(
                        f'SSH Identity File Missing: {host_identity_file}')

            hosts_info.append({
                'ip': host['ip'],
                'user': host_user,
                'identity_file': host_identity_file,
                'password': host_password,
                'use_ssh_config': is_ssh_config_host
            })

    return hosts_info
@@ -2,26 +2,13 @@
2
2
  import enum
3
3
 
4
4
 
5
+ # TODO(kevin): Remove this enum in v0.13.0.
5
6
  class KubernetesNetworkingMode(enum.Enum):
6
- """Enum for the different types of networking modes for accessing
7
- jump pods.
7
+ """Enum for the different types of networking modes for accessing pods.
8
8
  """
9
9
  NODEPORT = 'nodeport'
10
10
  PORTFORWARD = 'portforward'
11
11
 
12
- @classmethod
13
- def from_str(cls, mode: str) -> 'KubernetesNetworkingMode':
14
- """Returns the enum value for the given string."""
15
- if mode.lower() == cls.NODEPORT.value:
16
- return cls.NODEPORT
17
- elif mode.lower() == cls.PORTFORWARD.value:
18
- return cls.PORTFORWARD
19
- else:
20
- raise ValueError(f'Unsupported kubernetes networking mode: '
21
- f'{mode}. The mode must be either '
22
- f'\'{cls.PORTFORWARD.value}\' or '
23
- f'\'{cls.NODEPORT.value}\'. ')
24
-
25
12
 
26
13
  class KubernetesServiceType(enum.Enum):
27
14
  """Enum for the different types of services."""
@@ -42,4 +29,10 @@ class KubernetesAutoscalerType(enum.Enum):
42
29
  """Enum for the different types of cluster autoscalers for Kubernetes."""
43
30
  GKE = 'gke'
44
31
  KARPENTER = 'karpenter'
32
+ COREWEAVE = 'coreweave'
45
33
  GENERIC = 'generic'
34
+
35
def emits_autoscale_event(self) -> bool:
    """Tell whether this autoscaler emits the TriggeredScaleUp event reason.

    Returns:
        False for the KARPENTER member, True for every other member.
    """
    silent_autoscalers = {self.KARPENTER}
    if self in silent_autoscalers:
        return False
    return True