skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -3,67 +3,8 @@
3
3
  Maps specified ports from host to cluster container.
4
4
  """
5
5
  import argparse
6
- import textwrap
7
-
8
-
9
- def generate_kind_config(path: str,
10
- port_start: int = 30000,
11
- port_end: int = 32768,
12
- num_nodes: int = 1,
13
- gpus: bool = False) -> None:
14
- """Generate a kind cluster config with ports mapped from host to container
15
-
16
- Args:
17
- path: Path to generate the config file at
18
- port_start: Port range start
19
- port_end: Port range end
20
- num_nodes: Number of nodes in the cluster
21
- gpus: If true, initialize kind cluster with GPU support
22
- """
23
-
24
- preamble = textwrap.dedent(f"""
25
- apiVersion: kind.x-k8s.io/v1alpha4
26
- kind: Cluster
27
- kubeadmConfigPatches:
28
- - |
29
- kind: ClusterConfiguration
30
- apiServer:
31
- extraArgs:
32
- "service-node-port-range": {port_start}-{port_end}
33
- nodes:
34
- - role: control-plane
35
- kubeadmConfigPatches:
36
- - |
37
- kind: InitConfiguration
38
- nodeRegistration:
39
- kubeletExtraArgs:
40
- node-labels: "ingress-ready=true"
41
- """)
42
- if gpus:
43
- preamble += textwrap.indent(
44
- textwrap.dedent("""
45
- extraMounts:
46
- - hostPath: /dev/null
47
- containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
48
- preamble += textwrap.indent(
49
- textwrap.dedent("""
50
- extraPortMappings:"""), ' ' * 2)
51
- suffix = ''
52
- if num_nodes > 1:
53
- for _ in range(1, num_nodes):
54
- suffix += """- role: worker\n"""
55
- with open(path, 'w', encoding='utf-8') as f:
56
- f.write(preamble)
57
- for port in range(port_start, port_end + 1):
58
- f.write(f"""
59
- - containerPort: {port}
60
- hostPort: {port}
61
- listenAddress: "0.0.0.0"
62
- protocol: tcp""")
63
- f.write('\n')
64
- if suffix:
65
- f.write(suffix)
66
6
 
7
+ from sky.utils.kubernetes import kubernetes_deploy_utils
67
8
 
68
9
  if __name__ == '__main__':
69
10
  parser = argparse.ArgumentParser(description='Generate a kind cluster '
@@ -77,10 +18,6 @@ if __name__ == '__main__':
77
18
  type=int,
78
19
  default=30000,
79
20
  help='Port range start')
80
- parser.add_argument('--port-end',
81
- type=int,
82
- default=32768,
83
- help='Port range end')
84
21
  parser.add_argument('--num-nodes',
85
22
  type=int,
86
23
  default=1,
@@ -90,5 +27,8 @@ if __name__ == '__main__':
90
27
  action='store_true',
91
28
  help='Initialize kind cluster with GPU support')
92
29
  args = parser.parse_args()
93
- generate_kind_config(args.path, args.port_start, args.port_end,
94
- args.num_nodes, args.gpus)
30
+
31
+ with open(args.path, 'w', encoding='utf-8') as f:
32
+ f.write(
33
+ kubernetes_deploy_utils.generate_kind_config(
34
+ args.port_start, args.num_nodes, args.gpus))
@@ -40,7 +40,9 @@ def cleanup(context: Optional[str] = None) -> Tuple[bool, str]:
40
40
  success = True
41
41
  except subprocess.CalledProcessError as e:
42
42
  output = e.output.decode('utf-8')
43
- reason = 'Error deleting existing GPU labeler resources: ' + output
43
+ stderr = e.stderr.decode('utf-8')
44
+ reason = ('Error deleting existing GPU labeler resources: ' +
45
+ output + stderr)
44
46
  return success, reason
45
47
 
46
48
 
@@ -183,9 +185,17 @@ def wait_for_jobs_completion(jobs_to_node_names: Dict[str, str],
183
185
  batch_v1 = kubernetes.batch_api(context=context)
184
186
  w = kubernetes.watch()
185
187
  completed_jobs = []
188
+ # Use resource_version="0" to start from the oldest available version.
189
+ # In multi-replica API server environments, replicas may be at different
190
+ # resource versions due to replication lag. Without specifying this, the
191
+ # watch may get version X from one replica but connect to another replica
192
+ # that only has up to version Y < X, causing "Too large resource version"
193
+ # errors. Using "0" ensures all replicas can serve the request from their
194
+ # oldest available version, avoiding version mismatches.
186
195
  for event in w.stream(func=batch_v1.list_namespaced_job,
187
196
  namespace=namespace,
188
- timeout_seconds=timeout):
197
+ timeout_seconds=timeout,
198
+ resource_version='0'):
189
199
  job = event['object']
190
200
  job_name = job.metadata.name
191
201
  if job_name in jobs_to_node_names:
@@ -212,7 +222,7 @@ def wait_for_jobs_completion(jobs_to_node_names: Dict[str, str],
212
222
  _format_string(
213
223
  f'Timed out after waiting {timeout} seconds '
214
224
  'for job to complete', colorama.Style.DIM))
215
- return False #Timed out
225
+ return False # Timed out
216
226
 
217
227
 
218
228
  def main():
@@ -13,10 +13,11 @@ spec:
13
13
  serviceAccountName: gpu-labeler-sa
14
14
  containers:
15
15
  - name: gpu-labeler
16
- image: us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot-gpu:latest # Using this image also serves as a way to "pre-pull" the image onto nodes
16
+ image: us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot-gpu:latest # Using this image also serves as a way to "pre-pull" the image onto nodes
17
17
  command: ["/bin/bash", "-i", "-c"]
18
18
  args:
19
19
  - |
20
+ source ~/skypilot-runtime/bin/activate
20
21
  python /label_gpus.py
21
22
  env:
22
23
  - name: MY_NODE_NAME
@@ -53,51 +53,51 @@ data:
53
53
  import os
54
54
  import subprocess
55
55
  from typing import Optional
56
-
56
+
57
57
  from kubernetes import client
58
58
  from kubernetes import config
59
-
59
+
60
60
  canonical_gpu_names = [
61
- 'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4', 'V100',
61
+ 'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4', 'V100',
62
62
  'A10', 'P100', 'P40', 'P4', 'L4'
63
63
  ]
64
-
65
-
64
+
65
+
66
66
  def get_gpu_name() -> Optional[str]:
67
67
  try:
68
68
  result = subprocess.run(
69
69
  ['nvidia-smi', '--query-gpu=name', '--format=csv,noheader,nounits'],
70
70
  stdout=subprocess.PIPE)
71
71
  gpu_name = result.stdout.decode('utf-8').strip()
72
- # In the case of multi-gpu nodes, we assume the node is homogenous and
72
+ # In the case of multi-gpu nodes, we assume the node is homogenous and
73
73
  # just use the first GPU name.
74
74
  gpu_name = gpu_name.split('\n')[0]
75
75
  return gpu_name.lower()
76
76
  except Exception as e:
77
77
  print(f'Error getting GPU name: {e}')
78
78
  return None
79
-
80
-
79
+
80
+
81
81
  def label_node(gpu_name: str) -> None:
82
82
  try:
83
83
  config.load_incluster_config() # Load in-cluster configuration
84
84
  v1 = client.CoreV1Api()
85
-
85
+
86
86
  # Fetch the current node's name from the environment variable
87
87
  node_name = os.environ.get('MY_NODE_NAME')
88
88
  if not node_name:
89
89
  raise ValueError('Failed to get node name from environment')
90
-
90
+
91
91
  # Label the node with the GPU name
92
92
  body = {'metadata': {'labels': {'skypilot.co/accelerator': gpu_name}}}
93
93
  v1.patch_node(node_name, body)
94
-
94
+
95
95
  print(f'Labeled node {node_name} with GPU {gpu_name}')
96
-
96
+
97
97
  except Exception as e:
98
98
  print(f'Error labeling node: {e}')
99
-
100
-
99
+
100
+
101
101
  def main():
102
102
  gpu_name = get_gpu_name()
103
103
  if gpu_name is not None:
@@ -119,7 +119,7 @@ data:
119
119
  labelled = True
120
120
  else:
121
121
  print('No GPU detected. Try running nvidia-smi in the container.')
122
-
123
-
122
+
123
+
124
124
  if __name__ == '__main__':
125
125
  main()