skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,164 @@
1
+ """Volume functions for provisioning and deleting ephemeral volumes."""
2
+
3
+ import copy
4
+ from typing import Any, Dict, Optional
5
+
6
+ from sky import clouds
7
+ from sky import global_user_state
8
+ from sky import models
9
+ from sky import sky_logging
10
+ from sky.provision import common as provision_common
11
+ from sky.provision import constants as provision_constants
12
+ from sky.provision.kubernetes import utils as kubernetes_utils
13
+ from sky.utils import volume as volume_utils
14
+ from sky.volumes import volume as volume_lib
15
+ from sky.volumes.server import core as volume_server_core
16
+
17
+ logger = sky_logging.init_logger(__name__)
18
+
19
+
20
def _resolve_volume_type(cloud: clouds.Cloud,
                         volume_type: Optional[str]) -> str:
    """Returns the volume type string to use for a volume on `cloud`.

    Args:
        cloud: The cloud the volume is being created on.
        volume_type: The requested volume type (matched case-insensitively),
            or None/empty to use the cloud's single default type.

    Returns:
        The lower-cased requested type, or the value of the only volume type
        registered for `cloud` in `volume_lib.CLOUD_TO_VOLUME_TYPE`.

    Raises:
        ValueError: If the requested type is not a `VolumeType` value, or if
            no type is requested and the cloud has no registered entry or
            does not have exactly one registered type.
    """
    if volume_type:
        # An explicit type was requested: normalize and validate it.
        known_types = [member.value for member in volume_utils.VolumeType]
        requested = volume_type.lower()
        if requested in known_types:
            return requested
        raise ValueError(f'Invalid volume type: {requested} for cloud {cloud}')

    # No explicit type: take the types of the first registry entry that
    # refers to the same cloud.
    registry = volume_lib.CLOUD_TO_VOLUME_TYPE
    candidates = next((registered_types
                       for registered_cloud, registered_types in registry.items()
                       if cloud.is_same_cloud(registered_cloud)), None)
    if candidates is None:
        raise ValueError(f'No default volume type found for cloud {cloud}')
    if len(candidates) != 1:
        # A default can only be inferred when it is unambiguous.
        raise ValueError(
            f'Found multiple volume types for cloud {cloud}: {candidates}')
    return candidates[0].value
43
+
44
+
45
def _resolve_pvc_volume_config(cloud: clouds.Cloud,
                               config: provision_common.ProvisionConfig,
                               volume_config: Dict[str, Any]) -> Dict[str, Any]:
    """Validates a PVC volume config and fills in its defaults.

    `volume_config` is updated in place and the same dict is returned.

    Args:
        cloud: The cloud the volume is being created on; must be Kubernetes.
        config: Provision config; its `provider_config` supplies the
            namespace and its `count` is checked against the access mode.
        volume_config: The PVC-specific config dict.

    Returns:
        `volume_config` with 'access_mode' (ReadWriteOnce when it was unset)
        and 'namespace' populated.

    Raises:
        ValueError: If `cloud` is not Kubernetes, the access mode is not a
            `VolumeAccessMode` value, or ReadWriteOnce is combined with
            `config.count > 1`.
    """
    provider_config = config.provider_config
    if not cloud.is_same_cloud(clouds.Kubernetes()):
        raise ValueError(
            f'PVC volume type is only supported on Kubernetes not on {cloud}')

    valid_modes = [mode.value for mode in volume_utils.VolumeAccessMode]
    read_write_once = volume_utils.VolumeAccessMode.READ_WRITE_ONCE.value

    mode = volume_config.get('access_mode')
    if mode is None:
        # Unset: default to ReadWriteOnce and record it in the config.
        mode = read_write_once
        volume_config['access_mode'] = mode
    elif mode not in valid_modes:
        raise ValueError(f'Invalid access mode: {mode} for PVC')

    if mode == read_write_once and config.count > 1:
        raise ValueError(
            'Access mode ReadWriteOnce is not supported for multi-node'
            ' clusters.')

    volume_config['namespace'] = kubernetes_utils.get_namespace_from_config(
        provider_config)
    return volume_config
69
+
70
+
71
def _create_ephemeral_volume(
        cloud: clouds.Cloud, region: str, cluster_name_on_cloud: str,
        config: provision_common.ProvisionConfig,
        volume_mount: volume_utils.VolumeMount
) -> Optional[volume_utils.VolumeInfo]:
    """Creates one ephemeral volume and returns its mount information.

    Only PVC volumes are handled; any other resolved type is skipped with a
    warning.

    Args:
        cloud: The cloud the cluster is provisioned on.
        region: Region passed through to `volume_apply`.
        cluster_name_on_cloud: Cluster name, attached to the volume as the
            `TAG_SKYPILOT_CLUSTER_NAME` label.
        config: Provision config of the cluster.
        volume_mount: The mount (path plus volume config) to create.

    Returns:
        A `VolumeInfo` for the created volume, or None if the volume type is
        not supported for ephemeral volumes.

    Raises:
        ValueError: If the volume type or PVC config is invalid, a label is
            rejected by `cloud.is_label_valid`, or the volume record (or its
            handle) cannot be found after `volume_apply`.
    """
    provider_name = repr(cloud)
    requested_config = volume_mount.volume_config
    volume_type = _resolve_volume_type(cloud, requested_config.type)
    if volume_type != volume_utils.VolumeType.PVC.value:
        logger.warning(f'Skipping unsupported ephemeral volume type: '
                       f'{volume_type} for cloud {cloud}.')
        return None

    internal_volume_config = _resolve_pvc_volume_config(
        cloud, config, requested_config.config)

    # Work on a copy so the labels held by the mount's config are not
    # modified when the cluster-name tag is added below.
    labels = dict(requested_config.labels or {})
    for key, value in labels.items():
        valid, err_msg = cloud.is_label_valid(key, value)
        if not valid:
            raise ValueError(f'{err_msg}')
    labels[provision_constants.TAG_SKYPILOT_CLUSTER_NAME] = (
        cluster_name_on_cloud)

    volume_name = requested_config.name
    volume_server_core.volume_apply(
        name=volume_name,
        volume_type=volume_type,
        cloud=provider_name,
        region=region,
        zone=None,
        size=requested_config.size,
        config=internal_volume_config,
        labels=labels,
        is_ephemeral=True,
    )

    # Read the record back to obtain the on-cloud name and id.
    record = global_user_state.get_volume_by_name(volume_name)
    if record is None:
        raise ValueError(f'Failed to get record for volume: {volume_name}')
    if 'handle' not in record:
        # Explicit check rather than `assert`, which is stripped under -O.
        raise ValueError(f'Volume record has no handle: {volume_name}')
    handle: models.VolumeConfig = record['handle']
    return volume_utils.VolumeInfo(
        name=volume_name,
        path=volume_mount.path,
        volume_name_on_cloud=handle.name_on_cloud,
        volume_id_on_cloud=handle.id_on_cloud,
    )
122
+
123
+
124
def provision_ephemeral_volumes(
    cloud: clouds.Cloud,
    region: str,
    cluster_name_on_cloud: str,
    config: provision_common.ProvisionConfig,
) -> None:
    """Provision ephemeral volumes for a cluster.

    Reads the mount specs from `provider_config['ephemeral_volume_specs']`,
    creates each volume, and stores the resulting `VolumeInfo` objects in
    `provider_config['ephemeral_volume_infos']`. Does nothing when no specs
    are present.

    Args:
        cloud: The cloud the cluster is provisioned on.
        region: Region to create the volumes in.
        cluster_name_on_cloud: Cluster name used to label the volumes.
        config: Provision config; its `provider_config` is read and updated.

    Raises:
        Exception: Any error raised while creating a volume is logged and
            re-raised unchanged.
    """
    provider_config = config.provider_config
    ephemeral_volume_mounts = provider_config.get('ephemeral_volume_specs')
    if not ephemeral_volume_mounts:
        return
    volume_infos = []
    try:
        for ephemeral_volume_mount in ephemeral_volume_mounts:
            # Parse a copy so the spec stored in provider_config is left
            # untouched (presumably from_yaml_config may modify its input).
            mount_copy = copy.deepcopy(ephemeral_volume_mount)
            volume_mount = volume_utils.VolumeMount.from_yaml_config(mount_copy)
            volume_info = _create_ephemeral_volume(cloud, region,
                                                   cluster_name_on_cloud,
                                                   config, volume_mount)
            if volume_info is None:
                # Unsupported volume type; already logged by the helper.
                continue
            volume_infos.append(volume_info)
        provider_config['ephemeral_volume_infos'] = volume_infos
    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.error(f'Failed to provision ephemeral volumes: {e}')
        # Bare raise re-raises the active exception with its traceback.
        raise
150
+
151
+
152
def delete_ephemeral_volumes(provider_config: Dict[str, Any]) -> None:
    """Delete ephemeral volumes for a cluster.

    Collects the volume names from
    `provider_config['ephemeral_volume_specs']` and deletes them in a single
    `volume_delete` call with `ignore_not_found=True`. Does nothing when no
    specs are present.

    Args:
        provider_config: The cluster's provider config.
    """
    ephemeral_volume_mounts = provider_config.get('ephemeral_volume_specs')
    if not ephemeral_volume_mounts:
        return
    # Parse a copy of each spec so the entries stored in provider_config are
    # left untouched (presumably from_yaml_config may modify its input).
    ephemeral_volume_names = [
        volume_utils.VolumeMount.from_yaml_config(
            copy.deepcopy(ephemeral_volume_mount)).volume_config.name
        for ephemeral_volume_mount in ephemeral_volume_mounts
    ]
    volume_server_core.volume_delete(names=ephemeral_volume_names,
                                     ignore_not_found=True)
@@ -30,5 +30,5 @@ def get_unverified_session():
30
30
  """
31
31
  session = requests.session()
32
32
  session.verify = False
33
- requests.packages.urllib3.disable_warnings()
33
+ requests.packages.urllib3.disable_warnings() # type: ignore[attr-defined]
34
34
  return session
@@ -89,5 +89,6 @@ def create_unverified_session(session, suppress_warning=True):
89
89
  session.verify = False
90
90
  if suppress_warning:
91
91
  # Suppress unverified https request warnings
92
- requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
92
+ requests.packages.urllib3.disable_warnings( # type: ignore
93
+ InsecureRequestWarning)
93
94
  return session
@@ -4,7 +4,7 @@
4
4
  import re
5
5
  import subprocess
6
6
  import time
7
- from typing import List
7
+ from typing import List, Union
8
8
 
9
9
  from sky import sky_logging
10
10
  from sky.adaptors import vsphere as vsphere_adaptor
@@ -15,7 +15,8 @@ DISPLAY_CONTROLLER_CLASS_ID_PREFIXES = ['03']
15
15
  VMWARE_VIRTUAL_DISPLAY_CONTROLLER_IDS = ['0000:00:0f.0']
16
16
 
17
17
 
18
- def get_objs_by_names(content, vimtype: type, names: List[str]):
18
+ def get_objs_by_names(content, vimtype: Union[type, List[type]],
19
+ names: List[str]):
19
20
  """ Get the vsphere managed object associated with a given text name
20
21
  """
21
22
  # Create a set for the names for faster lookups
@@ -30,9 +30,10 @@ HEAD_NODE_VALUE = '1'
30
30
  WORKER_NODE_VALUE = '0'
31
31
 
32
32
 
33
- def run_instances(region: str, cluster_name: str,
33
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
34
34
  config: common.ProvisionConfig) -> common.ProvisionRecord:
35
35
  """See sky/provision/__init__.py"""
36
+ del cluster_name # unused
36
37
  logger.info('New provision of Vsphere: run_instances().')
37
38
 
38
39
  resumed_instance_ids: List[str] = []
@@ -40,7 +41,7 @@ def run_instances(region: str, cluster_name: str,
40
41
  vc_object = _get_vc_object(region)
41
42
  vc_object.connect()
42
43
 
43
- exist_instances = _get_filtered_instance(vc_object, cluster_name,
44
+ exist_instances = _get_filtered_instance(vc_object, cluster_name_on_cloud,
44
45
  config.provider_config)
45
46
  head_instance_id = _get_head_instance_id(exist_instances)
46
47
 
@@ -89,8 +90,8 @@ def run_instances(region: str, cluster_name: str,
89
90
  config, region, vc_object)
90
91
  # TODO: update logic for multi-node creation
91
92
  for _ in range(to_start_num):
92
- created_instance_uuid = _create_instances(cluster_name, config,
93
- region, vc_object,
93
+ created_instance_uuid = _create_instances(cluster_name_on_cloud,
94
+ config, region, vc_object,
94
95
  vsphere_cluster_name)
95
96
  created_instance_ids.append(created_instance_uuid)
96
97
  if head_instance_id is None:
@@ -104,7 +105,7 @@ def run_instances(region: str, cluster_name: str,
104
105
  provider_name='vsphere',
105
106
  region=region,
106
107
  zone=vsphere_cluster_name,
107
- cluster_name=cluster_name,
108
+ cluster_name=cluster_name_on_cloud,
108
109
  head_instance_id=head_instance_id,
109
110
  resumed_instance_ids=resumed_instance_ids,
110
111
  created_instance_ids=created_instance_ids,
@@ -397,9 +398,10 @@ def query_instances(
397
398
  cluster_name_on_cloud: str,
398
399
  provider_config: Optional[Dict[str, Any]] = None,
399
400
  non_terminated_only: bool = True,
401
+ retry_if_missing: bool = False,
400
402
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
401
403
  """See sky/provision/__init__.py"""
402
- del cluster_name # unused
404
+ del cluster_name, retry_if_missing # unused
403
405
  logger.info('New provision of Vsphere: query_instances().')
404
406
  assert provider_config is not None, cluster_name_on_cloud
405
407
  region = provider_config['region']
@@ -262,6 +262,10 @@ class VsphereClient:
262
262
 
263
263
  def get_pbm_manager(self):
264
264
  self.connect()
265
+ if self.servicemanager is None:
266
+ raise VsphereError('Failed to connect to vSphere.')
267
+ if self.servicemanager.si is None:
268
+ raise VsphereError('Failed to connect to vSphere.')
265
269
  pbm_si, pm_content = self._create_pbm_connection( # pylint: disable=unused-variable
266
270
  self.servicemanager.si._stub) # pylint: disable=protected-access
267
271
  pm = pm_content.profileManager
@@ -360,6 +364,8 @@ def initialize_vsphere_data():
360
364
  vcenter_name = vcenter['name']
361
365
  vc_object.connect()
362
366
  vc_servicemanager = vc_object.servicemanager
367
+ if vc_servicemanager is None or vc_servicemanager.content is None:
368
+ raise VsphereError('Failed to connect to vSphere.')
363
369
  vc_content = vc_servicemanager.content
364
370
 
365
371
  cluster_name_dicts = vc_object.clusters
@@ -370,4 +376,5 @@ def initialize_vsphere_data():
370
376
  initialize_images_csv(images_csv_path, vc_object, vcenter_name)
371
377
  initialize_instance_image_mapping_csv(vms_csv_path, images_csv_path,
372
378
  instance_image_mapping_csv_path)
373
- vc_object.servicemanager.disconnect()
379
+ if vc_object.servicemanager is not None:
380
+ vc_object.servicemanager.disconnect()
sky/resources.py CHANGED
@@ -1104,7 +1104,7 @@ class Resources:
1104
1104
  regions = self.cloud.regions_with_offering(self._instance_type,
1105
1105
  self.accelerators,
1106
1106
  self._use_spot, self._region,
1107
- self._zone)
1107
+ self._zone, self)
1108
1108
  if self._image_id is not None and None not in self._image_id:
1109
1109
  regions = [r for r in regions if r.name in self._image_id]
1110
1110
 
@@ -1331,10 +1331,18 @@ class Resources:
1331
1331
  clouds.CloudImplementationFeatures.IMAGE_ID
1332
1332
  })
1333
1333
  except exceptions.NotSupportedError as e:
1334
+ # Provide a more helpful error message for Lambda cloud
1335
+ if self.cloud.is_same_cloud(clouds.Lambda()):
1336
+ with ux_utils.print_exception_no_traceback():
1337
+ raise ValueError(
1338
+ 'Lambda cloud only supports Docker images. '
1339
+ 'Please prefix your image with "docker:" '
1340
+ '(e.g., image_id: docker:your-image-name).') from e
1334
1341
  with ux_utils.print_exception_no_traceback():
1335
1342
  raise ValueError(
1336
1343
  'image_id is only supported for AWS/GCP/Azure/IBM/OCI/'
1337
- 'Kubernetes, please explicitly specify the cloud.') from e
1344
+ 'Kubernetes. For Lambda cloud, use "docker:" prefix for '
1345
+ 'Docker images.') from e
1338
1346
 
1339
1347
  if self._region is not None:
1340
1348
  # If the image_id has None as key (region-agnostic),
@@ -1516,7 +1524,7 @@ class Resources:
1516
1524
  if self.accelerators is not None:
1517
1525
  hourly_cost += self.cloud.accelerators_to_hourly_cost(
1518
1526
  self.accelerators, self.use_spot, self._region, self._zone)
1519
- return hourly_cost * hours
1527
+ return float(hourly_cost * hours)
1520
1528
 
1521
1529
  def get_accelerators_str(self) -> str:
1522
1530
  accelerators = self.accelerators
@@ -5,8 +5,11 @@ from typing import Any, Dict, List, Optional
5
5
 
6
6
  import pydantic
7
7
 
8
+ from sky import data
8
9
  from sky import models
10
+ from sky.jobs import state as job_state
9
11
  from sky.server import common
12
+ from sky.skylet import job_lib
10
13
  from sky.utils import status_lib
11
14
 
12
15
 
@@ -74,8 +77,13 @@ class APIHealthResponse(ResponseBaseModel):
74
77
  version: str = ''
75
78
  version_on_disk: str = ''
76
79
  commit: str = ''
80
+ # Whether basic auth on api server is enabled
77
81
  basic_auth_enabled: bool = False
78
82
  user: Optional[models.User] = None
83
+ # Whether service account token is enabled
84
+ service_account_token_enabled: bool = False
85
+ # Whether basic auth on ingress is enabled
86
+ ingress_basic_auth_enabled: bool = False
79
87
 
80
88
 
81
89
  class StatusResponse(ResponseBaseModel):
@@ -86,8 +94,8 @@ class StatusResponse(ResponseBaseModel):
86
94
  # backends.ResourceHandle, so we use Any here.
87
95
  # This is an internally facing field anyway, so it's less
88
96
  # of a problem that it's not typed.
89
- handle: Any
90
- last_use: str
97
+ handle: Optional[Any] = None
98
+ last_use: Optional[str] = None
91
99
  status: status_lib.ClusterStatus
92
100
  autostop: int
93
101
  to_down: bool
@@ -95,11 +103,8 @@ class StatusResponse(ResponseBaseModel):
95
103
  # metadata is a JSON, so we use Any here.
96
104
  metadata: Optional[Dict[str, Any]] = None
97
105
  cluster_hash: str
98
- # pydantic cannot generate the pydantic-core schema for
99
- # storage_mounts_metadata, so we use Any here.
100
- storage_mounts_metadata: Optional[Dict[str, Any]] = None
101
106
  cluster_ever_up: bool
102
- status_updated_at: int
107
+ status_updated_at: Optional[int] = None
103
108
  user_hash: str
104
109
  user_name: str
105
110
  config_hash: Optional[str] = None
@@ -118,9 +123,105 @@ class StatusResponse(ResponseBaseModel):
118
123
  cpus: Optional[str] = None
119
124
  memory: Optional[str] = None
120
125
  accelerators: Optional[str] = None
126
+ cluster_name_on_cloud: Optional[str] = None
127
+
128
+
129
+ class ClusterJobRecord(ResponseBaseModel):
130
+ """Response for the cluster job queue endpoint."""
131
+ job_id: int
132
+ job_name: str
133
+ username: str
134
+ user_hash: str
135
+ submitted_at: float
136
+ # None if the job has not started yet.
137
+ start_at: Optional[float] = None
138
+ # None if the job has not ended yet.
139
+ end_at: Optional[float] = None
140
+ resources: str
141
+ status: job_lib.JobStatus
142
+ log_path: str
143
+ metadata: Dict[str, Any] = {}
121
144
 
122
145
 
123
146
  class UploadStatus(enum.Enum):
124
147
  """Status of the upload."""
125
148
  UPLOADING = 'uploading'
126
149
  COMPLETED = 'completed'
150
+
151
+
152
+ class StorageRecord(ResponseBaseModel):
153
+ """Response for the storage list endpoint."""
154
+ name: str
155
+ launched_at: int
156
+ store: List[data.StoreType]
157
+ last_use: str
158
+ status: status_lib.StorageStatus
159
+
160
+
161
+ # TODO (syang) figure out which fields are always present
162
+ # and therefore can be non-optional.
163
+ class ManagedJobRecord(ResponseBaseModel):
164
+ """A single managed job record."""
165
+ # The job_id in the spot table
166
+ task_job_id: Optional[int] = pydantic.Field(None, alias='_job_id')
167
+ job_id: Optional[int] = None
168
+ task_id: Optional[int] = None
169
+ job_name: Optional[str] = None
170
+ task_name: Optional[str] = None
171
+ job_duration: Optional[float] = None
172
+ workspace: Optional[str] = None
173
+ status: Optional[job_state.ManagedJobStatus] = None
174
+ schedule_state: Optional[str] = None
175
+ resources: Optional[str] = None
176
+ cluster_resources: Optional[str] = None
177
+ cluster_resources_full: Optional[str] = None
178
+ cloud: Optional[str] = None
179
+ region: Optional[str] = None
180
+ zone: Optional[str] = None
181
+ infra: Optional[str] = None
182
+ recovery_count: Optional[int] = None
183
+ details: Optional[str] = None
184
+ failure_reason: Optional[str] = None
185
+ user_name: Optional[str] = None
186
+ user_hash: Optional[str] = None
187
+ submitted_at: Optional[float] = None
188
+ start_at: Optional[float] = None
189
+ end_at: Optional[float] = None
190
+ user_yaml: Optional[str] = None
191
+ entrypoint: Optional[str] = None
192
+ metadata: Optional[Dict[str, Any]] = None
193
+ controller_pid: Optional[int] = None
194
+ controller_pid_started_at: Optional[float] = None
195
+ dag_yaml_path: Optional[str] = None
196
+ env_file_path: Optional[str] = None
197
+ last_recovered_at: Optional[float] = None
198
+ run_timestamp: Optional[str] = None
199
+ priority: Optional[int] = None
200
+ original_user_yaml_path: Optional[str] = None
201
+ pool: Optional[str] = None
202
+ pool_hash: Optional[str] = None
203
+ current_cluster_name: Optional[str] = None
204
+ job_id_on_pool_cluster: Optional[int] = None
205
+ accelerators: Optional[Dict[str, int]] = None
206
+
207
+
208
+ class VolumeRecord(ResponseBaseModel):
209
+ """A single volume record."""
210
+ name: str
211
+ type: str
212
+ launched_at: int
213
+ cloud: str
214
+ region: Optional[str] = None
215
+ zone: Optional[str] = None
216
+ size: Optional[str] = None
217
+ config: Dict[str, Any]
218
+ name_on_cloud: str
219
+ user_hash: str
220
+ user_name: str
221
+ workspace: str
222
+ last_attached_at: Optional[int] = None
223
+ last_use: Optional[str] = None
224
+ status: Optional[str] = None
225
+ usedby_pods: List[str]
226
+ usedby_clusters: List[str]
227
+ is_ephemeral: bool = False
@@ -0,0 +1,34 @@
1
+ """Add skylet_ssh_tunnel_metadata to clusters.
2
+
3
+ Revision ID: 008
4
+ Revises: 007
5
+ Create Date: 2025-09-09
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ from typing import Sequence, Union
10
+
11
+ from alembic import op
12
+ import sqlalchemy as sa
13
+
14
+ from sky.utils.db import db_utils
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = '008'
18
+ down_revision: Union[str, Sequence[str], None] = '007'
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
+ def upgrade():
24
+ """Add skylet_ssh_tunnel_metadata column to clusters."""
25
+ with op.get_context().autocommit_block():
26
+ db_utils.add_column_to_table_alembic('clusters',
27
+ 'skylet_ssh_tunnel_metadata',
28
+ sa.LargeBinary(),
29
+ server_default=None)
30
+
31
+
32
+ def downgrade():
33
+ """No-op for backward compatibility."""
34
+ pass
@@ -0,0 +1,89 @@
1
+ """Add last_activity_time and launched_at to cluster history.
2
+
3
+ Revision ID: 009
4
+ Revises: 008
5
+ Create Date: 2025-09-24
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ import pickle
10
+ from typing import Sequence, Union
11
+
12
+ from alembic import op
13
+ import sqlalchemy as sa
14
+
15
+ from sky.utils.db import db_utils
16
+
17
+ # revision identifiers, used by Alembic.
18
+ revision: str = '009'
19
+ down_revision: Union[str, Sequence[str], None] = '008'
20
+ branch_labels: Union[str, Sequence[str], None] = None
21
+ depends_on: Union[str, Sequence[str], None] = None
22
+
23
+
24
+ def upgrade():
25
+ """Add last_activity_time and launched_at columns to cluster history."""
26
+ with op.get_context().autocommit_block():
27
+ # Add the columns with indices
28
+ db_utils.add_column_to_table_alembic('cluster_history',
29
+ 'last_activity_time',
30
+ sa.Integer(),
31
+ server_default=None,
32
+ index=True)
33
+
34
+ db_utils.add_column_to_table_alembic('cluster_history',
35
+ 'launched_at',
36
+ sa.Integer(),
37
+ server_default=None,
38
+ index=True)
39
+
40
+ # Populate the columns for existing rows
41
+ _populate_cluster_history_columns()
42
+
43
+
44
+ def _populate_cluster_history_columns():
45
+ """Populate last_activity_time and launched_at for existing rows using
46
+ usage_intervals logic."""
47
+ connection = op.get_bind()
48
+
49
+ # Get all existing rows with usage_intervals
50
+ result = connection.execute(
51
+ sa.text('SELECT cluster_hash, usage_intervals FROM cluster_history '
52
+ 'WHERE usage_intervals IS NOT NULL'))
53
+
54
+ for row in result:
55
+ cluster_hash = row[0]
56
+ usage_intervals_blob = row[1]
57
+
58
+ try:
59
+ # Deserialize the usage_intervals
60
+ usage_intervals = pickle.loads(usage_intervals_blob)
61
+
62
+ if usage_intervals:
63
+ # Calculate last_activity_time: end time of last interval
64
+ # or start time if still running
65
+ last_interval = usage_intervals[-1]
66
+ last_activity_time = (last_interval[1] if last_interval[1]
67
+ is not None else last_interval[0])
68
+
69
+ # Calculate launched_at: start time of first interval
70
+ launched_at = usage_intervals[0][0]
71
+
72
+ # Update the row with both calculated values
73
+ connection.execute(
74
+ sa.text('UPDATE cluster_history '
75
+ 'SET last_activity_time = :last_activity_time, '
76
+ 'launched_at = :launched_at '
77
+ 'WHERE cluster_hash = :cluster_hash'), {
78
+ 'last_activity_time': last_activity_time,
79
+ 'launched_at': launched_at,
80
+ 'cluster_hash': cluster_hash
81
+ })
82
+ except (pickle.PickleError, AttributeError, IndexError):
83
+ # Skip rows with corrupted or invalid usage_intervals
84
+ continue
85
+
86
+
87
+ def downgrade():
88
+ """No-op for backward compatibility."""
89
+ pass
@@ -0,0 +1,66 @@
1
+ """Add ssh keys in filesystem to global user state.
2
+
3
+ Revision ID: 010
4
+ Revises: 009
5
+ Create Date: 2025-10-07
6
+
7
+ """
8
+ import glob
9
+ # pylint: disable=invalid-name
10
+ import os
11
+ from typing import Sequence, Union
12
+
13
+ from alembic import op
14
+ import sqlalchemy as sa
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = '010'
18
+ down_revision: Union[str, Sequence[str], None] = '009'
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
+ def upgrade():
24
+ """Import ssh keys found on the filesystem into the ssh_key table."""
25
+ connection = op.get_bind()
26
+
27
+ match_dirs = glob.glob(os.path.expanduser('~/.sky/clients/*/ssh'))
28
+ file_user_hashes = set()
29
+ for match_dir in match_dirs:
30
+ user_hash = match_dir.split('/')[-2]
31
+ file_user_hashes.add(user_hash)
32
+
33
+ # Get the user hashes that already have an ssh key in the database
34
+ existing_user_hashes = set()
35
+ result = connection.execute(sa.text('SELECT user_hash FROM ssh_key'))
36
+ for row in result:
37
+ existing_user_hashes.add(row[0])
38
+
39
+ user_hashes_to_add = file_user_hashes - existing_user_hashes
40
+ for user_hash in user_hashes_to_add:
41
+ match_dir = os.path.join(os.path.expanduser('~/.sky/clients'),
42
+ user_hash, 'ssh')
43
+ public_key_path = os.path.join(match_dir, 'sky-key.pub')
44
+ private_key_path = os.path.join(match_dir, 'sky-key')
45
+ try:
46
+ with open(public_key_path, 'r', encoding='utf-8') as f:
47
+ public_key = f.read().strip()
48
+ with open(private_key_path, 'r', encoding='utf-8') as f:
49
+ private_key = f.read().strip()
50
+ except FileNotFoundError:
51
+ # Skip if the key files are not found
52
+ continue
53
+ connection.execute(
54
+ sa.text('INSERT INTO ssh_key '
55
+ '(user_hash, ssh_public_key, ssh_private_key) '
56
+ 'VALUES (:user_hash, :ssh_public_key, :ssh_private_key) '
57
+ 'ON CONFLICT DO NOTHING'), {
58
+ 'user_hash': user_hash,
59
+ 'ssh_public_key': public_key,
60
+ 'ssh_private_key': private_key
61
+ })
62
+
63
+
64
+ def downgrade():
65
+ """No-op for backward compatibility."""
66
+ pass