skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -1,13 +1,15 @@
 """Kubernetes pvc provisioning."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple
 
 from sky import global_user_state
 from sky import models
 from sky import sky_logging
 from sky.adaptors import kubernetes
+from sky.provision import constants
 from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import constants as k8s_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.utils import resources_utils
 from sky.utils import volume as volume_lib
 
 logger = sky_logging.init_logger(__name__)
@@ -67,7 +69,7 @@ def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
     except kubernetes.api_exception() as e:
         raise config_lib.KubernetesError(
             f'Check storage class {storage_class_name} error: {e}')
-    create_persistent_volume_claim(namespace, context, pvc_spec)
+    create_persistent_volume_claim(namespace, context, pvc_spec, config)
     return config
 
 
@@ -75,7 +77,6 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
     """Deletes a volume."""
     context, namespace = _get_context_namespace(config)
     pvc_name = config.name_on_cloud
-    logger.info(f'Deleting PVC {pvc_name}')
     kubernetes_utils.delete_k8s_resource_with_retry(
         delete_func=lambda pvc_name=pvc_name: kubernetes.core_api(
             context).delete_namespaced_persistent_volume_claim(
@@ -84,6 +85,7 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
             _request_timeout=config_lib.DELETION_TIMEOUT),
         resource_type='pvc',
         resource_name=pvc_name)
+    logger.info(f'Deleted PVC {pvc_name} in namespace {namespace}')
     return config
 
 
@@ -128,7 +130,7 @@ def _get_volume_usedby(
             usedby_pods.append(pod.metadata.name)
             # Get the real cluster name
             cluster_name_on_cloud = pod.metadata.labels.get(
-                k8s_constants.TAG_SKYPILOT_CLUSTER_NAME)
+                constants.TAG_SKYPILOT_CLUSTER_NAME)
             if cluster_name_on_cloud is None:
                 continue
             cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
@@ -160,21 +162,154 @@ def get_volume_usedby(
     return _get_volume_usedby(context, namespace, pvc_name)
 
 
-def create_persistent_volume_claim(namespace: str, context: Optional[str],
-                                   pvc_spec: Dict[str, Any]) -> None:
+def get_all_volumes_usedby(
+    configs: List[models.VolumeConfig],
+) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    """Gets the usedby resources of all volumes."""
+    field_selector = ','.join([
+        f'status.phase!={phase}'
+        for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES
+    ])
+    label_selector = 'parent=skypilot'
+    context_to_namespaces: Dict[str, Set[str]] = {}
+    pvc_names = set()
+    for config in configs:
+        context, namespace = _get_context_namespace(config)
+        if context not in context_to_namespaces:
+            context_to_namespaces[context] = set()
+        context_to_namespaces[context].add(namespace)
+        pvc_names.add(config.name_on_cloud)
+    cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
+    # Get all pods in the namespace
+    used_by_pods: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
+    used_by_clusters: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
+    for context, namespaces in context_to_namespaces.items():
+        used_by_pods[context] = {}
+        used_by_clusters[context] = {}
+        for namespace in namespaces:
+            used_by_pods[context][namespace] = {}
+            used_by_clusters[context][namespace] = {}
+            pods = kubernetes.core_api(context).list_namespaced_pod(
+                namespace=namespace,
+                field_selector=field_selector,
+                label_selector=label_selector)
+            for pod in pods.items:
+                if pod.spec.volumes is None:
+                    continue
+                for volume in pod.spec.volumes:
+                    if volume.persistent_volume_claim is None:
+                        continue
+                    volume_name = volume.persistent_volume_claim.claim_name
+                    if volume_name not in pvc_names:
+                        continue
+                    if volume_name not in used_by_pods[context][namespace]:
+                        used_by_pods[context][namespace][volume_name] = []
+                    used_by_pods[context][namespace][volume_name].append(
+                        pod.metadata.name)
+                    cluster_name_on_cloud = pod.metadata.labels.get(
+                        constants.TAG_SKYPILOT_CLUSTER_NAME)
+                    if cluster_name_on_cloud is None:
+                        continue
+                    cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
+                    if cluster_name is None:
+                        continue
+                    if volume_name not in used_by_clusters[context][namespace]:
+                        used_by_clusters[context][namespace][volume_name] = []
+                    used_by_clusters[context][namespace][volume_name].append(
+                        cluster_name)
+    return used_by_pods, used_by_clusters
+
+
+def map_all_volumes_usedby(
+        used_by_pods: Dict[str, Any], used_by_clusters: Dict[str, Any],
+        config: models.VolumeConfig) -> Tuple[List[str], List[str]]:
+    """Maps the usedby resources of a volume."""
+    context, namespace = _get_context_namespace(config)
+    pvc_name = config.name_on_cloud
+
+    return (used_by_pods.get(context, {}).get(namespace, {}).get(pvc_name, []),
+            used_by_clusters.get(context, {}).get(namespace,
+                                                  {}).get(pvc_name, []))
+
+
+def _populate_config_from_pvc(config: models.VolumeConfig,
+                              pvc_obj: Any) -> None:
+    """Populate missing fields in config from a PVC object.
+
+    Args:
+        config: VolumeConfig to populate
+        pvc_obj: V1PersistentVolumeClaim object from kubernetes client
+    """
+    if pvc_obj is None:
+        return
+    pvc_name = pvc_obj.metadata.name
+
+    # Populate storageClassName if not set
+    if config.config.get('storage_class_name') is None:
+        pvc_storage_class = getattr(pvc_obj.spec, 'storage_class_name', None)
+        if pvc_storage_class:
+            config.config['storage_class_name'] = pvc_storage_class
+
+    # Populate size if not set (prefer bound capacity, fallback to requested)
+    pvc_size = None
+    size_quantity = None
+    # Try status.capacity (dict) - actual bound size
+    capacity = getattr(getattr(pvc_obj, 'status', None), 'capacity', None)
+    if isinstance(capacity, dict) and 'storage' in capacity:
+        size_quantity = capacity['storage']
+    # Fallback to spec.resources.requests (dict) - requested size
+    if size_quantity is None:
+        requests = getattr(getattr(pvc_obj.spec, 'resources', None),
+                           'requests', None)
+        if isinstance(requests, dict):
+            size_quantity = requests.get('storage')
+    # Parse and normalize the size if found
+    if size_quantity:
+        try:
+            # Normalize to GB string (e.g., '20')
+            pvc_size = resources_utils.parse_memory_resource(
+                size_quantity, 'size', allow_rounding=True)
+        except ValueError as e:
+            # Just log the error since it is not critical.
+            logger.warning(f'Failed to parse PVC size {size_quantity!r} '
+                           f'for PVC {pvc_name}: {e}')
+    if pvc_size is not None:
+        if config.size is not None and config.size != pvc_size:
+            logger.warning(f'PVC {pvc_name} has size {pvc_size} but config '
+                           f'size is {config.size}, overriding the config size'
+                           f' with the PVC size.')
+        config.size = pvc_size
+
+
+def create_persistent_volume_claim(
+    namespace: str,
+    context: Optional[str],
+    pvc_spec: Dict[str, Any],
+    config: Optional[models.VolumeConfig] = None,
+) -> None:
     """Creates a persistent volume claim for SkyServe controller."""
     pvc_name = pvc_spec['metadata']['name']
     try:
-        kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
-            name=pvc_name, namespace=namespace)
+        pvc = kubernetes.core_api(
+            context).read_namespaced_persistent_volume_claim(
+                name=pvc_name, namespace=namespace)
+        if config is not None:
+            _populate_config_from_pvc(config, pvc)
         logger.debug(f'PVC {pvc_name} already exists')
         return
     except kubernetes.api_exception() as e:
         if e.status != 404:  # Not found
             raise
-    logger.info(f'Creating PVC {pvc_name}')
-    kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
-        namespace=namespace, body=pvc_spec)
+    use_existing = config is not None and config.config.get('use_existing')
+    if use_existing:
+        raise ValueError(
+            f'PVC {pvc_name} does not exist while use_existing is True.')
+    pvc = kubernetes.core_api(
+        context).create_namespaced_persistent_volume_claim(namespace=namespace,
+                                                           body=pvc_spec)
+    logger.info(f'Created PVC {pvc_name} in namespace {namespace}')
+    if config is not None:
+        _populate_config_from_pvc(config, pvc)
 
 
 def _get_pvc_spec(namespace: str,
@@ -183,8 +318,8 @@ def _get_pvc_spec(namespace: str,
     access_mode = config.config.get('access_mode')
     size = config.size
     # The previous code assumes that the access_mode and size are always set.
-    assert access_mode is not None
-    assert size is not None
+    assert access_mode is not None, f'access_mode is None for volume ' \
+        f'{config.name_on_cloud}'
     pvc_spec: Dict[str, Any] = {
         'metadata': {
             'name': config.name_on_cloud,
@@ -196,13 +331,10 @@ def _get_pvc_spec(namespace: str,
         },
         'spec': {
             'accessModes': [access_mode],
-            'resources': {
-                'requests': {
-                    'storage': f'{size}Gi'
-                }
-            },
         }
     }
+    if size is not None:
+        pvc_spec['spec']['resources'] = {'requests': {'storage': f'{size}Gi'}}
     if config.labels:
         pvc_spec['metadata']['labels'].update(config.labels)
     storage_class = config.config.get('storage_class_name')
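
The volume.py hunks above replace per-volume pod scans with one batched lookup: get_all_volumes_usedby lists pods once per context and namespace and buckets them by PVC name, and map_all_volumes_usedby then answers each volume's query from those dictionaries. A minimal sketch of the intended call pattern, assuming the new functions above (the volume_configs list and the printing are illustrative, not from the diff):

# Sketch: batch the used-by lookup across many volumes, using the new
# functions added above. `volume_configs` is a hypothetical list of
# models.VolumeConfig objects describing Kubernetes PVC volumes.
from sky.provision.kubernetes import volume as k8s_volume

def report_usedby(volume_configs):
    # One pod listing per (context, namespace), grouped as
    # context -> namespace -> pvc_name -> [names].
    used_by_pods, used_by_clusters = k8s_volume.get_all_volumes_usedby(
        volume_configs)
    for config in volume_configs:
        # Per-volume answers are now dictionary lookups, not API calls.
        pods, clusters = k8s_volume.map_all_volumes_usedby(
            used_by_pods, used_by_clusters, config)
        print(f'{config.name_on_cloud}: pods={pods}, clusters={clusters}')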
@@ -68,9 +68,10 @@ def _get_private_ip(instance_info: Dict[str, Any], single_node: bool) -> str:
     return private_ip
 
 
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster"""
+    del cluster_name  # unused
     lambda_client = _get_lambda_client()
     pending_status = ['booting']
     while True:
@@ -106,34 +107,35 @@
     created_instance_ids = []
     remote_ssh_key_name = config.authentication_config['remote_key_name']
 
-    def launch_nodes(node_type: str, quantity: int) -> List[str]:
+    def launch_node(node_type: str) -> str:
         try:
             instance_ids = lambda_client.create_instances(
                 instance_type=config.node_config['InstanceType'],
                 region=region,
                 name=f'{cluster_name_on_cloud}-{node_type}',
-                quantity=quantity,
+                # Quantity cannot actually be greater than 1; see:
+                # https://github.com/skypilot-org/skypilot/issues/7084
+                quantity=1,
                 ssh_key_name=remote_ssh_key_name,
             )
-            logger.info(f'Launched {len(instance_ids)} {node_type} node(s), '
-                        f'instance_ids: {instance_ids}')
-            return instance_ids
+            logger.info(f'Launched {node_type} node, '
+                        f'instance_id: {instance_ids[0]}')
+            return instance_ids[0]
         except Exception as e:
             logger.warning(f'run_instances error: {e}')
             raise
 
     if head_instance_id is None:
-        instance_ids = launch_nodes('head', 1)
-        assert len(instance_ids) == 1
-        created_instance_ids.append(instance_ids[0])
-        head_instance_id = instance_ids[0]
+        head_instance_id = launch_node('head')
+        created_instance_ids.append(head_instance_id)
 
     assert head_instance_id is not None, 'head_instance_id should not be None'
 
     worker_node_count = to_start_count - 1
     if worker_node_count > 0:
-        instance_ids = launch_nodes('worker', worker_node_count)
-        created_instance_ids.extend(instance_ids)
+        for _ in range(worker_node_count):
+            worker_instance_id = launch_node('worker')
+            created_instance_ids.append(worker_instance_id)
 
     while True:
         instances = _filter_instances(cluster_name_on_cloud, ['active'])
@@ -230,9 +232,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
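The Lambda Cloud hunks above work around an API limitation: create_instances cannot actually launch more than one node per call (issue #7084), so the head node and each worker are now launched with separate calls. A minimal sketch of that sequential pattern, with a hypothetical launch_node(node_type) -> str standing in for the diff's helper:

from typing import Callable, List

def launch_cluster(launch_node: Callable[[str], str],
                   to_start_count: int) -> List[str]:
    # One API call per node: exactly one head, then the workers.
    created: List[str] = []
    created.append(launch_node('head'))
    for _ in range(to_start_count - 1):
        created.append(launch_node('worker'))
    return created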
@@ -65,9 +65,10 @@ def _wait_until_no_pending(region: str, cluster_name_on_cloud: str) -> None:
                          f' to be ready.')
 
 
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
+    del cluster_name  # unused
     _wait_until_no_pending(region, cluster_name_on_cloud)
     running_instances = _filter_instances(region, cluster_name_on_cloud,
                                           ['RUNNING'])
@@ -137,6 +138,8 @@
                 use_spot=config.node_config['use_spot'],
                 associate_public_ip_address=(
                     not config.provider_config['use_internal_ips']),
+                use_static_ip_address=config.provider_config.get(
+                    'use_static_ip_address', False),
                 filesystems=config.node_config.get('filesystems', []),
                 network_tier=config.node_config.get('network_tier'))
         except Exception as e:  # pylint: disable=broad-except
@@ -251,9 +254,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(provider_config['region'],
                                   cluster_name_on_cloud, None)
@@ -188,6 +188,7 @@ def launch(cluster_name_on_cloud: str,
            user_data: str,
            associate_public_ip_address: bool,
            filesystems: List[Dict[str, Any]],
+           use_static_ip_address: bool = False,
            use_spot: bool = False,
            network_tier: Optional[resources_utils.NetworkTier] = None) -> str:
     # Each node must have a unique name to avoid conflicts between
@@ -281,93 +282,109 @@
 
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
     logger.debug(f'Creating instance {instance_name} in project {project_id}.')
-    nebius.sync_call(
-        service.create(nebius.compute().CreateInstanceRequest(
-            metadata=nebius.nebius_common().ResourceMetadata(
-                parent_id=project_id,
-                name=instance_name,
-            ),
-            spec=nebius.compute().InstanceSpec(
-                gpu_cluster=nebius.compute().InstanceGpuClusterSpec(
-                    id=cluster_id,) if cluster_id is not None else None,
-                boot_disk=nebius.compute().AttachedDiskSpec(
-                    attach_mode=nebius.compute(
-                    ).AttachedDiskSpec.AttachMode.READ_WRITE,
-                    existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
-                cloud_init_user_data=user_data,
-                resources=nebius.compute().ResourcesSpec(platform=platform,
-                                                         preset=preset),
-                filesystems=filesystems_spec if filesystems_spec else None,
-                network_interfaces=[
-                    nebius.compute().NetworkInterfaceSpec(
-                        subnet_id=sub_net.items[0].metadata.id,
-                        ip_address=nebius.compute().IPAddress(),
-                        name='network-interface-0',
-                        public_ip_address=nebius.compute().PublicIPAddress()
-                        if associate_public_ip_address else None,
-                    )
-                ],
-                recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
-                if use_spot else None,
-                preemptible=nebius.compute().PreemptibleSpec(
-                    priority=1,
-                    on_preemption=nebius.compute().PreemptibleSpec.
-                    PreemptionPolicy.STOP) if use_spot else None,
-            ))))
-    instance_id = ''
-    retry_count = 0
-    while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
-        service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance = nebius.sync_call(
-            service.get_by_name(nebius.nebius_common().GetByNameRequest(
-                parent_id=project_id,
-                name=instance_name,
-            )))
-        instance_id = instance.metadata.id
-        if instance.status.state.name == 'STARTING':
-            break
-
-        # All Instances initially have state=STOPPED and reconciling=True,
-        # so we need to wait until reconciling is False.
-        if instance.status.state.name == 'STOPPED' and \
-                not instance.status.reconciling:
-            next_token = ''
-            total_operations = 0
-            while True:
-                operations_response = nebius.sync_call(
-                    service.list_operations_by_parent(
-                        nebius.compute().ListOperationsByParentRequest(
-                            parent_id=project_id,
-                            page_size=100,
-                            page_token=next_token,
-                        )))
-                total_operations += len(operations_response.operations)
-                for operation in operations_response.operations:
-                    # Find the most recent operation for the instance.
-                    if operation.resource_id == instance_id:
-                        error_msg = operation.description
-                        if operation.status:
-                            error_msg += f' {operation.status.message}'
-                        raise RuntimeError(error_msg)
-                # If we've fetched too many operations, or there are no more
-                # operations to fetch, just raise a generic error.
-                if total_operations > _MAX_OPERATIONS_TO_FETCH or \
-                        not operations_response.next_page_token:
-                    raise RuntimeError(
-                        f'Instance {instance_name} failed to start.')
-                next_token = operations_response.next_page_token
-        time.sleep(POLL_INTERVAL)
-        logger.debug(f'Waiting for instance {instance_name} to start running. '
-                     f'State: {instance.status.state.name}, '
-                     f'Reconciling: {instance.status.reconciling}')
-        retry_count += 1
+    try:
+        nebius.sync_call(
+            service.create(nebius.compute().CreateInstanceRequest(
+                metadata=nebius.nebius_common().ResourceMetadata(
+                    parent_id=project_id,
+                    name=instance_name,
+                ),
+                spec=nebius.compute().InstanceSpec(
+                    gpu_cluster=nebius.compute().InstanceGpuClusterSpec(
+                        id=cluster_id,) if cluster_id is not None else None,
+                    boot_disk=nebius.compute().AttachedDiskSpec(
+                        attach_mode=nebius.compute(
+                        ).AttachedDiskSpec.AttachMode.READ_WRITE,
+                        existing_disk=nebius.compute().ExistingDisk(
+                            id=disk_id)),
+                    cloud_init_user_data=user_data,
+                    resources=nebius.compute().ResourcesSpec(platform=platform,
+                                                             preset=preset),
+                    filesystems=filesystems_spec if filesystems_spec else None,
+                    network_interfaces=[
+                        nebius.compute().NetworkInterfaceSpec(
+                            subnet_id=sub_net.items[0].metadata.id,
+                            ip_address=nebius.compute().IPAddress(),
+                            name='network-interface-0',
+                            public_ip_address=nebius.compute().PublicIPAddress(
+                                static=use_static_ip_address)
+                            if associate_public_ip_address else None,
+                        )
+                    ],
+                    recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
+                    if use_spot else None,
+                    preemptible=nebius.compute().PreemptibleSpec(
+                        priority=1,
+                        on_preemption=nebius.compute().PreemptibleSpec.
+                        PreemptionPolicy.STOP) if use_spot else None,
+                ))))
+        instance_id = ''
+        retry_count = 0
+        while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
+            service = nebius.compute().InstanceServiceClient(nebius.sdk())
+            instance = nebius.sync_call(
+                service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                    parent_id=project_id,
+                    name=instance_name,
+                )))
+            instance_id = instance.metadata.id
+            if instance.status.state.name == 'STARTING':
+                break
+
+            # All Instances initially have state=STOPPED and reconciling=True,
+            # so we need to wait until reconciling is False.
+            if instance.status.state.name == 'STOPPED' and \
+                    not instance.status.reconciling:
+                next_token = ''
+                total_operations = 0
+                while True:
+                    operations_response = nebius.sync_call(
+                        service.list_operations_by_parent(
+                            nebius.compute().ListOperationsByParentRequest(
+                                parent_id=project_id,
+                                page_size=100,
+                                page_token=next_token,
+                            )))
+                    total_operations += len(operations_response.operations)
+                    for operation in operations_response.operations:
+                        # Find the most recent operation for the instance.
+                        if operation.resource_id == instance_id:
+                            error_msg = operation.description
+                            if operation.status:
+                                error_msg += f' {operation.status.message}'
+                            raise RuntimeError(error_msg)
+                    # If we've fetched too many operations, or there are no more
+                    # operations to fetch, just raise a generic error.
+                    if total_operations > _MAX_OPERATIONS_TO_FETCH or \
+                            not operations_response.next_page_token:
+                        raise RuntimeError(
+                            f'Instance {instance_name} failed to start.')
+                    next_token = operations_response.next_page_token
+            time.sleep(POLL_INTERVAL)
+            logger.debug(
+                f'Waiting for instance {instance_name} to start running. '
+                f'State: {instance.status.state.name}, '
+                f'Reconciling: {instance.status.reconciling}')
+            retry_count += 1
 
-    if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
-        raise TimeoutError(
-            f'Exceeded maximum retries '
-            f'({nebius.MAX_RETRIES_TO_INSTANCE_READY * POLL_INTERVAL}'
-            f' seconds) while waiting for instance {instance_name}'
-            f' to be ready.')
+        if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
+            raise TimeoutError(
+                f'Exceeded maximum retries '
+                f'({nebius.MAX_RETRIES_TO_INSTANCE_READY * POLL_INTERVAL}'
+                f' seconds) while waiting for instance {instance_name}'
+                f' to be ready.')
+    except nebius.request_error() as e:
+        # Handle ResourceExhausted quota limit error. In this case, we need to
+        # clean up the disk as VM creation failed and we can't proceed.
+        # It cannot be handled by the caller (provisioner)'s teardown logic,
+        # as we cannot retrieve the disk id after the instance creation fails.
+        logger.warning(f'Failed to launch instance {instance_name}: {e}')
+        service = nebius.compute().DiskServiceClient(nebius.sdk())
+        nebius.sync_call(
+            service.delete(nebius.compute().DeleteDiskRequest(id=disk_id)))
+        logger.debug(f'Disk {disk_id} deleted.')
+        raise e
     return instance_id
 
 
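The Nebius launch rewrite above wraps instance creation in try/except so that the pre-created boot disk is deleted when creation fails (for example on a quota error); the caller's teardown logic cannot do this because it never learns the disk id once creation has failed. A generic sketch of that compensation pattern, with hypothetical create_disk/create_instance/delete_disk callables:

def launch_with_cleanup(create_disk, create_instance, delete_disk):
    # Resources created before the failing step must be rolled back
    # here, since the caller never sees their ids on failure.
    disk_id = create_disk()
    try:
        return create_instance(disk_id)
    except Exception:
        delete_disk(disk_id)  # compensate, then surface the original error
        raise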
@@ -36,6 +36,7 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """Query instances.
 
@@ -44,7 +45,7 @@
     A None status means the instance is marked as "terminated"
     or "terminating".
     """
-    del cluster_name  # unusedå
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, cluster_name_on_cloud
     region = provider_config['region']
 
@@ -65,9 +66,10 @@
 
 
 @query_utils.debug_enabled(logger)
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Start instances with bootstrapped configuration."""
+    del cluster_name  # unused
     tags = dict(sorted(copy.deepcopy(config.tags).items()))
 
     start_time = round(time.time() * 1000)
@@ -48,10 +48,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     return head_instance_id
 
 
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
-
+    del cluster_name  # unused
     pending_status = [
         'starting', 'restarting', 'upgrading', 'provisioning', 'stopping'
     ]
@@ -281,9 +281,10 @@
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name, non_terminated_only  # unused
+    del cluster_name, non_terminated_only, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
@@ -0,0 +1,10 @@
+"""Prime Intellect provisioner for SkyPilot."""
+
+from sky.provision.primeintellect.config import bootstrap_instances
+from sky.provision.primeintellect.instance import cleanup_ports
+from sky.provision.primeintellect.instance import get_cluster_info
+from sky.provision.primeintellect.instance import query_instances
+from sky.provision.primeintellect.instance import run_instances
+from sky.provision.primeintellect.instance import stop_instances
+from sky.provision.primeintellect.instance import terminate_instances
+from sky.provision.primeintellect.instance import wait_instances
@@ -0,0 +1,11 @@
+"""Prime Intellect configuration bootstrapping."""
+
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del region, cluster_name  # unused
+    return config
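
Taken together, the two new Prime Intellect files show the shape of a per-cloud provisioner: a package under sky/provision/<cloud>/ whose __init__.py re-exports a fixed set of functions that sky/provision dispatches to. A hedged stub for a hypothetical new provider, using only signatures that appear elsewhere in this diff (bodies are placeholders, not SkyPilot's implementation):

"""Hypothetical sky/provision/mycloud/instance.py stub."""
from typing import Any, Dict, Optional, Tuple

from sky.provision import common

def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    del cluster_name  # unused, matching the providers above
    raise NotImplementedError

def query_instances(
    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
    retry_if_missing: bool = False,
) -> Dict[str, Tuple[Optional[Any], Optional[str]]]:
    del cluster_name, retry_if_missing  # unused
    raise NotImplementedError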