skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -1,5 +1,9 @@
1
1
  """SCP instance provisioning."""
2
2
 
3
+ from concurrent.futures import as_completed
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from copy import deepcopy
6
+ import hashlib
3
7
  import logging
4
8
  import random
5
9
  import string
@@ -13,25 +17,29 @@ from sky.utils import status_lib
13
17
  logger = logging.getLogger(__name__)
14
18
 
15
19
 
16
- def run_instances(region: str, cluster_name_on_cloud: str,
20
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
17
21
  config: common.ProvisionConfig) -> common.ProvisionRecord:
18
-
22
+ del cluster_name # unused
19
23
  zone_id = config.node_config['zone_id']
24
+
20
25
  running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
21
- head_instance_id = _get_head_instance_id(running_instances)
22
26
 
23
27
  to_start_count = config.count - len(running_instances)
28
+
24
29
  if to_start_count < 0:
25
30
  raise RuntimeError(
26
31
  f'Cluster {cluster_name_on_cloud} already has '
27
- f'{len(running_instances)} nodes, but {config.count} are required.')
32
+ f'{len(running_instances)} instances, but {config.count} '
33
+ 'are required')
28
34
 
29
35
  if to_start_count == 0:
36
+ head_instance_id = _get_head_instance_id(running_instances)
30
37
  if head_instance_id is None:
31
38
  raise RuntimeError(
32
- f'Cluster {cluster_name_on_cloud} has no head node.')
33
- logger.info(f'Cluster {cluster_name_on_cloud} already has '
34
- f'{len(running_instances)} nodes, no need to start more.')
39
+ f'Cluster {cluster_name_on_cloud} has no head instance')
40
+ logger.info(
41
+ f'Cluster {cluster_name_on_cloud} already has '
42
+ f'{len(running_instances)} instances, no need to start more')
35
43
  return common.ProvisionRecord(provider_name='scp',
36
44
  cluster_name=cluster_name_on_cloud,
37
45
  region=region,
@@ -40,64 +48,192 @@ def run_instances(region: str, cluster_name_on_cloud: str,
40
48
  resumed_instance_ids=[],
41
49
  created_instance_ids=[])
42
50
 
43
- stopped_instances = _filter_instances(cluster_name_on_cloud, ['STOPPED'])
44
- if to_start_count <= len(stopped_instances):
45
- head_instance_id = _get_head_instance_id(stopped_instances)
46
- scp_utils.SCPClient().start_instance(head_instance_id)
51
+ existing_instances = _filter_instances(cluster_name_on_cloud, None)
52
+ stopped_instances = _filter_instances(cluster_name_on_cloud,
53
+ ['STOPPED', 'STOPPING'])
54
+
55
+ def _detect_naming_version(existing_instances,
56
+ cluster_name_on_cloud) -> str:
57
+ v2_head = _head(cluster_name_on_cloud)
58
+ v2_worker_prefix = _worker(cluster_name_on_cloud)
59
+ has_v2 = any(instance['virtualServerName'] == v2_head or
60
+ instance['virtualServerName'].startswith(v2_worker_prefix)
61
+ for instance in existing_instances)
62
+ if has_v2:
63
+ return 'v2'
64
+ has_v1 = any(instance['virtualServerName'] == cluster_name_on_cloud
65
+ for instance in existing_instances)
66
+ if has_v1:
67
+ return 'v1'
68
+
69
+ if not existing_instances:
70
+ logger.debug(
71
+ 'detect_naming_version: no instances for cluster %s; '
72
+ 'defaulting to v2.', cluster_name_on_cloud)
73
+ else:
74
+ logger.error(
75
+ 'detect_naming_version: unexpected instance names for cluster '
76
+ '%s: %s; defaulting to v2.', cluster_name_on_cloud, [
77
+ instance['virtualServerName']
78
+ for instance in existing_instances
79
+ ])
80
+ return 'v2'
81
+
82
+ naming_version = _detect_naming_version(existing_instances,
83
+ cluster_name_on_cloud)
84
+
85
+ if naming_version == 'v2':
86
+ cluster_instance_names = [_head(cluster_name_on_cloud)] + [
87
+ f'{_worker(cluster_name_on_cloud)}-{i:02d}'
88
+ for i in range(1, config.count)
89
+ ]
90
+ else:
91
+ if config.count > 1:
92
+ raise RuntimeError(
93
+ 'This cluster uses the legacy naming scheme and cannot be '
94
+ 'scaled to multi-node automatically. '
95
+ 'Please `sky down` and relaunch.')
96
+ cluster_instance_names = [cluster_name_on_cloud]
97
+
98
+ existing_instance_names = [
99
+ instance['virtualServerName'] for instance in existing_instances
100
+ ]
101
+ resume_instance_names = [
102
+ instance['virtualServerName'] for instance in stopped_instances
103
+ ]
104
+ create_instance_names = [
105
+ instance_name for instance_name in cluster_instance_names
106
+ if instance_name not in existing_instance_names
107
+ ]
108
+
109
+ vpc_subnets = _get_or_create_vpc_subnets(zone_id)
110
+
111
+ def _resume(instance_name):
112
+ instance_id = _get_instance_id(instance_name, cluster_name_on_cloud)
47
113
  while True:
48
- instance_info = scp_utils.SCPClient().get_instance_info(
49
- head_instance_id)
50
- if instance_info['virtualServerState'] == 'RUNNING':
114
+ state = scp_utils.SCPClient().get_instance_info(
115
+ instance_id)['virtualServerState']
116
+ if state == 'RUNNING':
117
+ return instance_id, 'resumed'
118
+ if state == 'STOPPED':
51
119
  break
52
120
  time.sleep(2)
53
- resumed_instance_ids = [head_instance_id]
54
- return common.ProvisionRecord(provider_name='scp',
55
- cluster_name=cluster_name_on_cloud,
56
- region=region,
57
- zone=None,
58
- head_instance_id=head_instance_id,
59
- resumed_instance_ids=resumed_instance_ids,
60
- created_instance_ids=[])
61
121
 
62
- # SCP does not support multi-node
63
- instance_config = config.docker_config
64
- instance_config['virtualServerName'] = cluster_name_on_cloud
122
+ scp_utils.SCPClient().start_instance(instance_id)
123
+ while True:
124
+ info = scp_utils.SCPClient().get_instance_info(instance_id)
125
+ if info['virtualServerState'] == 'RUNNING':
126
+ return instance_id, 'resumed'
127
+ time.sleep(2)
65
128
 
66
- instance_id = None
67
- vpc_subnets = _get_or_create_vpc_subnets(zone_id)
68
- for vpc, subnets in vpc_subnets.items():
69
- sg_id = _create_security_group(zone_id, vpc)
70
- if sg_id is None:
71
- continue
72
- try:
73
- instance_config['securityGroupIds'] = [sg_id]
74
- for subnet in subnets:
75
- instance_config['nic']['subnetId'] = subnet
76
- instance_id = _create_instance(vpc, instance_config)
77
- if instance_id is not None:
78
- break
79
- except Exception as e: # pylint: disable=broad-except
80
- _delete_security_group(sg_id)
81
- logger.error(f'run_instances error: {e}')
82
- continue
129
+ def _create(instance_name):
130
+ instance_config = deepcopy(config.docker_config)
131
+ instance_config['virtualServerName'] = instance_name
132
+ cnt = config.count
133
+
134
+ for vpc, subnets in vpc_subnets.items():
135
+ sg_id = _create_security_group(zone_id, vpc, cnt)
136
+ if not sg_id:
137
+ continue
138
+
139
+ created_in_this_vpc = False
140
+ try:
141
+ instance_config['securityGroupIds'] = [sg_id]
142
+ for subnet in subnets:
143
+ instance_config['nic']['subnetId'] = subnet
144
+ instance_id = _create_instance(vpc, instance_config, cnt)
145
+ if instance_id:
146
+ created_in_this_vpc = True
147
+ return instance_id, 'created'
148
+ except Exception as e: # pylint: disable=broad-except
149
+ logger.error(f'run_instances error ({instance_name}): {e}')
150
+ finally:
151
+ if not created_in_this_vpc:
152
+ try:
153
+ _delete_security_group(sg_id)
154
+ except Exception: # pylint: disable=broad-except
155
+ pass
156
+
157
+ raise RuntimeError(f'instance creation error: {instance_name}')
158
+
159
+ tasks = (
160
+ [(_resume, instance_name) for instance_name in resume_instance_names] +
161
+ [(_create, instance_name) for instance_name in create_instance_names])
162
+
163
+ instance_ids_statuses = []
164
+ if tasks:
165
+ with ThreadPoolExecutor(max_workers=min(len(tasks), 32)) as ex:
166
+ execution = [
167
+ ex.submit(function, instance_name)
168
+ for function, instance_name in tasks
169
+ ]
170
+ for e in as_completed(execution):
171
+ try:
172
+ instance_ids_statuses.append(e.result())
173
+ except Exception as e: # pylint: disable=broad-except
174
+ logger.error(f'run_instances error: {e}')
175
+
176
+ wait_time = time.time() + 600
177
+ while time.time() < wait_time:
178
+ running_instances = _filter_instances(cluster_name_on_cloud,
179
+ ['RUNNING'])
180
+ if len(running_instances) == config.count:
181
+ break
182
+ pending_instances = _filter_instances(
183
+ cluster_name_on_cloud,
184
+ ['CREATING', 'EDITING', 'STARTING', 'RESTARTING', 'STOPPING'])
185
+ if not pending_instances:
186
+ break
187
+ time.sleep(3)
83
188
 
84
- if instance_id is None:
85
- raise RuntimeError('instance creation error')
189
+ running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
190
+ if len(running_instances) != config.count:
191
+ raise RuntimeError(f'Expected {config.count} running instances, '
192
+ f'but got {len(running_instances)} instances')
86
193
 
194
+ head_instance_id = _get_head_instance_id(running_instances)
87
195
  if head_instance_id is None:
88
- head_instance_id = instance_id
196
+ raise RuntimeError('Head instance is not running')
89
197
 
90
- created_instance_ids = [instance_id]
198
+ resumed_instance_ids = []
199
+ created_instance_ids = []
200
+ for instance_id, status in instance_ids_statuses:
201
+ if status == 'resumed':
202
+ resumed_instance_ids.append(instance_id)
203
+ elif status == 'created':
204
+ created_instance_ids.append(instance_id)
91
205
 
92
206
  return common.ProvisionRecord(provider_name='scp',
93
207
  cluster_name=cluster_name_on_cloud,
94
208
  region=region,
95
209
  zone=None,
96
210
  head_instance_id=head_instance_id,
97
- resumed_instance_ids=[],
211
+ resumed_instance_ids=resumed_instance_ids,
98
212
  created_instance_ids=created_instance_ids)
99
213
 
100
214
 
215
+ def _head(cluster_name_on_cloud: str):
216
+ return (f'{cluster_name_on_cloud[:8]}-'
217
+ f'{_suffix(cluster_name_on_cloud)}-head')
218
+
219
+
220
+ def _worker(cluster_name_on_cloud: str):
221
+ return (f'{cluster_name_on_cloud[:8]}-'
222
+ f'{_suffix(cluster_name_on_cloud)}-worker')
223
+
224
+
225
+ def _suffix(name: str, n: int = 5):
226
+ return hashlib.sha1(name.encode()).hexdigest()[:n]
227
+
228
+
229
+ def _get_instance_id(instance_name, cluster_name_on_cloud):
230
+ instances = _filter_instances(cluster_name_on_cloud, None)
231
+ for instance in instances:
232
+ if instance_name == instance['virtualServerName']:
233
+ return instance['virtualServerId']
234
+ return None
235
+
236
+
101
237
  def _get_or_create_vpc_subnets(zone_id):
102
238
  while len(_get_vcp_subnets(zone_id)) == 0:
103
239
  try:
@@ -182,28 +318,36 @@ def _get_vcp_subnets(zone_id):
182
318
def _filter_instances(cluster_name_on_cloud,
                      status_filter: Optional[List[str]]):
    """List the SCP instances that belong to a cluster.

    An instance belongs to the cluster when its ``virtualServerName`` is
    the head name from ``_head``, starts with the worker prefix from
    ``_worker``, or equals the bare cluster name (the older, "v1" head
    naming).

    Args:
        cluster_name_on_cloud: Cluster name used to derive instance names.
        status_filter: When not ``None``, keep only instances whose
            ``virtualServerState`` is in this list. ``None`` disables
            status filtering.

    Returns:
        The matching instance dicts, in the order the SCP client returned
        them.
    """
    all_instances = scp_utils.SCPClient().get_instances()
    head_name = _head(cluster_name_on_cloud)
    worker_prefix = _worker(cluster_name_on_cloud)

    def _in_cluster(server_name):
        # Accept the current head/worker naming and the older scheme in
        # which the head was named after the cluster itself.
        return (server_name == head_name or
                server_name.startswith(worker_prefix) or
                server_name == cluster_name_on_cloud)

    selected = []
    for candidate in all_instances:
        if not _in_cluster(candidate['virtualServerName']):
            continue
        if (status_filter is not None and
                candidate['virtualServerState'] not in status_filter):
            continue
        selected.append(candidate)
    return selected
195
338
 
196
339
 
197
340
  def _get_head_instance_id(instances):
198
- head_instance_id = None
199
341
  if len(instances) > 0:
200
- head_instance_id = instances[0]['virtualServerId']
201
- return head_instance_id
342
+ for instance in instances:
343
+ if instance['virtualServerName'].endswith('-head'):
344
+ return instance['virtualServerId']
345
+ return instances[0]['virtualServerId']
346
+ return None
202
347
 
203
348
 
204
- def _create_security_group(zone_id, vpc):
349
+ def _create_security_group(zone_id, vpc, cnt):
205
350
  sg_name = 'sky' + ''.join(random.choices(string.ascii_lowercase, k=8))
206
-
207
351
  undo_func_stack = []
208
352
  try:
209
353
  response = scp_utils.SCPClient().create_security_group(
@@ -222,8 +366,8 @@ def _create_security_group(zone_id, vpc):
222
366
  break
223
367
  time.sleep(5)
224
368
 
225
- scp_utils.SCPClient().add_security_group_rule(sg_id, 'IN', None)
226
- scp_utils.SCPClient().add_security_group_rule(sg_id, 'OUT', None)
369
+ scp_utils.SCPClient().add_security_group_rule(sg_id, 'IN', None, cnt)
370
+ scp_utils.SCPClient().add_security_group_rule(sg_id, 'OUT', None, cnt)
227
371
 
228
372
  return sg_id
229
373
  except Exception as e: # pylint: disable=broad-except
@@ -252,7 +396,7 @@ def _undo_functions(undo_func_list):
252
396
  func()
253
397
 
254
398
 
255
- def _create_instance(vpc_id, instance_config):
399
+ def _create_instance(vpc_id, instance_config, cnt):
256
400
  undo_func_stack = []
257
401
  try:
258
402
  instance = scp_utils.SCPClient().create_instance(instance_config)
@@ -265,10 +409,12 @@ def _create_instance(vpc_id, instance_config):
265
409
  undo_func_stack.append(lambda: _delete_instance(instance_id))
266
410
  firewall_id = _get_firewall_id(vpc_id)
267
411
  internal_ip = instance_info['ip']
268
- in_rule_id = _add_firewall_rule(firewall_id, internal_ip, 'IN', None)
412
+ in_rule_id = _add_firewall_rule(firewall_id, internal_ip, 'IN', None,
413
+ cnt)
269
414
  undo_func_stack.append(
270
415
  lambda: _delete_firewall_rule(firewall_id, in_rule_id))
271
- out_rule_id = _add_firewall_rule(firewall_id, internal_ip, 'OUT', None)
416
+ out_rule_id = _add_firewall_rule(firewall_id, internal_ip, 'OUT', None,
417
+ cnt)
272
418
  undo_func_stack.append(
273
419
  lambda: _delete_firewall_rule(firewall_id, out_rule_id))
274
420
  return instance_id
@@ -305,20 +451,22 @@ def _get_firewall_id(vpc_id):
305
451
 
306
452
 
307
453
  def _add_firewall_rule(firewall_id, internal_ip, direction,
308
- ports: Optional[List[str]]):
454
+ ports: Optional[List[str]], cnt: Optional[int]):
309
455
  attempts = 0
310
456
  max_attempts = 300
311
-
312
457
  while attempts < max_attempts:
313
458
  try:
314
459
  rule_info = scp_utils.SCPClient().add_firewall_rule(
315
- firewall_id, internal_ip, direction, ports)
316
- rule_id = rule_info['resourceId']
317
- while True:
318
- rule_info = scp_utils.SCPClient().get_firewall_rule_info(
319
- firewall_id, rule_id)
320
- if rule_info['ruleState'] == 'ACTIVE':
321
- return rule_id
460
+ firewall_id, internal_ip, direction, ports, cnt)
461
+ if rule_info is not None:
462
+ rule_id = rule_info['resourceId']
463
+ while True:
464
+ rule_info = scp_utils.SCPClient().get_firewall_rule_info(
465
+ firewall_id, rule_id)
466
+ if rule_info['ruleState'] == 'ACTIVE':
467
+ return rule_id
468
+ else:
469
+ return None
322
470
  except Exception as e: # pylint: disable=broad-except
323
471
  attempts += 1
324
472
  time.sleep(10)
@@ -330,13 +478,12 @@ def _add_firewall_rule(firewall_id, internal_ip, direction,
330
478
  def _delete_firewall_rule(firewall_id, rule_ids):
331
479
  if not isinstance(rule_ids, list):
332
480
  rule_ids = [rule_ids]
333
-
334
481
  attempts = 0
335
482
  max_attempts = 300
336
483
  while attempts < max_attempts:
337
484
  try:
338
485
  scp_utils.SCPClient().delete_firewall_rule(firewall_id, rule_ids)
339
- if _remaining_firewall_rule(firewall_id, rule_ids) is False:
486
+ if not _remaining_firewall_rule(firewall_id, rule_ids):
340
487
  return
341
488
  except Exception as e: # pylint: disable=broad-except
342
489
  attempts += 1
@@ -385,19 +532,35 @@ def stop_instances(
385
532
  provider_config: Optional[Dict[str, Any]] = None,
386
533
  worker_only: bool = False,
387
534
  ) -> None:
388
- del provider_config, worker_only
389
- instances = scp_utils.SCPClient().get_instances()
535
+ del provider_config
536
+ instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
390
537
 
391
- for instance in instances:
392
- if instance['virtualServerName'] == cluster_name_on_cloud:
538
+ if worker_only:
539
+ head_instance_name = _head(cluster_name_on_cloud)
540
+ instances = [
541
+ instance for instance in instances
542
+ if instance['virtualServerName'] != head_instance_name
543
+ ]
544
+
545
+ if not instances:
546
+ return
547
+
548
+ def _stop(instance):
549
+ try:
393
550
  instance_id = instance['virtualServerId']
394
551
  scp_utils.SCPClient().stop_instance(instance_id)
395
552
  while True:
396
- instance_info = scp_utils.SCPClient().get_instance_info(
397
- instance_id)
553
+ info = scp_utils.SCPClient().get_instance_info(instance_id)
554
+ if info['virtualServerState'] == 'STOPPED':
555
+ return instance_id
398
556
  time.sleep(2)
399
- if instance_info['virtualServerState'] == 'STOPPED':
400
- break
557
+ except Exception as e: # pylint: disable=broad-except
558
+ logger.error(f'stop_instances error: {e}')
559
+
560
+ with ThreadPoolExecutor(max_workers=min(len(instances), 32)) as ex:
561
+ execution = [ex.submit(_stop, instance) for instance in instances]
562
+ for e in as_completed(execution):
563
+ e.result()
401
564
 
402
565
 
403
566
  def terminate_instances(
@@ -405,25 +568,37 @@ def terminate_instances(
405
568
  provider_config: Optional[Dict[str, Any]] = None,
406
569
  worker_only: bool = False,
407
570
  ) -> None:
408
- del provider_config, worker_only
409
- instances = scp_utils.SCPClient().get_instances()
571
+ del provider_config
572
+ instances = _filter_instances(cluster_name_on_cloud, ['RUNNING', 'STOPPED'])
410
573
 
411
- for instance in instances:
412
- if instance['virtualServerName'] == cluster_name_on_cloud:
413
- try:
414
- instance_id = instance['virtualServerId']
415
- instance_info = scp_utils.SCPClient().get_instance_info(
416
- instance_id)
417
- vpc_id = instance_info['vpcId']
418
- sg_id = instance_info['securityGroupIds'][0]['securityGroupId']
419
- firewall_id = _get_firewall_id(vpc_id)
420
- rule_ids = _get_firewall_rule_ids(instance_info, firewall_id,
421
- None)
422
- _delete_firewall_rule(firewall_id, rule_ids)
423
- _delete_instance(instance_id)
424
- _delete_security_group(sg_id)
425
- except Exception as e: # pylint: disable=broad-except
426
- logger.error(f'terminate_instances error: {e}')
574
+ if worker_only:
575
+ head_instance_name = _head(cluster_name_on_cloud)
576
+ instances = [
577
+ instance for instance in instances
578
+ if instance['virtualServerName'] != head_instance_name
579
+ ]
580
+
581
+ if not instances:
582
+ return
583
+
584
+ def _terminate(instance):
585
+ try:
586
+ instance_id = instance['virtualServerId']
587
+ instance_info = scp_utils.SCPClient().get_instance_info(instance_id)
588
+ vpc_id = instance_info['vpcId']
589
+ sg_id = instance_info['securityGroupIds'][0]['securityGroupId']
590
+ firewall_id = _get_firewall_id(vpc_id)
591
+ rule_ids = _get_firewall_rule_ids(instance_info, firewall_id, None)
592
+ _delete_firewall_rule(firewall_id, rule_ids)
593
+ _delete_instance(instance_id)
594
+ _delete_security_group(sg_id)
595
+ except Exception as e: # pylint: disable=broad-except
596
+ logger.error(f'terminate_instances error: {e}')
597
+
598
+ with ThreadPoolExecutor(max_workers=min(len(instances), 32)) as ex:
599
+ execution = [ex.submit(_terminate, instance) for instance in instances]
600
+ for e in as_completed(execution):
601
+ e.result()
427
602
 
428
603
 
429
604
  def query_instances(
@@ -431,8 +606,9 @@ def query_instances(
431
606
  cluster_name_on_cloud: str,
432
607
  provider_config: Optional[Dict[str, Any]] = None,
433
608
  non_terminated_only: bool = True,
609
+ retry_if_missing: bool = False,
434
610
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
435
- del cluster_name # unused
611
+ del cluster_name, retry_if_missing # unused
436
612
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
437
613
  instances = _filter_instances(cluster_name_on_cloud, None)
438
614
 
@@ -467,7 +643,6 @@ def get_cluster_info(
467
643
  cluster_name_on_cloud: str,
468
644
  provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
469
645
  del region
470
-
471
646
  running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
472
647
  head_instance_id = _get_head_instance_id(running_instances)
473
648
 
@@ -482,9 +657,19 @@ def get_cluster_info(
482
657
  tags={})
483
658
  ]
484
659
 
660
+ # max-worker-port - min-worker-port should be at least 3 * nproc
661
+ # RAY_worker_maximum_startup_concurrency for the performance
662
+ custom_ray_options = {
663
+ 'node-manager-port': 11001,
664
+ 'min-worker-port': 11002,
665
+ 'max-worker-port': 11200,
666
+ 'ray-client-server-port': 10001
667
+ }
668
+
485
669
  return common.ClusterInfo(
486
670
  instances=instances,
487
671
  head_instance_id=head_instance_id,
672
+ custom_ray_options=custom_ray_options,
488
673
  provider_name='scp',
489
674
  provider_config=provider_config,
490
675
  )
@@ -495,20 +680,16 @@ def open_ports(
495
680
  ports: List[str],
496
681
  provider_config: Optional[Dict[str, Any]] = None,
497
682
  ) -> None:
498
-
499
683
  del provider_config
500
- instances = scp_utils.SCPClient().get_instances()
501
-
502
- for instance in instances:
503
- if instance['virtualServerName'] == cluster_name_on_cloud:
504
- instance_info = scp_utils.SCPClient().get_instance_info(
505
- instance['virtualServerId'])
506
- sg_id = instance_info['securityGroupIds'][0]['securityGroupId']
507
- scp_utils.SCPClient().add_security_group_rule(sg_id, 'IN', ports)
508
- vpc_id = instance_info['vpcId']
509
- internal_ip = instance_info['ip']
510
- firewall_id = _get_firewall_id(vpc_id)
511
- _add_firewall_rule(firewall_id, internal_ip, 'IN', ports)
684
+ instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
685
+ head_instance_id = _get_head_instance_id(instances)
686
+ instance_info = scp_utils.SCPClient().get_instance_info(head_instance_id)
687
+ sg_id = instance_info['securityGroupIds'][0]['securityGroupId']
688
+ scp_utils.SCPClient().add_security_group_rule(sg_id, 'IN', ports, None)
689
+ vpc_id = instance_info['vpcId']
690
+ internal_ip = instance_info['ip']
691
+ firewall_id = _get_firewall_id(vpc_id)
692
+ _add_firewall_rule(firewall_id, internal_ip, 'IN', ports, None)
512
693
 
513
694
 
514
695
  def cleanup_ports(
@@ -516,15 +697,11 @@ def cleanup_ports(
516
697
  ports: List[str],
517
698
  provider_config: Optional[Dict[str, Any]] = None,
518
699
  ) -> None:
519
-
520
700
  del provider_config
521
- instances = scp_utils.SCPClient().get_instances()
522
-
523
- for instance in instances:
524
- if instance['virtualServerName'] == cluster_name_on_cloud:
525
- instance_info = scp_utils.SCPClient().get_instance_info(
526
- instance['virtualServerId'])
527
- vpc_id = instance_info['vpcId']
528
- firewall_id = _get_firewall_id(vpc_id)
529
- rule_ids = _get_firewall_rule_ids(instance_info, firewall_id, ports)
530
- _delete_firewall_rule(firewall_id, rule_ids)
701
+ instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
702
+ head_instance_id = _get_head_instance_id(instances)
703
+ instance_info = scp_utils.SCPClient().get_instance_info(head_instance_id)
704
+ vpc_id = instance_info['vpcId']
705
+ firewall_id = _get_firewall_id(vpc_id)
706
+ rule_ids = _get_firewall_rule_ids(instance_info, firewall_id, ports)
707
+ _delete_firewall_rule(firewall_id, rule_ids)
@@ -0,0 +1,11 @@
1
+ """Seeweb provisioner for SkyPilot."""
2
+
3
+ from sky.provision.seeweb.config import bootstrap_instances
4
+ from sky.provision.seeweb.instance import cleanup_ports
5
+ from sky.provision.seeweb.instance import get_cluster_info
6
+ from sky.provision.seeweb.instance import open_ports
7
+ from sky.provision.seeweb.instance import query_instances
8
+ from sky.provision.seeweb.instance import run_instances
9
+ from sky.provision.seeweb.instance import stop_instances
10
+ from sky.provision.seeweb.instance import terminate_instances
11
+ from sky.provision.seeweb.instance import wait_instances
@@ -0,0 +1,13 @@
1
+ """Configuration for Seeweb provisioning."""
2
+
3
+ from typing import Any, Dict
4
+
5
+
6
def bootstrap_instances(*args, **kwargs) -> Dict[str, Any]:
    """Bootstrap instances for Seeweb.

    Seeweb doesn't require any special configuration bootstrapping,
    so we just return the config as-is.

    Args:
        *args: Positional arguments; the third one is the provisioning
            config. (Presumably the first two are the region and the
            cluster name, as for other provisioners -- confirm against
            the caller.)
        **kwargs: Keyword arguments. ``config`` is honoured when the
            config was not supplied positionally.

    Returns:
        The config object that was passed in, unchanged.

    Raises:
        IndexError: If the config is supplied neither as the third
            positional argument nor as the ``config`` keyword.
    """
    # Accept the keyword form too; a purely positional lookup would raise
    # IndexError for callers that pass ``config=...``.
    if len(args) < 3 and 'config' in kwargs:
        return kwargs['config']
    return args[2]