skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/provision/seeweb/instance.py
@@ -0,0 +1,812 @@
+ """Seeweb provisioner for SkyPilot / Ray autoscaler.
+
+ Prerequisites:
+     pip install ecsapi
+ """
+
+ import os
+ import subprocess
+ import time
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from sky import sky_logging
+ from sky.adaptors import seeweb as seeweb_adaptor
+ from sky.provision import common
+ from sky.provision.common import ClusterInfo
+ from sky.provision.common import InstanceInfo
+ from sky.provision.common import ProvisionConfig
+ from sky.provision.common import ProvisionRecord
+ from sky.utils import auth_utils
+ from sky.utils import command_runner  # Unified SSH helper
+ from sky.utils import common_utils
+ from sky.utils import status_lib
+
+ logger = sky_logging.init_logger(__name__)
+
+ # Singleton Seeweb client reused across the module
+ _seeweb_client = None
+
+
+ def _get_seeweb_client():
+     """Return a singleton Seeweb ECS API client."""
+     global _seeweb_client
+     if _seeweb_client is None:
+         # Initialize via adaptor's cached client
+         _seeweb_client = seeweb_adaptor.client()
+     return _seeweb_client
+
+
+ # --------------------------------------------------------------------------- #
+ # Useful constants
+ # --------------------------------------------------------------------------- #
+ _POLL_INTERVAL = 5  # sec
+ _MAX_BOOT_TIME = 1200  # sec
+ _ACTION_WATCH_MAX_RETRY = 360  # number of polls before giving up
+ _ACTION_WATCH_FETCH_EVERY = 5  # seconds between polls
+ _API_RETRY_MAX_RETRIES = 5
+ _API_RETRY_INITIAL_BACKOFF = 1
+
+
+ # --------------------------------------------------------------------------- #
+ # Class required by the Ray backend
+ # --------------------------------------------------------------------------- #
+ class SeewebNodeProvider:
+     """Minimalist provisioner for Seeweb ECS."""
+
+     def __init__(self, provider_config: ProvisionConfig, cluster_name: str):
+         """provider_config: dict populated by template (plan, image, location,
+         remote_key_name, optional gpu…)
+         cluster_name : SkyPilot name on cloud (used in notes)
+         """
+         self.config = provider_config
+         self.cluster_name = cluster_name
+         # Reuse a singleton Seeweb client to avoid repeated authentications/API
+         # object creations across different provider instances.
+         self.ecs = _get_seeweb_client()
+
+     def _get_ssh_user(self) -> str:
+         # Prefer auth config; fallback to template default for Seeweb
+         return (self.config.authentication_config.get('ssh_user') if self.config
+                 and self.config.authentication_config else None) or 'ecuser'
+
+     def _get_private_key_path(self) -> str:
+         # Prefer explicit path from auth config; otherwise use SkyPilot key
+         key_path = None
+         if self.config and self.config.authentication_config:
+             key_path = self.config.authentication_config.get('ssh_private_key')
+         if not key_path:
+             key_path, _ = auth_utils.get_or_generate_keys()
+         return os.path.expanduser(key_path)
+
+     # ------------------------------------------------------------------ #
+     # Helper: run a command on the VM via SSH using CommandRunner
+     # ------------------------------------------------------------------ #
+     def _run_remote(self,
+                     server_ip: str,
+                     cmd: str,
+                     *,
+                     timeout: int = 30,
+                     stream_logs: bool = False) -> subprocess.CompletedProcess:
+         """Execute *cmd* on the remote host.
+
+         Uses sky.utils.command_runner.SSHCommandRunner for consistent SSH
+         options across all providers.
+         Returns a subprocess.CompletedProcess-like
+         object with returncode, stdout, stderr.
+         """
+         runner = command_runner.SSHCommandRunner(
+             node=(server_ip, 22),
+             ssh_user=self._get_ssh_user(),
+             ssh_private_key=self._get_private_key_path(),
+         )
+         rc, stdout, stderr = runner.run(cmd,
+                                         stream_logs=stream_logs,
+                                         require_outputs=True,
+                                         connect_timeout=timeout)
+         # Convert to simple namespace for compatibility
+         proc = subprocess.CompletedProcess(args=cmd,
+                                            returncode=rc,
+                                            stdout=stdout.encode(),
+                                            stderr=stderr.encode())
+         return proc
+
+     # --------------------------------------------------------------------- #
+     # 1. bootstrap_instances – no preprocessing needed here
+     # --------------------------------------------------------------------- #
+
+     # --------------------------------------------------------------------- #
+     # 2. run_instances: restart or create until we reach count
+     # --------------------------------------------------------------------- #
+     def run_instances(self, config: Dict, count: int) -> None:
+         existing = self._query_cluster_nodes()
+         del config  # unused
+         running = [
+             s for s in existing if s.status in ('Booted', 'Running', 'RUNNING',
+                                                 'Booting', 'PoweringOn')
+         ]
+
+         # a) restart Off servers
+         for srv in (s for s in existing if s.status == 'Booted'):
+             specific_status = self.ecs.fetch_server_status(srv.name)
+             if specific_status == 'SHUTOFF':
+                 logger.info(f'Powering on server {srv.name}')
+                 self._power_on(srv.name)
+                 running.append(srv)
+                 if len(running) >= count:
+                     break
+
+         # b) create new VMs if missing
+         while len(running) < count:
+             self._create_server()
+             running.append({})  # placeholder
+
+     # --------------------------------------------------------------------- #
+     # 3. terminate_instances
+     # --------------------------------------------------------------------- #
+     def terminate_instances(self) -> None:
+         for srv in self._query_cluster_nodes():
+             logger.info('Deleting server %s …', srv.name)
+             # DELETE /servers/{name}, retried with exponential backoff
+             # to handle transient API errors
+             common_utils.retry(self.ecs.delete_server,
+                                max_retries=5,
+                                initial_backoff=1)(srv.name)
+
+     # --------------------------------------------------------------------- #
+     # 4. stop_instances
+     # --------------------------------------------------------------------- #
+     def stop_instances(self) -> None:
+         cluster_nodes = self._query_cluster_nodes()
+
+         for srv in cluster_nodes:
+             specific_status = self.ecs.fetch_server_status(srv.name)
+
+             if specific_status == 'SHUTOFF':
+                 logger.info(f'\nServer {srv.name} is already stopped\n')
+                 continue
+             elif srv.status in ('Booted', 'Running', 'RUNNING'):
+                 # Get specific status to check if server is not already SHUTOFF
+                 try:
+                     specific_status = self.ecs.fetch_server_status(srv.name)
+                     # Continue with power off only if
+                     # specific_status is not SHUTOFF
+                     # and general status is not STOPPED
+                     if specific_status != 'SHUTOFF' and srv.status != 'STOPPED':
+                         self._power_off(srv.name)
+                 except Exception:  # pylint: disable=broad-except
+                     # Fallback: if we can't get specific
+                     # status, use general status check
+                     if srv.status != 'STOPPED':
+                         self._power_off(srv.name)
+             else:
+                 logger.info(f'\nServer {srv.name} has status '
+                             f'{srv.status}, skipping\n')
+         # Wait for all servers to be actually stopped with forced refresh
+         self._wait_for_stop_with_forced_refresh()
+
+     # --------------------------------------------------------------------- #
+     # 5. query_instances
+     # --------------------------------------------------------------------- #
+     def query_instances(self) -> Dict[str, str]:
+         """Query instances status using both fetch_servers()
+         and fetch_server_status().
+
+         Seeweb has two different APIs:
+         - fetch_servers() returns states like 'Booted', 'Booting'
+         - fetch_server_status() returns states like 'SHUTOFF' (stopped)
+
+         We need to use fetch_server_status() to get the correct stopped state.
+         """
+         instances = {}
+         cluster_nodes = self._query_cluster_nodes()
+
+         for server in cluster_nodes:
+             # Always try to get the specific status first for more accuracy
+             try:
+                 specific_status = self.ecs.fetch_server_status(server.name)
+                 instances[server.name] = specific_status
+             except Exception:  # pylint: disable=broad-except
+                 # Fallback to general status if fetch_server_status fails
+                 general_status = server.status
+                 instances[server.name] = general_status
+
+         return instances
+
+     # --------------------------------------------------------------------- #
+     # 6. wait_instances
+     # --------------------------------------------------------------------- #
+     def wait_instances(self, desired_state: str = 'Booted') -> None:
+         deadline = time.time() + _MAX_BOOT_TIME
+
+         while time.time() < deadline:
+             cluster_nodes = self._query_cluster_nodes()
+
+             # For SHUTOFF state, we need to use fetch_server_status()
+             # to get the real status
+             if desired_state == 'SHUTOFF':
+                 all_shutoff = True
+                 for server in cluster_nodes:
+                     try:
+                         specific_status = self.ecs.fetch_server_status(
+                             server.name)
+                         if specific_status != 'SHUTOFF':
+                             all_shutoff = False
+                     except Exception:  # pylint: disable=broad-except
+                         all_shutoff = False
+
+                 if all_shutoff:
+                     return
+             else:
+                 # For other states, use the general status
+                 states = {srv.status for srv in cluster_nodes}
+
+                 if states <= {desired_state}:
+                     # If all servers are Booted, wait
+                     # for them to be truly stable
+                     if desired_state == 'Booted':
+                         if self._wait_for_all_servers_stable():
+                             return
+                         else:
+                             time.sleep(_POLL_INTERVAL)
+                             continue
+                     return
+
+             time.sleep(_POLL_INTERVAL)
+
+         raise TimeoutError(
+             f'Nodes are not all in state {desired_state} within timeout')
+
+     def _wait_for_all_servers_stable(self, max_wait: int = 600) -> bool:
+         """Waits for all cluster servers to be stable."""
+         logger.info('Checking stability of all cluster servers...')
+
+         start_time = time.time()
+         while time.time() - start_time < max_wait:
+             cluster_nodes = self._query_cluster_nodes()
+             all_stable = True
+
+             for node in cluster_nodes:
+                 if node.status == 'Booted':
+                     # Check that server is reachable via ping
+                     if not self._ping_server(node.ipv4):
+                         logger.warning(f'Server {node.name} ({node.ipv4}) '
+                                        f'not reachable via ping')
+                         all_stable = False
+                         break
+
+                     # SSH readiness handled by provisioner.wait_for_ssh()
+
+                     logger.info(f'Server {node.name} ({node.ipv4}) is stable')
+
+             if all_stable:
+                 logger.info('All servers are stable')
+                 # Safety sleep to allow for late reboots
+                 logger.info('Waiting 1 second to allow for late reboots...')
+                 time.sleep(1)
+                 return True
+
+             logger.info('Waiting for all servers to be stable...')
+             time.sleep(1)
+
+         logger.error('Timeout waiting for server stability')
+         return False
+
+     def _ping_server(self, server_ip: str) -> bool:
+         """Check that server is reachable via ping."""
+         try:
+             result = subprocess.run(['ping', '-c', '1', '-W', '5', server_ip],
+                                     capture_output=True,
+                                     timeout=10,
+                                     check=False)
+             return result.returncode == 0
+         except Exception as e:  # pylint: disable=broad-except
+             logger.debug(f'Error pinging {server_ip}: {e}')
+             return False
+
+     def _check_ssh_ready(self, server_ip: str) -> bool:
+         """Check that SSH is available on the server."""
+         try:
+             ssh_user = self._get_ssh_user()
+             private_key_path = self._get_private_key_path()
+             result = subprocess.run([
+                 'ssh', '-o', 'ConnectTimeout=10', '-o',
+                 'StrictHostKeyChecking=no', '-o',
+                 f'UserKnownHostsFile={os.devnull}', '-o',
+                 f'GlobalKnownHostsFile={os.devnull}', '-o',
+                 'IdentitiesOnly=yes', '-i', private_key_path,
+                 f'{ssh_user}@{server_ip}', 'echo "SSH ready"'
+             ],
+                                     capture_output=True,
+                                     timeout=15,
+                                     check=False)
+             return result.returncode == 0
+         except Exception as e:  # pylint: disable=broad-except
+             logger.debug(f'Error checking SSH on {server_ip}: {e}')
+             return False
+
+     # ------------------------------------------------------------------ #
+     # 7. open_ports / cleanup_ports – Seeweb has all ports open by default
+     # ------------------------------------------------------------------ #
+     def open_ports(
+         self,
+         cluster_name_on_cloud: str,
+         ports: List[str],
+         provider_config: Optional[Dict[str, Any]] = None,
+     ) -> None:
+         """See sky/provision/__init__.py"""
+         logger.debug(f'Skip opening ports {ports} for Seeweb instances, as all '
+                      'ports are open by default.')
+         del cluster_name_on_cloud, provider_config, ports
+
+     def cleanup_ports(
+         self,
+         cluster_name_on_cloud: str,
+         ports: List[str],
+         provider_config: Optional[Dict[str, Any]] = None,
+     ) -> None:
+         del cluster_name_on_cloud, ports, provider_config  # Unused.
+
+     # ====================== private helpers ========================= #
+     def _query_cluster_nodes(self):
+         """List servers with notes == cluster_name."""
+         servers = common_utils.retry(
+             self.ecs.fetch_servers,
+             max_retries=_API_RETRY_MAX_RETRIES,
+             initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+         return [
+             s for s in servers
+             if s.notes and s.notes.startswith(self.cluster_name)
+         ]
+
+     def query_cluster_nodes(self):
+         """Public wrapper for querying cluster nodes for this cluster."""
+         return common_utils.retry(self._query_cluster_nodes,
+                                   max_retries=_API_RETRY_MAX_RETRIES,
+                                   initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+
+     def _get_head_instance_id(self) -> Optional[str]:
+         """Return head instance id for this cluster.
+
+         Prefer notes == "{cluster}-head"; fallback to first node if none
+         matches (legacy naming).
+         """
+         nodes = common_utils.retry(self._query_cluster_nodes,
+                                    max_retries=_API_RETRY_MAX_RETRIES,
+                                    initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+         for node in nodes:
+             try:
+                 if getattr(node, 'notes', None) == f'{self.cluster_name}-head':
+                     return node.name
+                 if getattr(node, 'name', None) and node.name.endswith('-head'):
+                     return node.name
+             except Exception:  # pylint: disable=broad-except
+                 continue
+         return nodes[0].name if nodes else None
+
+     def get_head_instance_id(self) -> Optional[str]:
+         """Public wrapper for getting head instance id."""
+         return common_utils.retry(self._get_head_instance_id,
+                                   max_retries=_API_RETRY_MAX_RETRIES,
+                                   initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+
+     def _create_server(self):
+         """POST /servers with complete payload."""
+         node_type = 'head'
+         payload = {
+             'plan': self.config.node_config.get('plan'),  # e.g. eCS4
+             'image': self.config.node_config.get('image'),  # e.g. ubuntu-2204
+             'location': self.config.node_config.get('location'),  # e.g. it-mi2
+             'notes': f'{self.cluster_name}-{node_type}',
+             'ssh_key': self.config.authentication_config.get('remote_key_name'
+                                                              ),  # remote key
+         }
+
+         # Optional GPU
+         if 'gpu' in self.config.node_config:
+             payload.update({
+                 'gpu': self.config.node_config.get('gpu'),
+                 'gpu_label': self.config.node_config.get('gpu_label', ''),
+             })
+
+         # Add user_customize if present (Seeweb Cloud Script)
+         if 'user_customize' in self.config.node_config:
+             payload['user_customize'] = self.config.node_config[
+                 'user_customize']
+
+         # Build the request object expected by ecsapi
+         server_create_request_cls = (
+             seeweb_adaptor.ecsapi.ServerCreateRequest  # type: ignore
+         )
+         create_request = server_create_request_cls(**payload)
+
+         logger.info('Creating Seeweb server %s', payload)
+
+         # POST /servers – returns (response, action_id)
+         _, action_id = common_utils.retry(
+             self.ecs.create_server,
+             max_retries=_API_RETRY_MAX_RETRIES,
+             initial_backoff=_API_RETRY_INITIAL_BACKOFF)(
+                 create_request, check_if_can_create=False)
+         self.ecs.watch_action(action_id,
+                               max_retry=_ACTION_WATCH_MAX_RETRY,
+                               fetch_every=_ACTION_WATCH_FETCH_EVERY)
+
+     def _power_on(self, server_id: str):
+         try:
+             common_utils.retry(
+                 self.ecs.turn_on_server,
+                 max_retries=_API_RETRY_MAX_RETRIES,
+                 initial_backoff=_API_RETRY_INITIAL_BACKOFF)(server_id)
+         except seeweb_adaptor.SeewebError as e:
+             logger.error(f'Error in _power_on for {server_id}: {e}')
+             raise
+
+     def _power_off(self, server_id: str):
+         try:
+             common_utils.retry(
+                 self.ecs.turn_off_server,
+                 max_retries=_API_RETRY_MAX_RETRIES,
+                 initial_backoff=_API_RETRY_INITIAL_BACKOFF)(server_id)
+         except seeweb_adaptor.SeewebError as e:
+             logger.error(f'\n\nError in _power_off for {server_id}: {e}')
+             raise
+
+     def _wait_action(self, action_id: int):
+         """Poll action until it completes."""
+         while True:
+             action = common_utils.retry(
+                 self.ecs.fetch_action,
+                 max_retries=_API_RETRY_MAX_RETRIES,
+                 initial_backoff=_API_RETRY_INITIAL_BACKOFF)(action_id)
+             if action['status'] in ('completed', 'ok', 'no_content'):
+                 return
+             if action['status'] == 'error':
+                 raise RuntimeError(f'Seeweb action {action_id} failed')
+             time.sleep(_POLL_INTERVAL)
+
+     def _wait_for_stop_with_forced_refresh(self, max_wait: int = 300) -> None:
+         """Wait for servers to be stopped with
+         aggressive polling and forced refresh."""
+         start_time = time.time()
+         poll_interval = 1  # 1 second for aggressive polling
+
+         while time.time() - start_time < max_wait:
+             # Force refresh by re-fetching cluster nodes
+             cluster_nodes = common_utils.retry(
+                 self._query_cluster_nodes,
+                 max_retries=_API_RETRY_MAX_RETRIES,
+                 initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+
+             all_stopped = True
+             for server in cluster_nodes:
+                 try:
+                     # Always use fetch_server_status() for accurate status
+                     specific_status = common_utils.retry(
+                         self.ecs.fetch_server_status,
+                         max_retries=_API_RETRY_MAX_RETRIES,
+                         initial_backoff=_API_RETRY_INITIAL_BACKOFF)(server.name)
+
+                     if specific_status != 'SHUTOFF':
+                         all_stopped = False
+
+                 except Exception:  # pylint: disable=broad-except
+                     all_stopped = False
+
+             if all_stopped:
+                 return
+
+             time.sleep(poll_interval)
+
+         raise TimeoutError(f'Servers not stopped within {max_wait} seconds')
+
+
+ # =============================================================================
+ # Standalone functions required by the provisioning interface
+ # =============================================================================
+
+
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
+                   config: ProvisionConfig) -> ProvisionRecord:
+     """Run instances for Seeweb cluster."""
+     del cluster_name  # unused
+     provider = SeewebNodeProvider(config, cluster_name_on_cloud)
+     provider.run_instances(config.node_config, config.count)
+
+     # Find the head node using notes convention
+     cluster_nodes = provider.query_cluster_nodes()
+     if not cluster_nodes:
+         raise RuntimeError(
+             f'No nodes found for cluster {cluster_name_on_cloud}')
+     head_node_id = provider.get_head_instance_id()
+     assert head_node_id is not None, 'head_instance_id should not be None'
+
+     return ProvisionRecord(
+         provider_name='Seeweb',
+         region=region,
+         zone=None,  # Seeweb doesn't use zones
+         cluster_name=cluster_name_on_cloud,
+         head_instance_id=head_node_id,
+         resumed_instance_ids=[],  # Empty for now
+         created_instance_ids=[node.name for node in cluster_nodes],
+     )
+
+
+ def stop_instances(
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     worker_only: bool = False,
+ ) -> None:
+     """Stop instances for Seeweb cluster."""
+     del worker_only  # unused - Seeweb doesn't distinguish between head/worker
+     assert provider_config is not None
+
+     # Convert Dict to ProvisionConfig for SeewebNodeProvider
+     config = common.ProvisionConfig(
+         provider_config=provider_config,
+         authentication_config={},
+         docker_config={},
+         node_config=provider_config,
+         count=1,  # Not used for stop operation
+         tags={},
+         resume_stopped_nodes=False,
+         ports_to_open_on_launch=None,
+     )
+     provider = SeewebNodeProvider(config, cluster_name_on_cloud)
+     provider.stop_instances()
+
+
+ def terminate_instances(
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     worker_only: bool = False,
+ ) -> None:
+     """Terminate instances for Seeweb cluster."""
+     del worker_only  # unused - Seeweb doesn't distinguish between head/worker
+     assert provider_config is not None
+     # Convert Dict to ProvisionConfig for SeewebNodeProvider
+     config = common.ProvisionConfig(
+         provider_config=provider_config,
+         authentication_config={},
+         docker_config={},
+         node_config=provider_config,
+         count=1,  # Not used for terminate operation
+         tags={},
+         resume_stopped_nodes=False,
+         ports_to_open_on_launch=None,
+     )
+     provider = SeewebNodeProvider(config, cluster_name_on_cloud)
+     provider.terminate_instances()
+
+
+ def wait_instances(
+     region: str,
+     cluster_name_on_cloud: str,
+     state: Optional[status_lib.ClusterStatus],
+ ) -> None:
+     del region  # unused
+     # Map ClusterStatus to Seeweb string
+     if state == status_lib.ClusterStatus.UP:
+         seeweb_state = 'Booted'
+     elif state == status_lib.ClusterStatus.STOPPED:
+         seeweb_state = 'SHUTOFF'
+     elif state is None:
+         seeweb_state = 'Terminated'  # For termination
+     else:
+         seeweb_state = 'Booted'  # Default fallback
+
+     # Create Seeweb client directly and wait
+     client = _get_seeweb_client()
+     deadline = time.time() + _MAX_BOOT_TIME
+     while time.time() < deadline:
+         cluster_nodes = [
+             s for s in client.fetch_servers()
+             if s.notes and s.notes.startswith(cluster_name_on_cloud)
+         ]
+         if not cluster_nodes:
+             time.sleep(_POLL_INTERVAL)
+             continue
+
+         states = {srv.status for srv in cluster_nodes}
+         if states <= {seeweb_state}:
+             # If all servers are Booted, wait for them to be truly stable
+             if seeweb_state == 'Booted':
+                 if _wait_for_all_servers_stable_standalone(cluster_nodes):
+                     return
+                 else:
+                     time.sleep(_POLL_INTERVAL)
+                     continue
+             return
+         time.sleep(_POLL_INTERVAL)
+
+     raise TimeoutError(
+         f'Nodes are not all in state {seeweb_state} within timeout')
+
+
+ def _wait_for_all_servers_stable_standalone(cluster_nodes,
+                                             max_wait: int = 300) -> bool:
+     """Waits for all cluster servers to be stable (standalone version)."""
+     start_time = time.time()
+     while time.time() - start_time < max_wait:
+         all_stable = True
+
+         for node in cluster_nodes:
+             if node.status == 'Booted':
+                 # Check that server is reachable via ping
+                 if not _ping_server_standalone(node.ipv4):
+                     all_stable = False
+                     break
+
+                 # Do not check SSH here; handled by provisioner.wait_for_ssh().
+
+         if all_stable:
+             # Safety sleep to allow for late reboots
+             time.sleep(1)
+             return True
+
+         time.sleep(1)
+
+     return False
+
+
+ def _ping_server_standalone(server_ip: str) -> bool:
+     """Check that server is reachable via ping (standalone version)."""
+     try:
+         result = subprocess.run(['ping', '-c', '1', '-W', '5', server_ip],
+                                 capture_output=True,
+                                 timeout=10,
+                                 check=False)
+         return result.returncode == 0
+     except Exception as e:  # pylint: disable=broad-except
+         logger.error(f'Error pinging {server_ip}: {e}')
+         return False
+
+
+ def _check_ssh_ready_standalone(server_ip: str) -> bool:
+     """Check that SSH is available on the server (standalone version)."""
+     try:
+         private_key_path, _ = auth_utils.get_or_generate_keys()
+         private_key_path = os.path.expanduser(private_key_path)
+         ssh_user = 'ecuser'
+         result = subprocess.run([
+             'ssh', '-o', 'ConnectTimeout=10', '-o', 'StrictHostKeyChecking=no',
+             '-o', f'UserKnownHostsFile={os.devnull}', '-o',
+             f'GlobalKnownHostsFile={os.devnull}', '-o', 'IdentitiesOnly=yes',
+             '-i', private_key_path, f'{ssh_user}@{server_ip}',
+             'echo "SSH ready"'
+         ],
+                                 capture_output=True,
+                                 timeout=15,
+                                 check=False)
+         return result.returncode == 0
+     except Exception:  # pylint: disable=broad-except
+         return False
+
+
+ def query_instances(
+     cluster_name: str,
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     non_terminated_only: bool = True,
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+     """Query instances status for Seeweb cluster."""
+     del cluster_name  # unused
+     # Use the provided provider_config or default to empty dict
+     if provider_config is None:
+         provider_config = {}
+
+     # Convert Dict to ProvisionConfig for SeewebNodeProvider
+     config = common.ProvisionConfig(
+         provider_config=provider_config,
+         authentication_config={},
+         docker_config={},
+         node_config=provider_config,
+         count=1,  # Not used for query operation
+         tags={},
+         resume_stopped_nodes=False,
+         ports_to_open_on_launch=None,
+     )
+     provider = SeewebNodeProvider(config, cluster_name_on_cloud)
+     seeweb_instances = provider.query_instances()
+
+     # Map Seeweb status to SkyPilot status
+     status_map = {
+         'Booted':
+             status_lib.ClusterStatus.UP,  # Seeweb uses "Booted" for running
+         'RUNNING': status_lib.ClusterStatus.UP,  # All caps version
+         'Booting': status_lib.ClusterStatus.INIT,
+         'PoweringOn': status_lib.ClusterStatus.INIT,
+         'Off': status_lib.ClusterStatus.STOPPED,
+         'Stopped': status_lib.ClusterStatus.STOPPED,
+         'SHUTOFF':
+             status_lib.ClusterStatus.STOPPED,  # Add missing SHUTOFF status
+         'PoweringOff': status_lib.ClusterStatus.
+                        STOPPED,  # Fixed: should be STOPPED, not INIT
+     }
+
+     result: Dict[str, Tuple[Optional[status_lib.ClusterStatus],
+                             Optional[str]]] = {}
+     for name, seeweb_status in seeweb_instances.items():
+         if non_terminated_only and seeweb_status in ('Terminated', 'Deleted'):
+             continue
+         mapped_status = status_map.get(seeweb_status,
+                                        status_lib.ClusterStatus.INIT)
+         # Return tuple of (status, reason) where reason is None for Seeweb
+         result[name] = (mapped_status, None)
+
+     return result
+
+
+ # Signature should not include provider_name; router strips it before calling
+ def get_cluster_info(
+     region: str,
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> 'ClusterInfo':
+     del region  # unused
+     # Use Seeweb client to get cluster instances
+     client = _get_seeweb_client()
+     cluster_nodes = [
+         s for s in client.fetch_servers()
+         if s.notes and s.notes.startswith(cluster_name_on_cloud)
+     ]
+
+     if not cluster_nodes:
+         raise RuntimeError(
+             f'No instances found for cluster {cluster_name_on_cloud}')
+
+     instances = {}
+     head_instance = None
+     for node in cluster_nodes:
+         if getattr(node, 'notes', None) == f'{cluster_name_on_cloud}-head':
+             head_instance = node.name
+             break
+     if head_instance is None:
+         head_instance = cluster_nodes[0].name
+
+     for node in cluster_nodes:
+         # For Seeweb, we take the first node as head
+         if head_instance is None:
+             head_instance = node.name
+
+         # Get server IP (Seeweb uses 'ipv4' attribute)
+         external_ip = node.ipv4
+         internal_ip = external_ip  # For Seeweb, internal IP = external IP
+
+         instances[node.name] = [
+             InstanceInfo(
+                 instance_id=node.name,
+                 internal_ip=internal_ip,
+                 external_ip=external_ip,
+                 ssh_port=22,
+                 tags={},
+             )
+         ]
+
+     return ClusterInfo(
+         instances=instances,
+         head_instance_id=head_instance,
+         provider_name='Seeweb',
+         provider_config=provider_config,
+     )
+
+
+ def open_ports(
+     cluster_name_on_cloud: str,
+     ports: List[str],
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> None:
+     del provider_config  # Unused
+     logger.debug(f'Seeweb: skipping open_ports for {cluster_name_on_cloud}, '
+                  f'ports={ports}; all ports are open by default')
+     return
+
+
+ def cleanup_ports(
+     cluster_name_on_cloud: str,
+     ports: List[str],
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> None:
+     del cluster_name_on_cloud, ports, provider_config  # Unused.
+     return
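
As a rough orientation for readers of this diff, the sketch below shows how the module-level entry points added here might be exercised; it is illustrative only and not part of either wheel. SkyPilot normally drives these functions through its provision router rather than calling them directly. The cluster names are hypothetical placeholders, and the plan/image/location values are taken from the examples in the module's own comments (eCS4, ubuntu-2204, it-mi2).

# Illustrative sketch only (not from the package): polling a hypothetical
# Seeweb cluster through the module-level functions defined above.
from sky.provision.seeweb import instance as seeweb_instance
from sky.utils import status_lib

provider_config = {
    'plan': 'eCS4',          # example values from the module's comments
    'image': 'ubuntu-2204',
    'location': 'it-mi2',
}

# query_instances() maps Seeweb states ('Booted', 'SHUTOFF', ...) to SkyPilot
# ClusterStatus values and returns {server_name: (status, reason)}.
statuses = seeweb_instance.query_instances(
    cluster_name='my-cluster',
    cluster_name_on_cloud='my-cluster-1234',
    provider_config=provider_config)

if all(status == status_lib.ClusterStatus.UP
       for status, _ in statuses.values()):
    # get_cluster_info() returns the SSH endpoints; on Seeweb the internal IP
    # equals the external IP and the head node is identified by its '-head'
    # note, as implemented above.
    info = seeweb_instance.get_cluster_info(
        region='it-mi2',
        cluster_name_on_cloud='my-cluster-1234',
        provider_config=provider_config)
    print(info.head_instance_id)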