skypilot-nightly 1.0.0.dev20250509-py3-none-any.whl → 1.0.0.dev20251107-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
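
For readers who want to reproduce this kind of file-level comparison locally, here is a minimal sketch (not part of skypilot-nightly itself) that lists added and removed files between the two wheels named in the header; the wheel filenames are inferred from that header, and since a wheel is a zip archive the Python standard library suffices.

import zipfile

OLD = 'skypilot_nightly-1.0.0.dev20250509-py3-none-any.whl'
NEW = 'skypilot_nightly-1.0.0.dev20251107-py3-none-any.whl'


def wheel_files(path: str) -> set:
    """Return the set of file paths stored inside a wheel (a zip archive)."""
    with zipfile.ZipFile(path) as whl:
        return set(whl.namelist())


old_files, new_files = wheel_files(OLD), wheel_files(NEW)
print('added:  ', len(new_files - old_files), 'files')
print('removed:', len(old_files - new_files), 'files')
print('present in both:', len(old_files & new_files), 'files')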


Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,807 @@
+ """Seeweb provisioner for SkyPilot / Ray autoscaler.
+
+ Prerequisites:
+     pip install ecsapi
+ """
+
+ import os
+ import subprocess
+ import time
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from sky import sky_logging
+ from sky.adaptors import seeweb as seeweb_adaptor
+ from sky.provision import common
+ from sky.provision.common import ClusterInfo
+ from sky.provision.common import InstanceInfo
+ from sky.provision.common import ProvisionConfig
+ from sky.provision.common import ProvisionRecord
+ from sky.utils import auth_utils
+ from sky.utils import command_runner  # Unified SSH helper
+ from sky.utils import common_utils
+ from sky.utils import status_lib
+
+ logger = sky_logging.init_logger(__name__)
+
+ # Singleton Seeweb client reused across the module
+ _seeweb_client = None
+
+
+ def _get_seeweb_client():
+     """Return a singleton Seeweb ECS API client."""
+     global _seeweb_client
+     if _seeweb_client is None:
+         # Initialize via adaptor's cached client
+         _seeweb_client = seeweb_adaptor.client()
+     return _seeweb_client
+
+
+ # --------------------------------------------------------------------------- #
+ # Useful constants
+ # --------------------------------------------------------------------------- #
+ _POLL_INTERVAL = 5  # sec
+ _MAX_BOOT_TIME = 1200  # sec
+ _ACTION_WATCH_MAX_RETRY = 360  # number of polls before giving up
+ _ACTION_WATCH_FETCH_EVERY = 5  # seconds between polls
+ _API_RETRY_MAX_RETRIES = 5
+ _API_RETRY_INITIAL_BACKOFF = 1
+
+
+ # --------------------------------------------------------------------------- #
+ # Class required by the Ray backend
+ # --------------------------------------------------------------------------- #
+ class SeewebNodeProvider:
+     """Minimalist provisioner for Seeweb ECS."""
+
+     def __init__(self, provider_config: ProvisionConfig, cluster_name: str):
+         """provider_config: config populated by the template (plan, image,
+         location, remote_key_name, optional gpu…)
+         cluster_name: SkyPilot name on the cloud (used in notes)
+         """
+         self.config = provider_config
+         self.cluster_name = cluster_name
+         # Reuse a singleton Seeweb client to avoid repeated authentications/API
+         # object creations across different provider instances.
+         self.ecs = _get_seeweb_client()
+
+     def _get_ssh_user(self) -> str:
+         # Prefer auth config; fallback to template default for Seeweb
+         return (self.config.authentication_config.get('ssh_user') if self.config
+                 and self.config.authentication_config else None) or 'ecuser'
+
+     def _get_private_key_path(self) -> str:
+         # Prefer explicit path from auth config; otherwise use SkyPilot key
+         key_path = None
+         if self.config and self.config.authentication_config:
+             key_path = self.config.authentication_config.get('ssh_private_key')
+         if not key_path:
+             key_path, _ = auth_utils.get_or_generate_keys()
+         return os.path.expanduser(key_path)
+
+     # ------------------------------------------------------------------ #
+     # Helper: run a command on the VM via SSH using CommandRunner
+     # ------------------------------------------------------------------ #
+     def _run_remote(self,
+                     server_ip: str,
+                     cmd: str,
+                     *,
+                     timeout: int = 30,
+                     stream_logs: bool = False) -> subprocess.CompletedProcess:
+         """Execute *cmd* on the remote host.
+
+         Uses sky.utils.command_runner.SSHCommandRunner for consistent SSH
+         options across all providers. Returns a subprocess.CompletedProcess-like
+         object with returncode, stdout, stderr.
+         """
+         runner = command_runner.SSHCommandRunner(
+             node=(server_ip, 22),
+             ssh_user=self._get_ssh_user(),
+             ssh_private_key=self._get_private_key_path(),
+         )
+         rc, stdout, stderr = runner.run(cmd,
+                                         stream_logs=stream_logs,
+                                         require_outputs=True,
+                                         connect_timeout=timeout)
+         # Convert to simple namespace for compatibility
+         proc = subprocess.CompletedProcess(args=cmd,
+                                            returncode=rc,
+                                            stdout=stdout.encode(),
+                                            stderr=stderr.encode())
+         return proc
+
+     # --------------------------------------------------------------------- #
+     # 1. bootstrap_instances – no preprocessing needed here
+     # --------------------------------------------------------------------- #
+
+     # --------------------------------------------------------------------- #
+     # 2. run_instances: restart or create until we reach count
+     # --------------------------------------------------------------------- #
+     def run_instances(self, config: Dict, count: int) -> None:
+         existing = self._query_cluster_nodes()
+         del config  # unused
+         running = [
+             s for s in existing if s.status in ('Booted', 'Running', 'RUNNING',
+                                                 'Booting', 'PoweringOn')
+         ]
+
+         # a) restart powered-off servers
+         for srv in (s for s in existing if s.status == 'Booted'):
+             specific_status = self.ecs.fetch_server_status(srv.name)
+             if specific_status == 'SHUTOFF':
+                 logger.info(f'Powering on server {srv.name}')
+                 self._power_on(srv.name)
+                 running.append(srv)
+                 if len(running) >= count:
+                     break
+
+         # b) create new VMs if some are missing
+         while len(running) < count:
+             self._create_server()
+             running.append({})  # placeholder
+
+     # --------------------------------------------------------------------- #
+     # 3. terminate_instances
+     # --------------------------------------------------------------------- #
+     def terminate_instances(self) -> None:
+         for srv in self._query_cluster_nodes():
+             logger.info('Deleting server %s …', srv.name)
+             # DELETE /servers/{name}, retried with exponential backoff
+             # to handle transient API errors.
+             common_utils.retry(self.ecs.delete_server,
+                                max_retries=5,
+                                initial_backoff=1)(srv.name)
+
+     # --------------------------------------------------------------------- #
+     # 4. stop_instances
+     # --------------------------------------------------------------------- #
+     def stop_instances(self) -> None:
+         cluster_nodes = self._query_cluster_nodes()
+
+         for srv in cluster_nodes:
+             specific_status = self.ecs.fetch_server_status(srv.name)
+
+             if specific_status == 'SHUTOFF':
+                 logger.info(f'\nServer {srv.name} is already stopped\n')
+                 continue
+             elif srv.status in ('Booted', 'Running', 'RUNNING'):
+                 # Re-check the specific status to make sure the server is not
+                 # already SHUTOFF before powering it off.
+                 try:
+                     specific_status = self.ecs.fetch_server_status(srv.name)
+                     # Power off only if the specific status is not SHUTOFF
+                     # and the general status is not STOPPED.
+                     if specific_status != 'SHUTOFF' and srv.status != 'STOPPED':
+                         self._power_off(srv.name)
+                 except Exception:  # pylint: disable=broad-except
+                     # Fallback: if we can't get the specific status, use the
+                     # general status check.
+                     if srv.status != 'STOPPED':
+                         self._power_off(srv.name)
+             else:
+                 logger.info(f'\nServer {srv.name} has status '
+                             f'{srv.status}, skipping\n')
+         # Wait for all servers to be actually stopped, with forced refresh
+         self._wait_for_stop_with_forced_refresh()
+
+     # --------------------------------------------------------------------- #
+     # 5. query_instances
+     # --------------------------------------------------------------------- #
+     def query_instances(self) -> Dict[str, str]:
+         """Query instance status using both fetch_servers()
+         and fetch_server_status().
+
+         Seeweb has two different APIs:
+         - fetch_servers() returns states like 'Booted', 'Booting'
+         - fetch_server_status() returns states like 'SHUTOFF' (stopped)
+
+         We need fetch_server_status() to get the correct stopped state.
+         """
+         instances = {}
+         cluster_nodes = self._query_cluster_nodes()
+
+         for server in cluster_nodes:
+             # Always try to get the specific status first for more accuracy
+             try:
+                 specific_status = self.ecs.fetch_server_status(server.name)
+                 instances[server.name] = specific_status
+             except Exception:  # pylint: disable=broad-except
+                 # Fall back to the general status if fetch_server_status fails
+                 general_status = server.status
+                 instances[server.name] = general_status
+
+         return instances
+
+     # --------------------------------------------------------------------- #
+     # 6. wait_instances
+     # --------------------------------------------------------------------- #
+     def wait_instances(self, desired_state: str = 'Booted') -> None:
+         deadline = time.time() + _MAX_BOOT_TIME
+
+         while time.time() < deadline:
+             cluster_nodes = self._query_cluster_nodes()
+
+             # For SHUTOFF state, we need to use fetch_server_status()
+             # to get the real status
+             if desired_state == 'SHUTOFF':
+                 all_shutoff = True
+                 for server in cluster_nodes:
+                     try:
+                         specific_status = self.ecs.fetch_server_status(
+                             server.name)
+                         if specific_status != 'SHUTOFF':
+                             all_shutoff = False
+                     except Exception:  # pylint: disable=broad-except
+                         all_shutoff = False
+
+                 if all_shutoff:
+                     return
+             else:
+                 # For other states, use the general status
+                 states = {srv.status for srv in cluster_nodes}
+
+                 if states <= {desired_state}:
+                     # If all servers are Booted, wait for them
+                     # to be truly stable
+                     if desired_state == 'Booted':
+                         if self._wait_for_all_servers_stable():
+                             return
+                         else:
+                             time.sleep(_POLL_INTERVAL)
+                             continue
+                     return
+
+             time.sleep(_POLL_INTERVAL)
+
+         raise TimeoutError(
+             f'Nodes are not all in state {desired_state} within timeout')
+
+     def _wait_for_all_servers_stable(self, max_wait: int = 600) -> bool:
+         """Waits for all cluster servers to be stable."""
+         logger.info('Checking stability of all cluster servers...')
+
+         start_time = time.time()
+         while time.time() - start_time < max_wait:
+             cluster_nodes = self._query_cluster_nodes()
+             all_stable = True
+
+             for node in cluster_nodes:
+                 if node.status == 'Booted':
+                     # Check that the server is reachable via ping
+                     if not self._ping_server(node.ipv4):
+                         logger.warning(f'Server {node.name} ({node.ipv4}) '
+                                        f'not reachable via ping')
+                         all_stable = False
+                         break
+
+                     # SSH readiness handled by provisioner.wait_for_ssh()
+
+                     logger.info(f'Server {node.name} ({node.ipv4}) is stable')
+
+             if all_stable:
+                 logger.info('All servers are stable')
+                 # Safety sleep to allow for late reboots
+                 logger.info('Waiting 1 second to allow for late reboots...')
+                 time.sleep(1)
+                 return True
+
+             logger.info('Waiting for all servers to be stable...')
+             time.sleep(1)
+
+         logger.error('Timeout waiting for server stability')
+         return False
+
+     def _ping_server(self, server_ip: str) -> bool:
+         """Check that server is reachable via ping."""
+         try:
+             result = subprocess.run(['ping', '-c', '1', '-W', '5', server_ip],
+                                     capture_output=True,
+                                     timeout=10,
+                                     check=False)
+             return result.returncode == 0
+         except Exception as e:  # pylint: disable=broad-except
+             logger.debug(f'Error pinging {server_ip}: {e}')
+             return False
+
+     def _check_ssh_ready(self, server_ip: str) -> bool:
+         """Check that SSH is available on the server."""
+         try:
+             ssh_user = self._get_ssh_user()
+             private_key_path = self._get_private_key_path()
+             result = subprocess.run([
+                 'ssh', '-o', 'ConnectTimeout=10', '-o',
+                 'StrictHostKeyChecking=no', '-o',
+                 f'UserKnownHostsFile={os.devnull}', '-o',
+                 f'GlobalKnownHostsFile={os.devnull}', '-o',
+                 'IdentitiesOnly=yes', '-i', private_key_path,
+                 f'{ssh_user}@{server_ip}', 'echo "SSH ready"'
+             ],
+                                     capture_output=True,
+                                     timeout=15,
+                                     check=False)
+             return result.returncode == 0
+         except Exception as e:  # pylint: disable=broad-except
+             logger.debug(f'Error checking SSH on {server_ip}: {e}')
+             return False
+
+     # ------------------------------------------------------------------ #
+     # 7. open_ports / cleanup_ports – Seeweb has all ports open by default
+     # ------------------------------------------------------------------ #
+     def open_ports(
+         self,
+         cluster_name_on_cloud: str,
+         ports: List[str],
+         provider_config: Optional[Dict[str, Any]] = None,
+     ) -> None:
+         """See sky/provision/__init__.py"""
+         logger.debug(f'Skip opening ports {ports} for Seeweb instances, as all '
+                      'ports are open by default.')
+         del cluster_name_on_cloud, provider_config, ports
+
+     def cleanup_ports(
+         self,
+         cluster_name_on_cloud: str,
+         ports: List[str],
+         provider_config: Optional[Dict[str, Any]] = None,
+     ) -> None:
+         del cluster_name_on_cloud, ports, provider_config  # Unused.
+
+     # ====================== private helpers ========================= #
+     def _query_cluster_nodes(self):
+         """List servers with notes == cluster_name."""
+         servers = common_utils.retry(
+             self.ecs.fetch_servers,
+             max_retries=_API_RETRY_MAX_RETRIES,
+             initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+         return [
+             s for s in servers
+             if s.notes and s.notes.startswith(self.cluster_name)
+         ]
+
+     def query_cluster_nodes(self):
+         """Public wrapper for querying cluster nodes for this cluster."""
+         return common_utils.retry(self._query_cluster_nodes,
+                                   max_retries=_API_RETRY_MAX_RETRIES,
+                                   initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+
+     def _get_head_instance_id(self) -> Optional[str]:
+         """Return head instance id for this cluster.
+
+         Prefer notes == "{cluster}-head"; fallback to first node if none
+         matches (legacy naming).
+         """
+         nodes = common_utils.retry(self._query_cluster_nodes,
+                                    max_retries=_API_RETRY_MAX_RETRIES,
+                                    initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+         for node in nodes:
+             try:
+                 if getattr(node, 'notes', None) == f'{self.cluster_name}-head':
+                     return node.name
+                 if getattr(node, 'name', None) and node.name.endswith('-head'):
+                     return node.name
+             except Exception:  # pylint: disable=broad-except
+                 continue
+         return nodes[0].name if nodes else None
+
+     def get_head_instance_id(self) -> Optional[str]:
+         """Public wrapper for getting head instance id."""
+         return common_utils.retry(self._get_head_instance_id,
+                                   max_retries=_API_RETRY_MAX_RETRIES,
+                                   initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+
+     def _create_server(self):
+         """POST /servers with complete payload."""
+         node_type = 'head'
+         payload = {
+             'plan': self.config.node_config.get('plan'),  # e.g. eCS4
+             'image': self.config.node_config.get('image'),  # e.g. ubuntu-2204
+             'location': self.config.node_config.get('location'),  # e.g. it-mi2
+             'notes': f'{self.cluster_name}-{node_type}',
+             'ssh_key': self.config.authentication_config.get(
+                 'remote_key_name'),  # remote key
+         }
+
+         # Optional GPU
+         if 'gpu' in self.config.node_config:
+             payload.update({
+                 'gpu': self.config.node_config.get('gpu'),
+                 'gpu_label': self.config.node_config.get('gpu_label', ''),
+             })
+
+         # Build the request object expected by ecsapi
+         server_create_request_cls = (
+             seeweb_adaptor.ecsapi.ServerCreateRequest  # type: ignore
+         )
+         create_request = server_create_request_cls(**payload)
+
+         logger.info('Creating Seeweb server %s', payload)
+
+         # POST /servers – returns (response, action_id)
+         _, action_id = common_utils.retry(
+             self.ecs.create_server,
+             max_retries=_API_RETRY_MAX_RETRIES,
+             initial_backoff=_API_RETRY_INITIAL_BACKOFF)(
+                 create_request, check_if_can_create=False)
+         self.ecs.watch_action(action_id,
+                               max_retry=_ACTION_WATCH_MAX_RETRY,
+                               fetch_every=_ACTION_WATCH_FETCH_EVERY)
+
+     def _power_on(self, server_id: str):
+         try:
+             common_utils.retry(
+                 self.ecs.turn_on_server,
+                 max_retries=_API_RETRY_MAX_RETRIES,
+                 initial_backoff=_API_RETRY_INITIAL_BACKOFF)(server_id)
+         except seeweb_adaptor.SeewebError as e:
+             logger.error(f'Error in _power_on for {server_id}: {e}')
+             raise
+
+     def _power_off(self, server_id: str):
+         try:
+             common_utils.retry(
+                 self.ecs.turn_off_server,
+                 max_retries=_API_RETRY_MAX_RETRIES,
+                 initial_backoff=_API_RETRY_INITIAL_BACKOFF)(server_id)
+         except seeweb_adaptor.SeewebError as e:
+             logger.error(f'\n\nError in _power_off for {server_id}: {e}')
+             raise
+
+     def _wait_action(self, action_id: int):
+         """Poll action until it completes."""
+         while True:
+             action = common_utils.retry(
+                 self.ecs.fetch_action,
+                 max_retries=_API_RETRY_MAX_RETRIES,
+                 initial_backoff=_API_RETRY_INITIAL_BACKOFF)(action_id)
+             if action['status'] in ('completed', 'ok', 'no_content'):
+                 return
+             if action['status'] == 'error':
+                 raise RuntimeError(f'Seeweb action {action_id} failed')
+             time.sleep(_POLL_INTERVAL)
+
+     def _wait_for_stop_with_forced_refresh(self, max_wait: int = 300) -> None:
+         """Wait for servers to be stopped with
+         aggressive polling and forced refresh."""
+         start_time = time.time()
+         poll_interval = 1  # 1 second for aggressive polling
+
+         while time.time() - start_time < max_wait:
+             # Force refresh by re-fetching cluster nodes
+             cluster_nodes = common_utils.retry(
+                 self._query_cluster_nodes,
+                 max_retries=_API_RETRY_MAX_RETRIES,
+                 initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
+
+             all_stopped = True
+             for server in cluster_nodes:
+                 try:
+                     # Always use fetch_server_status() for accurate status
+                     specific_status = common_utils.retry(
+                         self.ecs.fetch_server_status,
+                         max_retries=_API_RETRY_MAX_RETRIES,
+                         initial_backoff=_API_RETRY_INITIAL_BACKOFF)(server.name)
+
+                     if specific_status != 'SHUTOFF':
+                         all_stopped = False
+
+                 except Exception:  # pylint: disable=broad-except
+                     all_stopped = False
+
+             if all_stopped:
+                 return
+
+             time.sleep(poll_interval)
+
+         raise TimeoutError(f'Servers not stopped within {max_wait} seconds')
+
+
+ # =============================================================================
+ # Standalone functions required by the provisioning interface
+ # =============================================================================
+
+
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
+                   config: ProvisionConfig) -> ProvisionRecord:
+     """Run instances for Seeweb cluster."""
+     del cluster_name  # unused
+     provider = SeewebNodeProvider(config, cluster_name_on_cloud)
+     provider.run_instances(config.node_config, config.count)
+
+     # Find the head node using notes convention
+     cluster_nodes = provider.query_cluster_nodes()
+     if not cluster_nodes:
+         raise RuntimeError(
+             f'No nodes found for cluster {cluster_name_on_cloud}')
+     head_node_id = provider.get_head_instance_id()
+     assert head_node_id is not None, 'head_instance_id should not be None'
+
+     return ProvisionRecord(
+         provider_name='Seeweb',
+         region=region,
+         zone=None,  # Seeweb doesn't use zones
+         cluster_name=cluster_name_on_cloud,
+         head_instance_id=head_node_id,
+         resumed_instance_ids=[],  # Empty for now
+         created_instance_ids=[node.name for node in cluster_nodes],
+     )
+
+
+ def stop_instances(
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     worker_only: bool = False,
+ ) -> None:
+     """Stop instances for Seeweb cluster."""
+     del worker_only  # unused - Seeweb doesn't distinguish between head/worker
+     assert provider_config is not None
+
+     # Convert Dict to ProvisionConfig for SeewebNodeProvider
+     config = common.ProvisionConfig(
+         provider_config=provider_config,
+         authentication_config={},
+         docker_config={},
+         node_config=provider_config,
+         count=1,  # Not used for stop operation
+         tags={},
+         resume_stopped_nodes=False,
+         ports_to_open_on_launch=None,
+     )
+     provider = SeewebNodeProvider(config, cluster_name_on_cloud)
+     provider.stop_instances()
+
+
+ def terminate_instances(
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     worker_only: bool = False,
+ ) -> None:
+     """Terminate instances for Seeweb cluster."""
+     del worker_only  # unused - Seeweb doesn't distinguish between head/worker
+     assert provider_config is not None
+     # Convert Dict to ProvisionConfig for SeewebNodeProvider
+     config = common.ProvisionConfig(
+         provider_config=provider_config,
+         authentication_config={},
+         docker_config={},
+         node_config=provider_config,
+         count=1,  # Not used for terminate operation
+         tags={},
+         resume_stopped_nodes=False,
+         ports_to_open_on_launch=None,
+     )
+     provider = SeewebNodeProvider(config, cluster_name_on_cloud)
+     provider.terminate_instances()
+
+
+ def wait_instances(
+     region: str,
+     cluster_name_on_cloud: str,
+     state: Optional[status_lib.ClusterStatus],
+ ) -> None:
+     del region  # unused
+     # Map ClusterStatus to Seeweb string
+     if state == status_lib.ClusterStatus.UP:
+         seeweb_state = 'Booted'
+     elif state == status_lib.ClusterStatus.STOPPED:
+         seeweb_state = 'SHUTOFF'
+     elif state is None:
+         seeweb_state = 'Terminated'  # For termination
+     else:
+         seeweb_state = 'Booted'  # Default fallback
+
+     # Create Seeweb client directly and wait
+     client = _get_seeweb_client()
+     deadline = time.time() + _MAX_BOOT_TIME
+     while time.time() < deadline:
+         cluster_nodes = [
+             s for s in client.fetch_servers()
+             if s.notes and s.notes.startswith(cluster_name_on_cloud)
+         ]
+         if not cluster_nodes:
+             time.sleep(_POLL_INTERVAL)
+             continue
+
+         states = {srv.status for srv in cluster_nodes}
+         if states <= {seeweb_state}:
+             # If all servers are Booted, wait for them to be truly stable
+             if seeweb_state == 'Booted':
+                 if _wait_for_all_servers_stable_standalone(cluster_nodes):
+                     return
+                 else:
+                     time.sleep(_POLL_INTERVAL)
+                     continue
+             return
+         time.sleep(_POLL_INTERVAL)
+
+     raise TimeoutError(
+         f'Nodes are not all in state {seeweb_state} within timeout')
+
+
+ def _wait_for_all_servers_stable_standalone(cluster_nodes,
+                                             max_wait: int = 300) -> bool:
+     """Waits for all cluster servers to be stable (standalone version)."""
+     start_time = time.time()
+     while time.time() - start_time < max_wait:
+         all_stable = True
+
+         for node in cluster_nodes:
+             if node.status == 'Booted':
+                 # Check that server is reachable via ping
+                 if not _ping_server_standalone(node.ipv4):
+                     all_stable = False
+                     break
+
+                 # Do not check SSH here; handled by provisioner.wait_for_ssh().
+
+         if all_stable:
+             # Safety sleep to allow for late reboots
+             time.sleep(1)
+             return True
+
+         time.sleep(1)
+
+     return False
+
+
+ def _ping_server_standalone(server_ip: str) -> bool:
+     """Check that server is reachable via ping (standalone version)."""
+     try:
+         result = subprocess.run(['ping', '-c', '1', '-W', '5', server_ip],
+                                 capture_output=True,
+                                 timeout=10,
+                                 check=False)
+         return result.returncode == 0
+     except Exception as e:  # pylint: disable=broad-except
+         logger.error(f'Error pinging {server_ip}: {e}')
+         return False
+
+
+ def _check_ssh_ready_standalone(server_ip: str) -> bool:
+     """Check that SSH is available on the server (standalone version)."""
+     try:
+         private_key_path, _ = auth_utils.get_or_generate_keys()
+         private_key_path = os.path.expanduser(private_key_path)
+         ssh_user = 'ecuser'
+         result = subprocess.run([
+             'ssh', '-o', 'ConnectTimeout=10', '-o', 'StrictHostKeyChecking=no',
+             '-o', f'UserKnownHostsFile={os.devnull}', '-o',
+             f'GlobalKnownHostsFile={os.devnull}', '-o', 'IdentitiesOnly=yes',
+             '-i', private_key_path, f'{ssh_user}@{server_ip}',
+             'echo "SSH ready"'
+         ],
+                                 capture_output=True,
+                                 timeout=15,
+                                 check=False)
+         return result.returncode == 0
+     except Exception:  # pylint: disable=broad-except
+         return False
+
+
+ def query_instances(
+     cluster_name: str,
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     non_terminated_only: bool = True,
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+     """Query instances status for Seeweb cluster."""
+     del cluster_name  # unused
+     # Use the provided provider_config or default to empty dict
+     if provider_config is None:
+         provider_config = {}
+
+     # Convert Dict to ProvisionConfig for SeewebNodeProvider
+     config = common.ProvisionConfig(
+         provider_config=provider_config,
+         authentication_config={},
+         docker_config={},
+         node_config=provider_config,
+         count=1,  # Not used for query operation
+         tags={},
+         resume_stopped_nodes=False,
+         ports_to_open_on_launch=None,
+     )
+     provider = SeewebNodeProvider(config, cluster_name_on_cloud)
+     seeweb_instances = provider.query_instances()
+
+     # Map Seeweb status to SkyPilot status
+     status_map = {
+         'Booted': status_lib.ClusterStatus.UP,  # Seeweb uses "Booted" for running
+         'RUNNING': status_lib.ClusterStatus.UP,  # All caps version
+         'Booting': status_lib.ClusterStatus.INIT,
+         'PoweringOn': status_lib.ClusterStatus.INIT,
+         'Off': status_lib.ClusterStatus.STOPPED,
+         'Stopped': status_lib.ClusterStatus.STOPPED,
+         'SHUTOFF': status_lib.ClusterStatus.STOPPED,  # Add missing SHUTOFF status
+         'PoweringOff': status_lib.ClusterStatus.STOPPED,  # STOPPED, not INIT
+     }
+
+     result: Dict[str, Tuple[Optional[status_lib.ClusterStatus],
+                             Optional[str]]] = {}
+     for name, seeweb_status in seeweb_instances.items():
+         if non_terminated_only and seeweb_status in ('Terminated', 'Deleted'):
+             continue
+         mapped_status = status_map.get(seeweb_status,
+                                        status_lib.ClusterStatus.INIT)
+         # Return tuple of (status, reason) where reason is None for Seeweb
+         result[name] = (mapped_status, None)
+
+     return result
+
+
+ # Signature should not include provider_name; the router strips it before calling.
+ def get_cluster_info(
+     region: str,
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> 'ClusterInfo':
+     del region  # unused
+     # Use Seeweb client to get cluster instances
+     client = _get_seeweb_client()
+     cluster_nodes = [
+         s for s in client.fetch_servers()
+         if s.notes and s.notes.startswith(cluster_name_on_cloud)
+     ]
+
+     if not cluster_nodes:
+         raise RuntimeError(
+             f'No instances found for cluster {cluster_name_on_cloud}')
+
+     instances = {}
+     # Prefer the node whose notes are "{cluster}-head"; otherwise fall back
+     # to the first node.
+     head_instance = None
+     for node in cluster_nodes:
+         if getattr(node, 'notes', None) == f'{cluster_name_on_cloud}-head':
+             head_instance = node.name
+             break
+     if head_instance is None:
+         head_instance = cluster_nodes[0].name
+
+     for node in cluster_nodes:
+         # Get server IP (Seeweb uses the 'ipv4' attribute)
+         external_ip = node.ipv4
+         internal_ip = external_ip  # For Seeweb, internal IP = external IP
+
+         instances[node.name] = [
+             InstanceInfo(
+                 instance_id=node.name,
+                 internal_ip=internal_ip,
+                 external_ip=external_ip,
+                 ssh_port=22,
+                 tags={},
+             )
+         ]
+
+     return ClusterInfo(
+         instances=instances,
+         head_instance_id=head_instance,
+         provider_name='Seeweb',
+         provider_config=provider_config,
+     )
+
+
+ def open_ports(
+     cluster_name_on_cloud: str,
+     ports: List[str],
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> None:
+     del provider_config  # Unused
+     logger.debug(f'Seeweb: skipping open_ports for {cluster_name_on_cloud}, '
+                  f'ports={ports}; all ports are open by default')
+     return
+
+
+ def cleanup_ports(
+     cluster_name_on_cloud: str,
+     ports: List[str],
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> None:
+     del cluster_name_on_cloud, ports, provider_config  # Unused.
+     return
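
For orientation, a minimal sketch of how the standalone entry points in this new module could be driven directly is shown below. It is illustrative only: it assumes Seeweb credentials are already configured for the adaptor, the plan/image/location values and key name are placeholders taken from the comments in the diff, and in normal operation SkyPilot's backend builds the ProvisionConfig from the rendered cluster template rather than by hand.

# Hypothetical driver for the Seeweb provisioner shown above (illustration only).
from sky.provision import common
from sky.provision.seeweb import instance as seeweb_instance

node_config = {'plan': 'eCS4', 'image': 'ubuntu-2204', 'location': 'it-mi2'}
config = common.ProvisionConfig(
    provider_config=node_config,
    authentication_config={'remote_key_name': 'my-seeweb-key'},  # placeholder
    docker_config={},
    node_config=node_config,
    count=1,
    tags={},
    resume_stopped_nodes=False,
    ports_to_open_on_launch=None,
)

# Launch (or resume) the single-node cluster, then inspect what was created.
record = seeweb_instance.run_instances('it-mi2', 'my-cluster',
                                       'my-cluster-abcd', config)
info = seeweb_instance.get_cluster_info('it-mi2', 'my-cluster-abcd',
                                        provider_config=node_config)
print(record.head_instance_id, list(info.instances))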