skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,707 @@
1
+ """SCP instance provisioning."""
2
+
3
+ from concurrent.futures import as_completed
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from copy import deepcopy
6
+ import hashlib
7
+ import logging
8
+ import random
9
+ import string
10
+ import time
11
+ from typing import Any, Dict, List, Optional, Tuple
12
+
13
+ from sky.clouds.utils import scp_utils
14
+ from sky.provision import common
15
+ from sky.utils import status_lib
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Bring the cluster up to ``config.count`` running SCP instances.

    Stopped instances are resumed and missing ones are created, in parallel.
    The function then waits (up to 600 seconds) for the expected number of
    instances to be RUNNING.

    Args:
        region: Region recorded in the returned provision record.
        cluster_name: Unused.
        cluster_name_on_cloud: Cluster name used to derive instance names.
        config: Provision configuration; ``node_config['zone_id']``,
            ``count`` and ``docker_config`` are read here.

    Returns:
        A ProvisionRecord listing the head instance and the ids of the
        instances resumed/created by this call.

    Raises:
        RuntimeError: If more instances are running than requested, the
            legacy naming scheme is asked to scale beyond one node, the
            expected number of running instances is not reached, or no head
            instance can be determined.
    """
    del cluster_name  # unused
    zone_id = config.node_config['zone_id']

    running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])

    to_start_count = config.count - len(running_instances)

    if to_start_count < 0:
        raise RuntimeError(
            f'Cluster {cluster_name_on_cloud} already has '
            f'{len(running_instances)} instances, but {config.count} '
            'are required')

    if to_start_count == 0:
        head_instance_id = _get_head_instance_id(running_instances)
        if head_instance_id is None:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has no head instance')
        logger.info(
            f'Cluster {cluster_name_on_cloud} already has '
            f'{len(running_instances)} instances, no need to start more')
        return common.ProvisionRecord(provider_name='scp',
                                      cluster_name=cluster_name_on_cloud,
                                      region=region,
                                      zone=None,
                                      head_instance_id=head_instance_id,
                                      resumed_instance_ids=[],
                                      created_instance_ids=[])

    existing_instances = _filter_instances(cluster_name_on_cloud, None)
    stopped_instances = _filter_instances(cluster_name_on_cloud,
                                          ['STOPPED', 'STOPPING'])

    def _detect_naming_version(existing_instances,
                               cluster_name_on_cloud) -> str:
        """Return 'v2' or 'v1' depending on the existing instance names."""
        v2_head = _head(cluster_name_on_cloud)
        v2_worker_prefix = _worker(cluster_name_on_cloud)
        has_v2 = any(instance['virtualServerName'] == v2_head or
                     instance['virtualServerName'].startswith(v2_worker_prefix)
                     for instance in existing_instances)
        if has_v2:
            return 'v2'
        has_v1 = any(instance['virtualServerName'] == cluster_name_on_cloud
                     for instance in existing_instances)
        if has_v1:
            return 'v1'

        if not existing_instances:
            logger.debug(
                'detect_naming_version: no instances for cluster %s; '
                'defaulting to v2.', cluster_name_on_cloud)
        else:
            logger.error(
                'detect_naming_version: unexpected instance names for cluster '
                '%s: %s; defaulting to v2.', cluster_name_on_cloud, [
                    instance['virtualServerName']
                    for instance in existing_instances
                ])
        return 'v2'

    naming_version = _detect_naming_version(existing_instances,
                                            cluster_name_on_cloud)

    if naming_version == 'v2':
        # One head plus (count - 1) workers numbered from 01.
        cluster_instance_names = [_head(cluster_name_on_cloud)] + [
            f'{_worker(cluster_name_on_cloud)}-{i:02d}'
            for i in range(1, config.count)
        ]
    else:
        if config.count > 1:
            raise RuntimeError(
                'This cluster uses the legacy naming scheme and cannot be '
                'scaled to multi-node automatically. '
                'Please `sky down` and relaunch.')
        cluster_instance_names = [cluster_name_on_cloud]

    existing_instance_names = [
        instance['virtualServerName'] for instance in existing_instances
    ]
    resume_instance_names = [
        instance['virtualServerName'] for instance in stopped_instances
    ]
    create_instance_names = [
        instance_name for instance_name in cluster_instance_names
        if instance_name not in existing_instance_names
    ]

    vpc_subnets = _get_or_create_vpc_subnets(zone_id)

    def _resume(instance_name):
        """Start a stopped (or stopping) instance and wait until RUNNING."""
        instance_id = _get_instance_id(instance_name, cluster_name_on_cloud)
        if instance_id is None:
            # The instance disappeared between listing and resuming; fail
            # this task explicitly instead of querying the API with None.
            raise RuntimeError(
                f'instance to resume not found: {instance_name}')
        # Wait for a STOPPING instance to settle before starting it.
        while True:
            state = scp_utils.SCPClient().get_instance_info(
                instance_id)['virtualServerState']
            if state == 'RUNNING':
                return instance_id, 'resumed'
            if state == 'STOPPED':
                break
            time.sleep(2)

        scp_utils.SCPClient().start_instance(instance_id)
        while True:
            info = scp_utils.SCPClient().get_instance_info(instance_id)
            if info['virtualServerState'] == 'RUNNING':
                return instance_id, 'resumed'
            time.sleep(2)

    def _create(instance_name):
        """Create one instance, trying each VPC and subnet in turn."""
        instance_config = deepcopy(config.docker_config)
        instance_config['virtualServerName'] = instance_name
        cnt = config.count

        for vpc, subnets in vpc_subnets.items():
            sg_id = _create_security_group(zone_id, vpc, cnt)
            if not sg_id:
                continue

            created_in_this_vpc = False
            try:
                instance_config['securityGroupIds'] = [sg_id]
                for subnet in subnets:
                    instance_config['nic']['subnetId'] = subnet
                    instance_id = _create_instance(vpc, instance_config, cnt)
                    if instance_id:
                        created_in_this_vpc = True
                        return instance_id, 'created'
            except Exception as e:  # pylint: disable=broad-except
                logger.error(f'run_instances error ({instance_name}): {e}')
            finally:
                # Best-effort removal of the security group when nothing was
                # created in this VPC.
                if not created_in_this_vpc:
                    try:
                        _delete_security_group(sg_id)
                    except Exception:  # pylint: disable=broad-except
                        pass

        raise RuntimeError(f'instance creation error: {instance_name}')

    tasks = (
        [(_resume, instance_name) for instance_name in resume_instance_names] +
        [(_create, instance_name) for instance_name in create_instance_names])

    instance_ids_statuses = []
    if tasks:
        with ThreadPoolExecutor(max_workers=min(len(tasks), 32)) as ex:
            futures = [
                ex.submit(function, instance_name)
                for function, instance_name in tasks
            ]
            for future in as_completed(futures):
                try:
                    instance_ids_statuses.append(future.result())
                except Exception as e:  # pylint: disable=broad-except
                    # Individual failures are logged; the running-count
                    # check below decides whether provisioning succeeded.
                    logger.error(f'run_instances error: {e}')

    wait_time = time.time() + 600
    while time.time() < wait_time:
        running_instances = _filter_instances(cluster_name_on_cloud,
                                              ['RUNNING'])
        if len(running_instances) == config.count:
            break
        pending_instances = _filter_instances(
            cluster_name_on_cloud,
            ['CREATING', 'EDITING', 'STARTING', 'RESTARTING', 'STOPPING'])
        if not pending_instances:
            break
        time.sleep(3)

    running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
    if len(running_instances) != config.count:
        raise RuntimeError(f'Expected {config.count} running instances, '
                           f'but got {len(running_instances)} instances')

    head_instance_id = _get_head_instance_id(running_instances)
    if head_instance_id is None:
        raise RuntimeError('Head instance is not running')

    resumed_instance_ids = []
    created_instance_ids = []
    for instance_id, status in instance_ids_statuses:
        if status == 'resumed':
            resumed_instance_ids.append(instance_id)
        elif status == 'created':
            created_instance_ids.append(instance_id)

    return common.ProvisionRecord(provider_name='scp',
                                  cluster_name=cluster_name_on_cloud,
                                  region=region,
                                  zone=None,
                                  head_instance_id=head_instance_id,
                                  resumed_instance_ids=resumed_instance_ids,
                                  created_instance_ids=created_instance_ids)
213
+
214
+
215
def _head(cluster_name_on_cloud: str):
    """Return the v2 head instance name for the given cluster name.

    The name is the first 8 characters of the cluster name, the short hash
    from ``_suffix`` and the literal ``head``, joined by dashes.
    """
    prefix = cluster_name_on_cloud[:8]
    digest = _suffix(cluster_name_on_cloud)
    return f'{prefix}-{digest}-head'
218
+
219
+
220
def _worker(cluster_name_on_cloud: str):
    """Return the v2 worker name prefix for the given cluster name.

    The prefix is the first 8 characters of the cluster name, the short hash
    from ``_suffix`` and the literal ``worker``, joined by dashes.
    """
    prefix = cluster_name_on_cloud[:8]
    digest = _suffix(cluster_name_on_cloud)
    return f'{prefix}-{digest}-worker'
223
+
224
+
225
+ def _suffix(name: str, n: int = 5):
226
+ return hashlib.sha1(name.encode()).hexdigest()[:n]
227
+
228
+
229
def _get_instance_id(instance_name, cluster_name_on_cloud):
    """Return the id of the cluster instance named ``instance_name``.

    Returns None when no instance of the cluster has that name.
    """
    matching_ids = (
        candidate['virtualServerId']
        for candidate in _filter_instances(cluster_name_on_cloud, None)
        if instance_name == candidate['virtualServerName'])
    return next(matching_ids, None)
235
+
236
+
237
def _get_or_create_vpc_subnets(zone_id):
    """Return the usable VPC -> subnets mapping, creating a VPC if none exist.

    When ``_get_vcp_subnets`` reports nothing for the zone, a VPC, a subnet
    and an internet gateway are created in that order, each one polled until
    it reaches its ready state. Any failure is logged and the whole sequence
    is retried after a pause.
    """

    def _await_state(fetch, resource_id, state_key, expected):
        # Poll every 5 seconds until the resource reports the expected state.
        while fetch(resource_id)[state_key] != expected:
            time.sleep(5)

    while True:
        if len(_get_vcp_subnets(zone_id)) != 0:
            break
        try:
            vpc_response = scp_utils.SCPClient().create_vpc(zone_id)
            time.sleep(5)
            vpc_id = vpc_response['resourceId']
            _await_state(lambda rid: scp_utils.SCPClient().get_vpc_info(rid),
                         vpc_id, 'vpcState', 'ACTIVE')

            subnet_response = scp_utils.SCPClient().create_subnet(
                vpc_id, zone_id)
            time.sleep(5)
            subnet_id = subnet_response['resourceId']
            _await_state(
                lambda rid: scp_utils.SCPClient().get_subnet_info(rid),
                subnet_id, 'subnetState', 'ACTIVE')

            igw_response = scp_utils.SCPClient().create_internet_gateway(
                vpc_id)
            time.sleep(5)
            internet_gateway_id = igw_response['resourceId']
            _await_state(
                lambda rid: scp_utils.SCPClient().get_internet_gateway_info(
                    rid), internet_gateway_id, 'internetGatewayState',
                'ATTACHED')

            # Re-check the VPC after the gateway has been attached.
            _await_state(lambda rid: scp_utils.SCPClient().get_vpc_info(rid),
                         vpc_id, 'vpcState', 'ACTIVE')
            break
        except Exception as e:  # pylint: disable=broad-except
            time.sleep(10)
            logger.error(f'vpc creation error: {e}')

    return _get_vcp_subnets(zone_id)
286
+
287
+
288
def _get_vcp_subnets(zone_id):
    """Map each usable VPC id in the zone to its list of ACTIVE subnet ids.

    A VPC is included only if it is ACTIVE, has an ATTACHED internet gateway
    and owns at least one ACTIVE subnet.
    """
    active_vpcs = [
        entry['vpcId']
        for entry in scp_utils.SCPClient().get_vpcs(zone_id)
        if entry['vpcState'] == 'ACTIVE'
    ]

    gateway_vpcs = [
        entry['vpcId']
        for entry in scp_utils.SCPClient().get_internet_gateway()
        if entry['internetGatewayState'] == 'ATTACHED'
    ]

    candidate_vpcs = [vpc for vpc in active_vpcs if vpc in gateway_vpcs]

    all_subnets = scp_utils.SCPClient().get_subnets()

    result = {}
    for vpc in candidate_vpcs:
        active_subnets = [
            entry['subnetId']
            for entry in all_subnets
            if entry['subnetState'] == 'ACTIVE' and entry['vpcId'] == vpc
        ]
        if not active_subnets:
            continue
        result[vpc] = active_subnets
    return result
316
+
317
+
318
def _filter_instances(cluster_name_on_cloud,
                      status_filter: Optional[List[str]]):
    """List the instances that belong to the cluster, optionally by state.

    An instance belongs to the cluster when its name is the v2 head name,
    starts with the v2 worker prefix, or equals the cluster name itself
    (legacy v1 head). When ``status_filter`` is None, all of them are
    returned; otherwise only those whose state is in ``status_filter``.
    """
    all_instances = scp_utils.SCPClient().get_instances()
    head_name_v2 = _head(cluster_name_on_cloud)
    worker_prefix_v2 = _worker(cluster_name_on_cloud)
    head_name_v1 = cluster_name_on_cloud

    def _belongs(name):
        return (name == head_name_v2 or name.startswith(worker_prefix_v2) or
                name == head_name_v1)

    members = [
        entry for entry in all_instances
        if _belongs(entry['virtualServerName'])
    ]

    if status_filter is not None:
        members = [
            entry for entry in members
            if entry['virtualServerState'] in status_filter
        ]
    return members
338
+
339
+
340
+ def _get_head_instance_id(instances):
341
+ if len(instances) > 0:
342
+ for instance in instances:
343
+ if instance['virtualServerName'].endswith('-head'):
344
+ return instance['virtualServerId']
345
+ return instances[0]['virtualServerId']
346
+ return None
347
+
348
+
349
def _create_security_group(zone_id, vpc, cnt):
    """Create a randomly named security group with IN and OUT rules.

    Waits until the group is ACTIVE before adding the rules. Returns the
    security group id, or None if any step raised; in that case the
    registered undo actions run and the error is logged.
    """
    random_tail = ''.join(random.choices(string.ascii_lowercase, k=8))
    sg_name = 'sky' + random_tail
    rollback = []
    try:
        created = scp_utils.SCPClient().create_security_group(
            zone_id, vpc, sg_name)
        sg_id = created['resourceId']
        rollback.append(lambda: _delete_security_group(sg_id))

        # Poll every 5 seconds until the new group is reported ACTIVE.
        while True:
            listing = scp_utils.SCPClient().get_security_groups(vpc, sg_name)
            states = [
                entry['securityGroupState']
                for entry in listing
                if entry['securityGroupId'] == sg_id
            ]
            if states and states[0] == 'ACTIVE':
                break
            time.sleep(5)

        for direction in ('IN', 'OUT'):
            scp_utils.SCPClient().add_security_group_rule(
                sg_id, direction, None, cnt)

        return sg_id
    except Exception as e:  # pylint: disable=broad-except
        _undo_functions(rollback)
        logger.error(f'security group creation error: {e}')
        return None
377
+
378
+
379
def _delete_security_group(sg_id):
    """Delete a security group and wait until it is no longer listed."""
    scp_utils.SCPClient().delete_security_group(sg_id)
    still_listed = True
    while still_listed:
        # Check every 5 seconds whether the group is still present.
        time.sleep(5)
        listing = scp_utils.SCPClient().get_security_groups()
        matching_states = [
            entry['securityGroupState']
            for entry in listing
            if entry['securityGroupId'] == sg_id
        ]
        still_listed = bool(matching_states)
391
+
392
+
393
+ def _undo_functions(undo_func_list):
394
+ while undo_func_list:
395
+ func = undo_func_list.pop()
396
+ func()
397
+
398
+
399
def _create_instance(vpc_id, instance_config, cnt):
    """Create an instance, wait for RUNNING, then add its firewall rules.

    Returns the instance id on success. If any step raises, the undo actions
    registered so far are run, the error is logged and None is returned.
    """
    rollback = []
    try:
        created = scp_utils.SCPClient().create_instance(instance_config)
        instance_id = created['resourceId']

        # Poll every 10 seconds until the instance is RUNNING.
        instance_info = None
        while (instance_info is None or
               instance_info['virtualServerState'] != 'RUNNING'):
            time.sleep(10)
            instance_info = scp_utils.SCPClient().get_instance_info(
                instance_id)

        # The instance delete is registered only once it is RUNNING.
        rollback.append(lambda: _delete_instance(instance_id))
        firewall_id = _get_firewall_id(vpc_id)
        internal_ip = instance_info['ip']

        in_rule_id = _add_firewall_rule(firewall_id, internal_ip, 'IN', None,
                                        cnt)
        rollback.append(lambda: _delete_firewall_rule(firewall_id, in_rule_id))

        out_rule_id = _add_firewall_rule(firewall_id, internal_ip, 'OUT',
                                         None, cnt)
        rollback.append(
            lambda: _delete_firewall_rule(firewall_id, out_rule_id))
        return instance_id

    except Exception as e:  # pylint: disable=broad-except
        _undo_functions(rollback)
        logger.error(f'instance creation error: {e}')
        return None
426
+
427
+
428
def _delete_instance(instance_id):
    """Terminate an instance and wait until it is no longer listed."""
    scp_utils.SCPClient().terminate_instance(instance_id)
    still_listed = True
    while still_listed:
        # Re-list the instances every 10 seconds.
        time.sleep(10)
        listed_ids = [
            entry['virtualServerId']
            for entry in scp_utils.SCPClient().get_instances()
        ]
        still_listed = instance_id in listed_ids
440
+
441
+
442
def _get_firewall_id(vpc_id):
    """Return the id of the first ACTIVE or DEPLOYING firewall of the VPC.

    Raises IndexError when the VPC has no such firewall.
    """
    usable_states = ['ACTIVE', 'DEPLOYING']
    matching_ids = []
    for entry in scp_utils.SCPClient().get_firewalls():
        if entry['vpcId'] != vpc_id:
            continue
        if entry['firewallState'] in usable_states:
            matching_ids.append(entry['firewallId'])
    return matching_ids[0]
451
+
452
+
453
def _add_firewall_rule(firewall_id, internal_ip, direction,
                       ports: Optional[List[str]], cnt: Optional[int]):
    """Add a firewall rule and wait until it becomes ACTIVE.

    Args:
        firewall_id: Firewall to add the rule to.
        internal_ip: Instance IP the rule applies to.
        direction: Rule direction passed to the client ('IN' or 'OUT' at the
            call sites in this file).
        ports: Ports for the rule, or None.
        cnt: Node count forwarded to the client, or None.

    Returns:
        The id of the ACTIVE rule, or None when the client returns no rule
        information.

    Raises:
        RuntimeError: If the operation keeps failing for 300 attempts.
    """
    attempts = 0
    max_attempts = 300
    # Pause between state polls so the API is not queried in a tight loop
    # while the rule is still being deployed.
    poll_interval_seconds = 2
    while attempts < max_attempts:
        try:
            rule_info = scp_utils.SCPClient().add_firewall_rule(
                firewall_id, internal_ip, direction, ports, cnt)
            if rule_info is None:
                return None
            rule_id = rule_info['resourceId']
            while True:
                rule_info = scp_utils.SCPClient().get_firewall_rule_info(
                    firewall_id, rule_id)
                if rule_info['ruleState'] == 'ACTIVE':
                    return rule_id
                time.sleep(poll_interval_seconds)
        except Exception as e:  # pylint: disable=broad-except
            attempts += 1
            time.sleep(10)
            logger.error(f'add firewall rule error: {e}')
            continue
    raise RuntimeError('add firewall rule error')
476
+
477
+
478
+ def _delete_firewall_rule(firewall_id, rule_ids):
479
+ if not isinstance(rule_ids, list):
480
+ rule_ids = [rule_ids]
481
+ attempts = 0
482
+ max_attempts = 300
483
+ while attempts < max_attempts:
484
+ try:
485
+ scp_utils.SCPClient().delete_firewall_rule(firewall_id, rule_ids)
486
+ if not _remaining_firewall_rule(firewall_id, rule_ids):
487
+ return
488
+ except Exception as e: # pylint: disable=broad-except
489
+ attempts += 1
490
+ time.sleep(5)
491
+ logger.error(f'delete firewall rule error: {e}')
492
+ continue
493
+ raise RuntimeError('delete firewall rule error')
494
+
495
+
496
+ def _remaining_firewall_rule(firewall_id, rule_ids):
497
+ firewall_rules = scp_utils.SCPClient().get_firewall_rules(firewall_id)
498
+ for rule_id in rule_ids:
499
+ if rule_id in firewall_rules:
500
+ return True
501
+ return False
502
+
503
+
504
def _get_firewall_rule_ids(instance_info, firewall_id,
                           ports: Optional[List[str]]):
    """Collect the ids of firewall rules that refer to the instance IP.

    With ``ports`` given, a rule matches when its first destination is the
    instance IP, its first source is ``0.0.0.0/0`` and its comma-joined TCP
    services equal the comma-joined ``ports``. With ``ports`` None, a rule
    matches when the instance IP and ``0.0.0.0/0`` are its first
    destination/source in either direction.
    """
    any_address = '0.0.0.0/0'
    instance_ip = instance_info['ip']
    rules = scp_utils.SCPClient().get_firewall_rules(firewall_id)
    matched = []

    if ports is not None:
        for rule in rules:
            rule_ports = ','.join(rule['tcpServices'])
            wanted_ports = ','.join(ports)
            if (instance_ip == rule['destinationIpAddresses'][0] and
                    any_address == rule['sourceIpAddresses'][0] and
                    wanted_ports == rule_ports):
                matched.append(rule['ruleId'])
        return matched

    for rule in rules:
        # Inbound: anywhere -> instance.
        if (instance_ip == rule['destinationIpAddresses'][0] and
                any_address == rule['sourceIpAddresses'][0]):
            matched.append(rule['ruleId'])
        # Outbound: instance -> anywhere.
        if (instance_ip == rule['sourceIpAddresses'][0] and
                any_address == rule['destinationIpAddresses'][0]):
            matched.append(rule['ruleId'])
    return matched
528
+
529
+
530
def stop_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """Stop the RUNNING instances of the cluster in parallel.

    Args:
        cluster_name_on_cloud: Cluster name used to find the instances.
        provider_config: Unused.
        worker_only: When True, the head instance is left running.

    Each instance is stopped and polled until it reports STOPPED. Errors
    for individual instances are logged, not raised.
    """
    del provider_config
    instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])

    if worker_only:
        # The head is named either with the v2 scheme or, for legacy (v1)
        # clusters, exactly like the cluster; exclude both so a legacy head
        # is never treated as a worker.
        head_instance_names = {
            _head(cluster_name_on_cloud), cluster_name_on_cloud
        }
        instances = [
            instance for instance in instances
            if instance['virtualServerName'] not in head_instance_names
        ]

    if not instances:
        return

    def _stop(instance):
        """Stop one instance and wait until it is STOPPED."""
        try:
            instance_id = instance['virtualServerId']
            scp_utils.SCPClient().stop_instance(instance_id)
            while True:
                info = scp_utils.SCPClient().get_instance_info(instance_id)
                if info['virtualServerState'] == 'STOPPED':
                    return instance_id
                time.sleep(2)
        except Exception as e:  # pylint: disable=broad-except
            logger.error(f'stop_instances error: {e}')

    with ThreadPoolExecutor(max_workers=min(len(instances), 32)) as ex:
        futures = [ex.submit(_stop, instance) for instance in instances]
        for future in as_completed(futures):
            future.result()
564
+
565
+
566
def terminate_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """Terminate the RUNNING and STOPPED instances of the cluster in parallel.

    Args:
        cluster_name_on_cloud: Cluster name used to find the instances.
        provider_config: Unused.
        worker_only: When True, the head instance is kept.

    For each instance, its firewall rules are deleted first, then the
    instance, then its first security group. Errors for individual
    instances are logged, not raised.
    """
    del provider_config
    instances = _filter_instances(cluster_name_on_cloud, ['RUNNING', 'STOPPED'])

    if worker_only:
        # The head is named either with the v2 scheme or, for legacy (v1)
        # clusters, exactly like the cluster; exclude both so a legacy head
        # is never treated as a worker.
        head_instance_names = {
            _head(cluster_name_on_cloud), cluster_name_on_cloud
        }
        instances = [
            instance for instance in instances
            if instance['virtualServerName'] not in head_instance_names
        ]

    if not instances:
        return

    def _terminate(instance):
        """Remove one instance together with its firewall rules and group."""
        try:
            instance_id = instance['virtualServerId']
            instance_info = scp_utils.SCPClient().get_instance_info(instance_id)
            vpc_id = instance_info['vpcId']
            sg_id = instance_info['securityGroupIds'][0]['securityGroupId']
            firewall_id = _get_firewall_id(vpc_id)
            rule_ids = _get_firewall_rule_ids(instance_info, firewall_id, None)
            _delete_firewall_rule(firewall_id, rule_ids)
            _delete_instance(instance_id)
            _delete_security_group(sg_id)
        except Exception as e:  # pylint: disable=broad-except
            logger.error(f'terminate_instances error: {e}')

    with ThreadPoolExecutor(max_workers=min(len(instances), 32)) as ex:
        futures = [ex.submit(_terminate, instance) for instance in instances]
        for future in as_completed(futures):
            future.result()
602
+
603
+
604
def query_instances(
    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
    retry_if_missing: bool = False,
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
    """Map each instance id of the cluster to ``(cluster status, None)``.

    SCP instance states are translated to cluster statuses; TERMINATING and
    TERMINATED translate to None and are omitted when
    ``non_terminated_only`` is True. A state missing from the translation
    table raises KeyError.
    """
    del cluster_name, retry_if_missing  # unused
    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
    cluster_instances = _filter_instances(cluster_name_on_cloud, None)

    init = status_lib.ClusterStatus.INIT
    stopped = status_lib.ClusterStatus.STOPPED
    state_to_status = {
        'CREATING': init,
        'EDITING': init,
        'RUNNING': status_lib.ClusterStatus.UP,
        'STARTING': init,
        'RESTARTING': init,
        'STOPPING': stopped,
        'STOPPED': stopped,
        'TERMINATING': None,
        'TERMINATED': None,
    }

    result: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
                            Optional[str]]] = {}
    for entry in cluster_instances:
        cluster_status = state_to_status[entry['virtualServerState']]
        keep = cluster_status is not None or not non_terminated_only
        if keep:
            result[entry['virtualServerId']] = (cluster_status, None)
    return result
635
+
636
+
637
def wait_instances(region: str, cluster_name_on_cloud: str, state: str) -> None:
    """No-op: every argument is discarded and nothing is waited for."""
    del region
    del cluster_name_on_cloud
    del state
639
+
640
+
641
def get_cluster_info(
        region: str,
        cluster_name_on_cloud: str,
        provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
    """Describe the RUNNING instances of the cluster.

    Each running instance is reported with its internal IP and the external
    IP looked up through the SCP client, together with fixed Ray port
    options.
    """
    del region
    running = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
    head_instance_id = _get_head_instance_id(running)

    instance_map = {}
    for entry in running:
        server_id = entry['virtualServerId']
        internal_ip = entry['ip']
        external_ip = scp_utils.SCPClient().get_external_ip(
            server_id, internal_ip)
        instance_map[server_id] = [
            common.InstanceInfo(instance_id=server_id,
                                internal_ip=internal_ip,
                                external_ip=external_ip,
                                tags={})
        ]

    # For performance, max-worker-port - min-worker-port should be at least
    # 3 * nproc RAY_worker_maximum_startup_concurrency.
    ray_options = {
        'node-manager-port': 11001,
        'min-worker-port': 11002,
        'max-worker-port': 11200,
        'ray-client-server-port': 10001
    }

    return common.ClusterInfo(
        instances=instance_map,
        head_instance_id=head_instance_id,
        custom_ray_options=ray_options,
        provider_name='scp',
        provider_config=provider_config,
    )
676
+
677
+
678
def open_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """Open inbound ``ports`` for the head instance of the cluster.

    Adds an IN rule to the head instance's first security group and an IN
    rule for its IP on the firewall of its VPC.
    """
    del provider_config
    running = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
    head_id = _get_head_instance_id(running)
    head_info = scp_utils.SCPClient().get_instance_info(head_id)

    # Security group rule first.
    security_group_id = head_info['securityGroupIds'][0]['securityGroupId']
    scp_utils.SCPClient().add_security_group_rule(security_group_id, 'IN',
                                                  ports, None)

    # Then the matching firewall rule on the VPC firewall.
    head_vpc_id = head_info['vpcId']
    head_ip = head_info['ip']
    firewall_id = _get_firewall_id(head_vpc_id)
    _add_firewall_rule(firewall_id, head_ip, 'IN', ports, None)
693
+
694
+
695
def cleanup_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """Delete the firewall rules that opened ``ports`` for the head instance.

    Only firewall rules are removed here; no security group rule is touched.
    """
    del provider_config
    running = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
    head_id = _get_head_instance_id(running)
    head_info = scp_utils.SCPClient().get_instance_info(head_id)
    firewall_id = _get_firewall_id(head_info['vpcId'])
    matching_rule_ids = _get_firewall_rule_ids(head_info, firewall_id, ports)
    _delete_firewall_rule(firewall_id, matching_rule_ids)