skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. More details are linked from the original page.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -19,7 +19,7 @@ docker:
19
19
  username: |-
20
20
  {{docker_login_config.username}}
21
21
  password: |-
22
- {{docker_login_config.password}}
22
+ {{docker_login_config.password | indent(6) }}
23
23
  server: |-
24
24
  {{docker_login_config.server}}
25
25
  {%- endif %}
@@ -9,6 +9,8 @@ provider:
9
9
  type: external
10
10
  module: sky.provision.nebius
11
11
  region: "{{region}}"
12
+ use_internal_ips: {{use_internal_ips}}
13
+ use_static_ip_address: {{ use_static_ip_address }}
12
14
 
13
15
  {%- if docker_image is not none %}
14
16
  docker:
@@ -24,7 +26,7 @@ docker:
24
26
  username: |-
25
27
  {{docker_login_config.username}}
26
28
  password: |-
27
- {{docker_login_config.password}}
29
+ {{docker_login_config.password | indent(6) }}
28
30
  server: |-
29
31
  {{docker_login_config.server}}
30
32
  {%- endif %}
@@ -34,6 +36,9 @@ docker:
34
36
  auth:
35
37
  ssh_user: ubuntu
36
38
  ssh_private_key: {{ssh_private_key}}
39
+ {% if ssh_proxy_command is not none %}
40
+ ssh_proxy_command: {{ssh_proxy_command}}
41
+ {% endif %}
37
42
 
38
43
  available_node_types:
39
44
  ray_head_default:
@@ -42,18 +47,21 @@ available_node_types:
42
47
  InstanceType: {{instance_type}}
43
48
  ImageId: {{image_id}}
44
49
  DiskSize: {{disk_size}}
50
+ use_spot: {{ use_spot }}
51
+ network_tier: {{network_tier}}
52
+ filesystems:
53
+ {%- for fs in filesystems %}
54
+ - filesystem_id: {{ fs.filesystem_id }}
55
+ filesystem_mount_tag: {{ fs.filesystem_mount_tag }}
56
+ filesystem_attach_mode: {{ fs.filesystem_attach_mode }}
57
+ filesystem_mount_path: {{ fs.filesystem_mount_path }}
58
+ {%- endfor %}
45
59
  UserData: |
46
- {%- if docker_image is not none %}
47
- runcmd:
48
- - sudo sed -i 's/^#\?AllowTcpForwarding.*/AllowTcpForwarding yes/' /etc/ssh/sshd_config
49
- - systemctl restart sshd
50
- {%- endif %}
51
-
52
60
  {# Two available OS images:
53
- 1. ubuntu22.04-driverless - requires Docker installation
54
- 2. ubuntu22.04-cuda12 - comes with Docker pre-installed
55
- To optimize deployment speed, Docker is only installed when using ubuntu22.04-driverless #}
56
- {%- if docker_image is not none and image_id == 'ubuntu22.04-driverless' %}
61
+ 1. ubuntu24.04-driverless - requires Docker installation
62
+ 2. ubuntu24.04-cuda12 - comes with Docker pre-installed
63
+ To optimize deployment speed, Docker is only installed when using ubuntu24.04-driverless #}
64
+ {%- if docker_image is not none and image_id.endswith('-driverless') %}
57
65
  apt:
58
66
  sources:
59
67
  docker.list:
@@ -101,6 +109,7 @@ file_mounts: {
101
109
  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
102
110
  {%- for remote_path, local_path in credentials.items() %}
103
111
  "{{remote_path}}": "{{local_path}}",
112
+ "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
104
113
  {%- endfor %}
105
114
  }
106
115
 
@@ -116,6 +125,7 @@ initialization_commands: []
116
125
  # Increment the following for catching performance bugs easier:
117
126
  # current num items (num SSH connections): 1
118
127
  setup_commands:
128
+ # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
119
129
  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
120
130
  # Create ~/.ssh/config file in case the file does not exist in the image.
121
131
  # Line 'rm ..': there is another installation of pip.
@@ -126,6 +136,11 @@ setup_commands:
126
136
  - {%- for initial_setup_command in initial_setup_commands %}
127
137
  {{ initial_setup_command }}
128
138
  {%- endfor %}
139
+ {%- for fs in filesystems %}
140
+ sudo mkdir {{ fs.filesystem_mount_path }};
141
+ sudo mount -t virtiofs {{ fs.filesystem_mount_tag }} {{ fs.filesystem_mount_path }};
142
+ sudo chmod a+w {{ fs.filesystem_mount_path }};
143
+ {%- endfor %}
129
144
  sudo systemctl stop unattended-upgrades || true;
130
145
  sudo systemctl disable unattended-upgrades || true;
131
146
  sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
@@ -136,8 +151,14 @@ setup_commands:
136
151
  mkdir -p ~/.ssh; touch ~/.ssh/config;
137
152
  {{ conda_installation_commands }}
138
153
  {{ ray_skypilot_installation_commands }}
154
+ {%- if env_vars is defined %}
155
+ {%- for env_var, env_value in env_vars.items() %}
156
+ echo '{{env_var}}={{env_value}}' | sudo tee -a /etc/environment;
157
+ {%- endfor %}
158
+ {%- endif %}
159
+ IP=$(hostname -I | awk '{print $1}'); echo "$IP $(hostname)" | sudo tee -a /etc/hosts;
139
160
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
140
161
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
141
- mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
162
+ mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
142
163
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
143
164
  {{ ssh_max_sessions_config }}
@@ -19,7 +19,7 @@ docker:
19
19
  username: |-
20
20
  {{docker_login_config.username}}
21
21
  password: |-
22
- {{docker_login_config.password}}
22
+ {{docker_login_config.password | indent(6) }}
23
23
  server: |-
24
24
  {{docker_login_config.server}}
25
25
  {%- endif %}
@@ -0,0 +1,71 @@
1
+ cluster_name: {{cluster_name_on_cloud}}
2
+
3
+ # The maximum number of worker nodes to launch in addition to the head node.
4
+ max_workers: {{num_nodes - 1}}
5
+ upscaling_speed: {{num_nodes - 1}}
6
+ idle_timeout_minutes: 60
7
+
8
+ provider:
9
+ type: external
10
+ module: sky.provision.primeintellect
11
+ region: "{{region}}"
12
+ zones: "{{zones}}"
13
+
14
+ auth:
15
+ ssh_user: skypilot:ssh_user
16
+ ssh_private_key: {{ssh_private_key}}
17
+
18
+ available_node_types:
19
+ ray_head_default:
20
+ resources: {}
21
+ node_config:
22
+ InstanceType: {{instance_type}}
23
+ DiskSize: {{disk_size}}
24
+ ImageId: {{image_id}}
25
+ PublicKey: |-
26
+ skypilot:ssh_public_key_content
27
+
28
+ head_node_type: ray_head_default
29
+
30
+ # Format: `REMOTE_PATH : LOCAL_PATH`
31
+ file_mounts: {
32
+ "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
33
+ "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
34
+ {%- for remote_path, local_path in credentials.items() %}
35
+ "{{remote_path}}": "{{local_path}}",
36
+ "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
37
+ {%- endfor %}
38
+ }
39
+
40
+ rsync_exclude: []
41
+
42
+ initialization_commands: []
43
+
44
+ # List of shell commands to run to set up nodes.
45
+ # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
46
+ # connection, which is expensive. Try your best to co-locate commands into fewer
47
+ # items!
48
+ #
49
+ # Increment the following for catching performance bugs easier:
50
+ # current num items (num SSH connections): 1
51
+ setup_commands:
52
+ # Disable unattended-upgrades and handle apt-get locks
53
+ # Install patch utility for Ray
54
+ # Install conda and Ray
55
+ # Set system limits for Ray performance (nofile and TasksMax)
56
+ - {%- for initial_setup_command in initial_setup_commands %}
57
+ {{ initial_setup_command }}
58
+ {%- endfor %}
59
+ sudo systemctl stop unattended-upgrades || true;
60
+ sudo systemctl disable unattended-upgrades || true;
61
+ sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
62
+ sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
63
+ sudo pkill -9 apt-get;
64
+ sudo pkill -9 dpkg;
65
+ sudo dpkg --configure -a;
66
+ which patch > /dev/null || sudo apt install -y patch;
67
+ {{ conda_installation_commands }}
68
+ {{ ray_skypilot_installation_commands }}
69
+ sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
70
+ sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
71
+ {{ ssh_max_sessions_config }}
@@ -20,7 +20,7 @@ provider:
20
20
  username: |-
21
21
  {{docker_login_config.username}}
22
22
  password: |-
23
- {{docker_login_config.password}}
23
+ {{docker_login_config.password | indent(6) }}
24
24
  server: |-
25
25
  {{docker_login_config.server}}
26
26
  {%- endif %}
@@ -40,6 +40,14 @@ available_node_types:
40
40
  skypilot:ssh_public_key_content
41
41
  Preemptible: {{use_spot}}
42
42
  BidPerGPU: {{bid_per_gpu}}
43
+ {%- if volume_mounts and volume_mounts|length > 0 %}
44
+ VolumeMounts:
45
+ {%- for vm in volume_mounts %}
46
+ - VolumeNameOnCloud: {{ vm.volume_name_on_cloud }}
47
+ VolumeIdOnCloud: {{ vm.volume_id_on_cloud }}
48
+ MountPath: {{ vm.path }}
49
+ {%- endfor %}
50
+ {%- endif %}
43
51
 
44
52
  head_node_type: ray_head_default
45
53
 
@@ -7,7 +7,7 @@ idle_timeout_minutes: 60
7
7
 
8
8
  provider:
9
9
  type: external
10
- module: sky.skylet.providers.scp.SCPNodeProvider
10
+ module: sky.provision.scp
11
11
  region: {{region}}
12
12
  cache_stopped_nodes: True
13
13
 
@@ -24,19 +24,6 @@ available_node_types:
24
24
  InstanceType: {{instance_type}}
25
25
  imageId: {{image_id}}
26
26
  diskSize: {{disk_size}}
27
- {% if num_nodes > 1 %}
28
- ray_worker_default:
29
- min_workers: {{num_nodes - 1}}
30
- max_workers: {{num_nodes - 1}}
31
- resources: {}
32
- node_config:
33
- AuthorizedKey: |
34
- skypilot:ssh_public_key_content
35
- InstanceType: {{instance_type}}
36
- imageId: {{image_id}}
37
- diskSize: {{disk_size}}
38
-
39
- {%- endif %}
40
27
 
41
28
  head_node_type: ray_head_default
42
29
 
@@ -50,10 +37,6 @@ file_mounts: {
50
37
  {%- endfor %}
51
38
  }
52
39
 
53
- rsync_exclude: []
54
-
55
- initialization_commands: []
56
-
57
40
  # List of shell commands to run to set up nodes.
58
41
  # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
59
42
  # connection, which is expensive. Try your best to co-locate commands into fewer
@@ -77,36 +60,6 @@ setup_commands:
77
60
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
78
61
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
79
62
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
80
- {{ ssh_max_sessions_config }}
81
-
82
- # Command to start ray on the head node. You don't need to change this.
83
- # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
84
- # connection, which is expensive. Try your best to co-locate commands into fewer
85
- # items! The same comment applies for worker_start_ray_commands.
86
- #
87
- # Increment the following for catching performance bugs easier:
88
- # current num items (num SSH connections): 1
89
- head_start_ray_commands:
90
- # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
91
- # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
92
- # all the sessions to be reloaded. This is a workaround.
93
- - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
94
- which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
95
- {{dump_port_command}}; {{ray_head_wait_initialized_command}}
96
-
97
- {%- if num_nodes > 1 %}
98
- worker_start_ray_commands:
99
- - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
100
- which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
101
- {%- else %}
102
- worker_start_ray_commands: []
103
- {%- endif %}
104
-
105
- head_node: {}
106
- worker_nodes: {}
107
63
 
108
- # These fields are required for external cloud providers.
109
- head_setup_commands: []
110
- worker_setup_commands: []
111
- cluster_synced_files: []
112
- file_mounts_sync_continuously: False
64
+ # Commands to start Ray clusters are now placed in `sky.provision.instance_setup`.
65
+ # We do not need to list it here anymore.
@@ -0,0 +1,108 @@
1
+ cluster_name: {{ cluster_name_on_cloud }}
2
+
3
+ max_workers: {{ num_nodes - 1 }}
4
+ upscaling_speed: {{ num_nodes - 1 }}
5
+ idle_timeout_minutes: 5
6
+
7
+ provider:
8
+ type: external
9
+ module: sky.provision.seeweb
10
+ region: "{{ region }}"
11
+
12
+ auth:
13
+ ssh_user: ecuser
14
+ ssh_private_key: {{ ssh_private_key }}
15
+
16
+ available_node_types:
17
+ ray_head_default:
18
+ resources: {}
19
+ node_config:
20
+ plan: {{ instance_type }}
21
+ image: {{ image_id }}
22
+ location: {{ region }}
23
+ {% if seeweb_gpu_config is not none %}
24
+ gpu: {{ seeweb_gpu_config.gpu }}
25
+ gpu_label: "{{ seeweb_gpu_config.gpu_label }}"
26
+ {% endif %}
27
+ disk: {{ disk_size }}
28
+
29
+ head_node_type: ray_head_default
30
+
31
+ file_mounts: {
32
+ "~/.seeweb_cloud/seeweb_keys": "~/.seeweb_cloud/seeweb_keys",
33
+ "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
34
+ "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
35
+ {%- for remote_path, local_path in credentials.items() %}
36
+ "{{remote_path}}": "{{local_path}}",
37
+ {%- endfor %}
38
+ "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
39
+ }
40
+
41
+ rsync_exclude: []
42
+
43
+ setup_commands:
44
+ - |
45
+ touch ~/.bashrc;
46
+ echo "127.0.0.1 $(hostname)" | sudo tee -a /etc/hosts || true;
47
+ echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts || true;
48
+ sudo systemctl stop unattended-upgrades || true;
49
+ sudo systemctl disable unattended-upgrades || true;
50
+ sudo apt update && sudo apt install -y patch || sudo yum install -y patch || true;
51
+ {{ conda_installation_commands }}
52
+ {{ ray_skypilot_installation_commands }}
53
+
54
+ head_start_ray_commands:
55
+ - |
56
+ retry_ray() {
57
+ local n=0; local max=30
58
+ until [ $n -ge $max ]; do
59
+ export SKYPILOT_NUM_GPUS=0
60
+ command -v nvidia-smi >/dev/null 2>&1 && \
61
+ SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
62
+
63
+ ray stop || true
64
+ RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
65
+ ray start --disable-usage-stats --head \
66
+ --port={{ ray_port }} --dashboard-port={{ ray_dashboard_port }} \
67
+ --object-manager-port=8076 \
68
+ --autoscaling-config=~/ray_bootstrap_config.yaml \
69
+ --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break
70
+
71
+ echo "[head] Ray failed to start ($((++n))/$max), retrying in 5s..."
72
+ sleep 5
73
+ done
74
+ [ $n -eq $max ] && { echo "Ray head failed"; exit 1; }
75
+ }
76
+ retry_ray
77
+
78
+ worker_start_ray_commands:
79
+ - |
80
+ retry_ray() {
81
+ local n=0; local max=30
82
+ until [ $n -ge $max ]; do
83
+ SKYPILOT_NUM_GPUS=0
84
+ command -v nvidia-smi >/dev/null 2>&1 && \
85
+ SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
86
+
87
+ ray stop || true
88
+ RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
89
+ ray start --disable-usage-stats \
90
+ --address=$RAY_HEAD_IP:{{ ray_port }} \
91
+ --object-manager-port=8076 \
92
+ --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break
93
+
94
+ echo "[worker] Ray failed to start ($((++n))/$max), retrying in 5s..."
95
+ sleep 5
96
+ done
97
+ [ $n -eq $max ] && { echo "Ray worker failed"; exit 1; }
98
+ }
99
+ retry_ray
100
+
101
+ head_node: {}
102
+ worker_nodes: {}
103
+
104
+ head_setup_commands: []
105
+ worker_setup_commands: []
106
+
107
+ cluster_synced_files: []
108
+ file_mounts_sync_continuously: False
@@ -0,0 +1,72 @@
1
+ cluster_name: {{cluster_name_on_cloud}}
2
+
3
+ # The maximum number of worker nodes to launch in addition to the head node.
4
+ max_workers: {{num_nodes - 1}}
5
+ upscaling_speed: {{num_nodes - 1}}
6
+ idle_timeout_minutes: 60
7
+
8
+ provider:
9
+ type: external
10
+ module: sky.provision.shadeform
11
+ region: "{{region}}"
12
+ disable_launch_config_check: true
13
+
14
+ auth:
15
+ ssh_user: shadeform
16
+ ssh_private_key: {{ssh_private_key}}
17
+ ssh_key_id: {{ssh_key_id}}
18
+
19
+ available_node_types:
20
+ ray_head_default:
21
+ {%- if custom_resources %}
22
+ resources: {{custom_resources}}
23
+ {%- else %}
24
+ resources: {}
25
+ {%- endif %}
26
+ node_config:
27
+ InstanceType: {{instance_type}}
28
+ PublicKey: |-
29
+ skypilot:ssh_public_key_content
30
+
31
+ head_node_type: ray_head_default
32
+
33
+ # Format: `REMOTE_PATH : LOCAL_PATH`
34
+ file_mounts: {
35
+ "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
36
+ "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
37
+ {%- for remote_path, local_path in credentials.items() %}
38
+ "{{remote_path}}": "{{local_path}}",
39
+ {%- endfor %}
40
+ }
41
+
42
+ rsync_exclude: []
43
+
44
+ initialization_commands: []
45
+
46
+ # List of shell commands to run to set up nodes.
47
+ # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
48
+ # connection, which is expensive. Try your best to co-locate commands into fewer
49
+ # items!
50
+ #
51
+ # Increment the following for catching performance bugs easier:
52
+ # current num items (num SSH connections): 1
53
+ setup_commands:
54
+ # Create ~/.ssh/config file in case the file does not exist in the image.
55
+ # Line 'rm ..': there is another installation of pip.
56
+ # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
57
+ # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck as the number of running ray jobs increases.
58
+ # Line 'mkdir -p ..': disable host key check
59
+ # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
60
+ - {%- for initial_setup_command in initial_setup_commands %}
61
+ {{ initial_setup_command }}
62
+ {%- endfor %}
63
+ mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
64
+ {{ conda_installation_commands }}
65
+ {{ ray_skypilot_installation_commands }}
66
+ sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
67
+ sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
68
+ (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
69
+ {{ ssh_max_sessions_config }}
70
+
71
+ # Commands to start Ray clusters are now placed in `sky.provision.instance_setup`.
72
+ # We do not need to list it here anymore.
@@ -34,6 +34,9 @@ file_mounts:
34
34
  {%- for remote_catalog_path, local_catalog_path in modified_catalogs.items() %}
35
35
  {{remote_catalog_path}}: {{local_catalog_path}}
36
36
  {%- endfor %}
37
+ {%- for controller_file_mount_path, local_file_mount_path in local_to_controller_file_mounts.items() %}
38
+ {{controller_file_mount_path}}: {{local_file_mount_path}}
39
+ {%- endfor %}
37
40
  {%- if use_tls %}
38
41
  {{remote_tls_keyfile}}: {{local_tls_keyfile}}
39
42
  {{remote_tls_certfile}}: {{local_tls_certfile}}
@@ -42,13 +45,30 @@ file_mounts:
42
45
  run: |
43
46
  # Activate the Python environment, so that cloud SDKs can be found in the
44
47
  # PATH.
48
+ {%- if consolidation_mode_job_id is none %}
45
49
  {{ sky_activate_python_env }}
50
+ {%- endif %}
46
51
  # Start sky serve service.
47
- python -u -m sky.serve.service \
52
+ {%- if consolidation_mode_job_id is not none %}
53
+ {{sky_python_cmd}} \
54
+ {%- else %}
55
+ python \
56
+ {%- endif %}
57
+ -u -m sky.serve.service \
48
58
  --service-name {{service_name}} \
49
59
  --task-yaml {{remote_task_yaml_path}} \
60
+ --entrypoint {{entrypoint}} \
61
+ {%- if consolidation_mode_job_id is not none %}
62
+ --job-id {{consolidation_mode_job_id}} \
63
+ {%- else %}
50
64
  --job-id $SKYPILOT_INTERNAL_JOB_ID \
51
- >> {{controller_log_file}} 2>&1
65
+ {%- endif %}
66
+ >> {{controller_log_file}} 2>&1 \
67
+ {%- if consolidation_mode_job_id is not none %}
68
+ &
69
+ {%- endif %}
70
+ # For consolidation mode, we need to run the service in the background so
71
+ # that it can immediately return in serve.core.up().
52
72
 
53
73
  envs:
54
74
  {%- for env_name, env_value in controller_envs.items() %}