skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -3,7 +3,7 @@
3
3
  import dataclasses
4
4
  import shlex
5
5
  import time
6
- from typing import Any, Dict, List
6
+ from typing import Any, Dict, List, Optional
7
7
 
8
8
  from sky import sky_logging
9
9
  from sky.skylet import constants
@@ -15,23 +15,52 @@ logger = sky_logging.init_logger(__name__)
15
15
  # Configure environment variables. A docker image can have environment variables
16
16
  # set in the Dockerfile with `ENV``. We need to export these variables to the
17
17
  # shell environment, so that our ssh session can access them.
18
+ # Filter out RAY_RUNTIME_ENV_HOOK to prevent Ray version conflicts.
19
+ # Docker images with Ray 2.48.0+ set this for UV package manager support,
20
+ # but it causes FAILED_DRIVER errors with SkyPilot's Ray 2.9.3.
21
+ # See: https://github.com/skypilot-org/skypilot/pull/7181
18
22
  SETUP_ENV_VARS_CMD = (
19
23
  'prefix_cmd() '
20
24
  '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
21
- 'export -p > ~/container_env_var.sh && '
25
+ 'export -p | grep -v RAY_RUNTIME_ENV_HOOK > ~/container_env_var.sh && '
22
26
  '$(prefix_cmd) '
23
27
  'mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;')
24
28
 
25
29
  # Docker daemon may not be ready when the machine is firstly started. The error
26
30
  # message starts with the following string. We should wait for a while and retry
27
31
  # the command.
28
- DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
29
- 'the Docker daemon socket')
32
+ DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to ')
30
33
 
31
34
  DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')
35
+ DOCKER_SOCKET_NOT_READY_STR_2 = (
36
+ 'check if the path is correct and if the daemon is running')
32
37
 
33
38
  _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30
34
39
 
40
+ # Install AWS CLI v2 (not v1 from pip) as it's required for ECR authentication
41
+ # AWS CLI v2 is installed as a standalone binary, not a Python package. See:
42
+ # https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
43
+ INSTALL_AWS_CLI_CMD = (
44
+ 'which aws || ((command -v unzip >/dev/null 2>&1 || '
45
+ '(sudo apt-get update && sudo apt-get install -y unzip)) && '
46
+ 'curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" '
47
+ '-o "/tmp/awscliv2.zip" && '
48
+ 'unzip -q /tmp/awscliv2.zip -d /tmp && sudo /tmp/aws/install '
49
+ '&& rm -rf /tmp/awscliv2.zip /tmp/aws)')
50
+
51
+
52
+ def _extract_region_from_ecr_server(server: str) -> str:
53
+ """Extract AWS region from ECR server URL.
54
+
55
+ ECR server format: <account-id>.dkr.ecr.<region>.amazonaws.com
56
+ Returns the region part from the URL.
57
+ """
58
+ # Split: ['<account-id>', 'dkr', 'ecr', '<region>', 'amazonaws', 'com']
59
+ parts = server.split('.')
60
+ if len(parts) >= 6 and parts[1] == 'dkr' and parts[2] == 'ecr':
61
+ return parts[3]
62
+ raise ValueError(f'Invalid ECR server format: {server}')
63
+
35
64
 
36
65
  @dataclasses.dataclass
37
66
  class DockerLoginConfig:
@@ -83,6 +112,21 @@ def check_docker_image(cname, docker_cmd):
83
112
  return _check_helper(cname, '.Config.Image', docker_cmd)
84
113
 
85
114
 
115
def maybe_remove_container_cmds(container_name, docker_cmd):
    """Build a shell command that removes a container if it exists.

    The returned command force-removes the container, discards the error
    output and always succeeds, so it is a no-op when the container does not
    exist.

    Args:
        container_name: Name of the container to remove. It is shell-quoted
            before being placed on the command line.
        docker_cmd: Container CLI executable to invoke (e.g. 'docker').

    Returns:
        The command as a single shell string.
    """
    docker_rm = [
        docker_cmd,
        'rm',
        '-f',
        # Quoting leaves ordinary names untouched and keeps unusual ones from
        # being split or interpreted by the shell.
        shlex.quote(container_name),
        '2>/dev/null',
        '||',
        'true',
    ]
    return ' '.join(docker_rm)
128
+
129
+
86
130
  def docker_start_cmds(
87
131
  image,
88
132
  container_name,
@@ -142,19 +186,23 @@ class DockerInitializer:
142
186
  self.docker_config = docker_config
143
187
  self.container_name = docker_config['container_name']
144
188
  self.runner = runner
145
- self.home_dir = None
189
+ self.home_dir: Optional[str] = None
146
190
  self.initialized = False
147
191
  # podman is not fully tested yet.
148
192
  use_podman = docker_config.get('use_podman', False)
149
193
  self.docker_cmd = 'podman' if use_podman else 'docker'
150
194
  self.log_path = log_path
151
195
 
152
- def _run(self,
153
- cmd,
154
- run_env='host',
155
- wait_for_docker_daemon: bool = False,
156
- separate_stderr: bool = False,
157
- log_err_when_fail: bool = True) -> str:
196
+ def _run(
197
+ self,
198
+ cmd,
199
+ run_env='host',
200
+ wait_for_docker_daemon: bool = False,
201
+ separate_stderr: bool = False,
202
+ log_err_when_fail: bool = True,
203
+ flock_name: Optional[str] = None,
204
+ flock_args: Optional[str] = None,
205
+ ) -> str:
158
206
 
159
207
  if run_env == 'docker':
160
208
  cmd = self._docker_expand_user(cmd, any_char=True)
@@ -163,8 +211,13 @@ class DockerInitializer:
163
211
  # an error: `the input device is not a TTY`, and it works without
164
212
  # `-it` flag.
165
213
  # TODO(zhwu): ray use the `-it` flag, we need to check why.
166
- cmd = (f'{self.docker_cmd} exec {self.container_name} /bin/bash -c'
167
- f' {shlex.quote(cmd)} ')
214
+ cmd = (f'{self.docker_cmd} exec -u 0 {self.container_name}'
215
+ f' /bin/bash -c {shlex.quote(cmd)} ')
216
+
217
+ if flock_name is not None:
218
+ flock_args = flock_args or ''
219
+ cmd = (f'flock {flock_args} /tmp/{flock_name} '
220
+ f'-c {shlex.quote(cmd)}')
168
221
 
169
222
  logger.debug(f'+ {cmd}')
170
223
  start = time.time()
@@ -176,7 +229,8 @@ class DockerInitializer:
176
229
  separate_stderr=separate_stderr,
177
230
  log_path=self.log_path)
178
231
  if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr or
179
- DOCKER_SOCKET_NOT_READY_STR in stdout + stderr):
232
+ DOCKER_SOCKET_NOT_READY_STR in stdout + stderr or
233
+ DOCKER_SOCKET_NOT_READY_STR_2 in stdout + stderr):
180
234
  if wait_for_docker_daemon:
181
235
  if time.time(
182
236
  ) - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
@@ -216,20 +270,56 @@ class DockerInitializer:
216
270
  if self._check_container_exited():
217
271
  self.initialized = True
218
272
  self._run(f'{self.docker_cmd} start {self.container_name}')
219
- self._run('sudo service ssh start', run_env='docker')
273
+ self._run('sudo service ssh start',
274
+ run_env='docker',
275
+ flock_name=f'{self.container_name}.sky.lifecycle.lock',
276
+ flock_args='-s -w 1')
220
277
  return self._run('whoami', run_env='docker')
221
278
 
222
279
  # SkyPilot: Docker login if user specified a private docker registry.
223
280
  if 'docker_login_config' in self.docker_config:
224
- # TODO(tian): Maybe support a command to get the login password?
225
281
  docker_login_config = DockerLoginConfig(
226
282
  **self.docker_config['docker_login_config'])
227
- self._run(
228
- f'{self.docker_cmd} login --username '
229
- f'{docker_login_config.username} '
230
- f'--password {docker_login_config.password} '
231
- f'{docker_login_config.server}',
232
- wait_for_docker_daemon=True)
283
+
284
+ if docker_login_config.password:
285
+ # Password is allowed to be empty, in that case, we will not run
286
+ # the login command, and assume that the image pulling is
287
+ # authenticated by the IAM permission on the VM.
288
+ self._run(
289
+ f'{self.docker_cmd} login --username '
290
+ f'{shlex.quote(docker_login_config.username)} '
291
+ f'--password {shlex.quote(docker_login_config.password)} '
292
+ f'{shlex.quote(docker_login_config.server)}',
293
+ wait_for_docker_daemon=True)
294
+ elif (docker_login_config.server.endswith('.amazonaws.com') and
295
+ '.dkr.ecr.' in docker_login_config.server):
296
+ # AWS ECR: Use aws ecr get-login-password for authentication
297
+ # ECR format: <account-id>.dkr.ecr.<region>.amazonaws.com
298
+ # This command uses the IAM credentials from the EC2 instance
299
+ # Ref: https://docs.aws.amazon.com/AmazonECR/latest/userguide/registry_auth.html # pylint: disable=line-too-long
300
+ region = _extract_region_from_ecr_server(
301
+ docker_login_config.server)
302
+
303
+ # AWS CLI is not pre-installed on AWS instances, unlike gcloud
304
+ # on GCP instances, so we need to install it first
305
+ self._run(INSTALL_AWS_CLI_CMD, wait_for_docker_daemon=False)
306
+
307
+ self._run(
308
+ f'aws ecr get-login-password --region {region} | '
309
+ f'{self.docker_cmd} login --username AWS '
310
+ f'--password-stdin '
311
+ f'{shlex.quote(docker_login_config.server)}',
312
+ wait_for_docker_daemon=True)
313
+ elif docker_login_config.server.endswith('-docker.pkg.dev'):
314
+ # Docker image server is on GCR, we need to do additional setup
315
+ # to pull the image.
316
+ # When no username or password is provided, we assume that
317
+ # we are on GCP VM (i.e. gcloud auth configure-docker is
318
+ # enough), or the image server is public.
319
+ # For the former case, gcloud should be available, and latter
320
+ # should be fine to fail the following command.
321
+ self._run('gcloud auth configure-docker '
322
+ f'{docker_login_config.server} --quiet || true')
233
323
  # We automatically add the server prefix to the image name if
234
324
  # the user did not add it.
235
325
  specific_image = docker_login_config.format_image(specific_image)
@@ -271,6 +361,10 @@ class DockerInitializer:
271
361
  'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
272
362
  'sudo systemctl restart docker; } || true')
273
363
  user_docker_run_options = self.docker_config.get('run_options', [])
364
+ remove_container_cmd = maybe_remove_container_cmds(
365
+ self.container_name,
366
+ self.docker_cmd,
367
+ )
274
368
  start_command = docker_start_cmds(
275
369
  specific_image,
276
370
  self.container_name,
@@ -278,7 +372,9 @@ class DockerInitializer:
278
372
  self._auto_configure_shm(user_docker_run_options)),
279
373
  self.docker_cmd,
280
374
  )
281
- self._run(start_command)
375
+ self._run(f'{remove_container_cmd} && {start_command}',
376
+ flock_name=f'{self.container_name}.sky.lifecycle.lock',
377
+ flock_args='-x -w 10')
282
378
 
283
379
  # SkyPilot: Setup Commands.
284
380
  # TODO(zhwu): the following setups should be aligned with the kubernetes
@@ -296,14 +392,18 @@ class DockerInitializer:
296
392
  'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;',
297
393
  run_env='docker')
298
394
  # Install dependencies.
299
- self._run(
300
- 'sudo apt-get update; '
395
+ cmd = (
396
+ 'bash -lc \''
397
+ 'exec 200>/var/tmp/sky_apt.lock; '
398
+ 'flock -x -w 120 200 || exit 1; '
399
+ 'export DEBIAN_FRONTEND=noninteractive; '
400
+ 'apt-get -yq update && '
301
401
  # Our mount script will install gcsfuse without fuse package.
302
402
  # We need to install fuse package first to enable storage mount.
303
403
  # The dpkg option is to suppress the prompt for fuse installation.
304
- 'sudo apt-get -o DPkg::Options::="--force-confnew" install -y '
305
- 'rsync curl wget patch openssh-server python3-pip fuse;',
306
- run_env='docker')
404
+ 'apt-get -o DPkg::Options::=--force-confnew install -y '
405
+ 'rsync curl wget patch openssh-server python3-pip fuse\'')
406
+ self._run(cmd, run_env='docker')
307
407
 
308
408
  # Copy local authorized_keys to docker container.
309
409
  # Stop and disable jupyter service. This is to avoid port conflict on
@@ -329,13 +429,16 @@ class DockerInitializer:
329
429
  # `mesg: ttyname failed: inappropriate ioctl for device`.
330
430
  # see https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
331
431
  port = constants.DEFAULT_DOCKER_PORT
432
+ # In case the port is already configured in the sshd_config file
433
+ # in some images, we delete it first and then append the new one.
332
434
  # pylint: disable=anomalous-backslash-in-string
333
435
  self._run(
334
- f'sudo sed -i "s/#Port 22/Port {port}/" /etc/ssh/sshd_config;'
436
+ 'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
437
+ f'echo "Port {port}" | sudo tee -a /etc/ssh/sshd_config > /dev/null;'
335
438
  'mkdir -p ~/.ssh;'
336
439
  'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
337
440
  'sudo service ssh start;'
338
- 'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;'
441
+ 'sudo sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;'
339
442
  f'{SETUP_ENV_VARS_CMD}',
340
443
  run_env='docker')
341
444
 
@@ -376,9 +479,13 @@ class DockerInitializer:
376
479
  user_pos = string.find('~')
377
480
  if user_pos > -1:
378
481
  if self.home_dir is None:
379
- cmd = (f'{self.docker_cmd} exec {self.container_name} '
380
- 'printenv HOME')
381
- self.home_dir = self._run(cmd, separate_stderr=True)
482
+ cmd = (f'{self.docker_cmd} exec {self.container_name}'
483
+ ' printenv HOME')
484
+ self.home_dir = self._run(
485
+ cmd,
486
+ separate_stderr=True,
487
+ flock_name=f'{self.container_name}.sky.lifecycle.lock',
488
+ flock_args='-s -w 1')
382
489
  # Check for unexpected newline in home directory, which can be
383
490
  # a common issue when the output is mixed with stderr.
384
491
  assert '\n' not in self.home_dir, (
@@ -1,13 +1,13 @@
1
1
  """FluidStack instance provisioning."""
2
2
  import os
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
- from sky import authentication as auth
7
6
  from sky import exceptions
8
7
  from sky import sky_logging
9
8
  from sky.provision import common
10
9
  from sky.provision.fluidstack import fluidstack_utils as utils
10
+ from sky.utils import auth_utils
11
11
  from sky.utils import command_runner
12
12
  from sky.utils import common_utils
13
13
  from sky.utils import status_lib
@@ -26,7 +26,8 @@ logger = sky_logging.init_logger(__name__)
26
26
 
27
27
  def get_internal_ip(node_info: Dict[str, Any]) -> None:
28
28
  node_info['internal_ip'] = node_info['ip_address']
29
- private_key_path, _ = auth.get_or_generate_keys()
29
+
30
+ private_key_path, _ = auth_utils.get_or_generate_keys()
30
31
  runner = command_runner.SSHCommandRunner(
31
32
  (node_info['ip_address'], 22),
32
33
  ssh_user='ubuntu',
@@ -77,10 +78,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
77
78
  return head_instance_id
78
79
 
79
80
 
80
- def run_instances(region: str, cluster_name_on_cloud: str,
81
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
81
82
  config: common.ProvisionConfig) -> common.ProvisionRecord:
82
83
  """Runs instances for the given cluster."""
83
-
84
+ del cluster_name # unused
84
85
  pending_status = ['pending', 'provisioning']
85
86
  while True:
86
87
  instances = _filter_instances(cluster_name_on_cloud, pending_status)
@@ -286,11 +287,14 @@ def get_cluster_info(
286
287
 
287
288
 
288
289
  def query_instances(
290
+ cluster_name: str,
289
291
  cluster_name_on_cloud: str,
290
292
  provider_config: Optional[Dict[str, Any]] = None,
291
293
  non_terminated_only: bool = True,
292
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
294
+ retry_if_missing: bool = False,
295
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
293
296
  """See sky/provision/__init__.py"""
297
+ del cluster_name, retry_if_missing # unused
294
298
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
295
299
  instances = _filter_instances(cluster_name_on_cloud, None)
296
300
  instances = _filter_instances(cluster_name_on_cloud, None)
@@ -301,7 +305,8 @@ def query_instances(
301
305
  'failed': status_lib.ClusterStatus.INIT,
302
306
  'terminated': None,
303
307
  }
304
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
308
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
309
+ Optional[str]]] = {}
305
310
  for inst_id, inst in instances.items():
306
311
  if inst['status'] not in status_map:
307
312
  with ux_utils.print_exception_no_traceback():
@@ -310,7 +315,7 @@ def query_instances(
310
315
  status = status_map.get(inst['status'], None)
311
316
  if non_terminated_only and status is None:
312
317
  continue
313
- statuses[inst_id] = status
318
+ statuses[inst_id] = (status, None)
314
319
  return statuses
315
320
 
316
321
 
@@ -1,6 +1,7 @@
1
1
  """GCP provisioner for SkyPilot."""
2
2
 
3
3
  from sky.provision.gcp.config import bootstrap_instances
4
+ from sky.provision.gcp.instance import cleanup_custom_multi_network
4
5
  from sky.provision.gcp.instance import cleanup_ports
5
6
  from sky.provision.gcp.instance import get_cluster_info
6
7
  from sky.provision.gcp.instance import open_ports