skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -48,9 +48,18 @@ install_requires = [
48
48
  # (https://github.com/yaml/pyyaml/issues/601)
49
49
  # <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
50
50
  'pyyaml > 3.13, != 5.4.*',
51
+ 'ijson',
52
+ 'orjson',
51
53
  'requests',
54
+ # SkyPilot inherits from uvicorn.Server to customize the behavior of
55
+ # uvicorn, so we need to pin uvicorn version to avoid potential break
56
+ # changes.
57
+ # Notes for current version check:
58
+ # - uvicorn 0.33.0 is the latest version that supports Python 3.8
59
+ # - uvicorn 0.36.0 removes setup_event_loop thus breaks SkyPilot's custom
60
+ # behavior.
61
+ 'uvicorn[standard] >=0.33.0, <0.36.0',
52
62
  'fastapi',
53
- 'uvicorn[standard]',
54
63
  # Some pydantic versions are not compatible with ray. Adopted from ray's
55
64
  # setup.py:
56
65
  # https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L254
@@ -63,6 +72,8 @@ install_requires = [
63
72
  'setproctitle',
64
73
  'sqlalchemy',
65
74
  'psycopg2-binary',
75
+ 'aiosqlite',
76
+ 'asyncpg',
66
77
  # TODO(hailong): These three dependencies should be removed after we make
67
78
  # the client-side actually not importing them.
68
79
  'casbin',
@@ -70,13 +81,13 @@ install_requires = [
70
81
  # Required for API server metrics
71
82
  'prometheus_client>=0.8.0',
72
83
  'passlib',
73
- 'bcrypt',
84
+ 'bcrypt==4.0.1',
74
85
  'pyjwt',
75
86
  'gitpython',
87
+ 'paramiko',
76
88
  'types-paramiko',
77
89
  'alembic',
78
90
  'aiohttp',
79
- 'aiosqlite',
80
91
  'anyio',
81
92
  ]
82
93
 
@@ -94,6 +105,10 @@ GRPC = 'grpcio>=1.63.0'
94
105
  PROTOBUF = 'protobuf>=5.26.1, < 7.0.0'
95
106
 
96
107
  server_dependencies = [
108
+ # TODO: Some of these dependencies are also specified in install_requires,
109
+ # so they are redundant here. We should figure out if they are only needed
110
+ # on the server (should remove from install_requires), or if they are needed
111
+ # on the client (should remove from here).
97
112
  'casbin',
98
113
  'sqlalchemy_adapter',
99
114
  'passlib',
@@ -103,14 +118,16 @@ server_dependencies = [
103
118
  GRPC,
104
119
  PROTOBUF,
105
120
  'aiosqlite',
121
+ 'greenlet',
106
122
  ]
107
123
 
108
124
  local_ray = [
109
125
  # Lower version of ray will cause dependency conflict for
110
126
  # click/grpcio/protobuf.
111
- # Excluded 2.6.0 as it has a bug in the cluster launcher:
127
+ # Ray 2.6.1+ resolved cluster launcher bugs
128
+ # and grpcio issues on Apple Silicon.
112
129
  # https://github.com/ray-project/ray/releases/tag/ray-2.6.1
113
- 'ray[default] >= 2.2.0, != 2.6.0',
130
+ 'ray[default] >= 2.6.1',
114
131
  ]
115
132
 
116
133
  remote = [
@@ -132,11 +149,19 @@ aws_dependencies = [
132
149
  'colorama < 0.4.5',
133
150
  ]
134
151
 
152
+ # Kubernetes 32.0.0 has an authentication bug:
153
+ # https://github.com/kubernetes-client/python/issues/2333
154
+ kubernetes_dependencies = [
155
+ 'kubernetes>=20.0.0,!=32.0.0',
156
+ 'websockets',
157
+ 'python-dateutil',
158
+ ]
159
+
135
160
  # azure-cli cannot be installed normally by uv, so we need to work around it in
136
161
  # a few places.
137
162
  AZURE_CLI = 'azure-cli>=2.65.0'
138
163
 
139
- extras_require: Dict[str, List[str]] = {
164
+ cloud_dependencies: Dict[str, List[str]] = {
140
165
  'aws': aws_dependencies,
141
166
  # TODO(zongheng): azure-cli is huge and takes a long time to install.
142
167
  # Tracked in: https://github.com/Azure/azure-cli/issues/7387
@@ -172,20 +197,23 @@ extras_require: Dict[str, List[str]] = {
172
197
  'docker': ['docker'] + local_ray,
173
198
  'lambda': [], # No dependencies needed for lambda
174
199
  'cloudflare': aws_dependencies,
200
+ 'coreweave': aws_dependencies + kubernetes_dependencies,
175
201
  'scp': local_ray,
176
202
  'oci': ['oci'],
177
- # Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
178
- 'kubernetes': [
179
- 'kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'
180
- ],
181
- 'ssh': ['kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'],
182
- 'remote': remote,
203
+ 'kubernetes': kubernetes_dependencies,
204
+ 'ssh': kubernetes_dependencies,
183
205
  # For the container registry auth api. Reference:
184
206
  # https://github.com/runpod/runpod-python/releases/tag/1.6.1
185
- 'runpod': ['runpod>=1.6.1'],
207
+ # RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
208
+ # stdlib provides tomllib; on lower versions we depend on tomli explicitly.
209
+ # Instead of installing tomli conditionally, we install it explicitly.
210
+ # This is because the conditional installation of tomli does not work
211
+ # with controller package installation code.
212
+ 'runpod': ['runpod>=1.6.1', 'tomli'],
186
213
  'fluidstack': [], # No dependencies needed for fluidstack
187
214
  'cudo': ['cudo-compute>=0.1.10'],
188
215
  'paperspace': [], # No dependencies needed for paperspace
216
+ 'primeintellect': [], # No dependencies needed for primeintellect
189
217
  'do': ['pydo>=0.3.0', 'azure-core>=1.24.0', 'azure-common'],
190
218
  'vast': ['vastai-sdk>=0.1.12'],
191
219
  'vsphere': [
@@ -198,19 +226,25 @@ extras_require: Dict[str, List[str]] = {
198
226
  # 'vsphere-automation-sdk @ git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.1.0' pylint: disable=line-too-long
199
227
  ],
200
228
  'nebius': [
201
- 'nebius>=0.2.47',
229
+ # Nebius requires grpcio and protobuf, so we need to include
230
+ # our constraints here.
231
+ 'nebius>=0.3.12',
232
+ GRPC,
233
+ PROTOBUF,
202
234
  ] + aws_dependencies,
203
235
  'hyperbolic': [], # No dependencies needed for hyperbolic
204
- 'server': server_dependencies,
236
+ 'seeweb': ['ecsapi==0.4.0'],
237
+ 'shadeform': [], # No dependencies needed for shadeform
238
+ 'slurm': [], # No dependencies needed for slurm
205
239
  }
206
240
 
207
241
  # Calculate which clouds should be included in the [all] installation.
208
- clouds_for_all = set(extras_require)
209
- clouds_for_all.remove('remote')
242
+ clouds_for_all = set(cloud_dependencies)
210
243
 
211
244
  if sys.version_info < (3, 10):
212
245
  # Nebius needs python3.10. If python 3.9 [all] will not install nebius
213
246
  clouds_for_all.remove('nebius')
247
+ clouds_for_all.remove('seeweb')
214
248
 
215
249
  if sys.version_info >= (3, 12):
216
250
  # The version of ray we use does not work with >= 3.12, so avoid clouds
@@ -220,5 +254,16 @@ if sys.version_info >= (3, 12):
220
254
  # TODO: Remove once https://github.com/vast-ai/vast-sdk/pull/6 is released
221
255
  clouds_for_all.remove('vast')
222
256
 
223
- extras_require['all'] = list(
224
- set().union(*[extras_require[cloud] for cloud in clouds_for_all]))
257
+ cloud_extras = {
258
+ cloud: dependencies + server_dependencies
259
+ for cloud, dependencies in cloud_dependencies.items()
260
+ }
261
+
262
+ extras_require: Dict[str, List[str]] = {
263
+ # Include server_dependencies with each cloud.
264
+ **cloud_extras,
265
+ 'all': list(set().union(*[cloud_extras[cloud] for cloud in clouds_for_all])
266
+ ),
267
+ 'remote': remote,
268
+ 'server': server_dependencies,
269
+ }
sky/setup_files/setup.py CHANGED
@@ -148,47 +148,47 @@ if os.path.exists(readme_filepath):
148
148
  long_description = io.open(readme_filepath, 'r', encoding='utf-8').read()
149
149
  long_description = parse_readme(long_description)
150
150
 
151
- atexit.register(revert_commit_hash)
152
- replace_commit_hash()
153
-
154
- setuptools.setup(
155
- # NOTE: this affects the package.whl wheel name. When changing this (if
156
- # ever), you must grep for '.whl' and change all corresponding wheel paths
157
- # (templates/*.j2 and wheel_utils.py).
158
- name='skypilot-nightly',
159
- version=find_version(),
160
- packages=setuptools.find_packages(),
161
- author='SkyPilot Team',
162
- license='Apache 2.0',
163
- readme='README.md',
164
- description='SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.',
165
- long_description=long_description,
166
- long_description_content_type='text/markdown',
167
- setup_requires=['wheel'],
168
- requires_python='>=3.7',
169
- install_requires=dependencies['install_requires'],
170
- extras_require=dependencies['extras_require'],
171
- entry_points={
172
- 'console_scripts': ['sky = sky.cli:cli'],
173
- },
174
- include_package_data=True,
175
- classifiers=[
176
- 'Programming Language :: Python :: 3.7',
177
- 'Programming Language :: Python :: 3.8',
178
- 'Programming Language :: Python :: 3.9',
179
- 'Programming Language :: Python :: 3.10',
180
- 'Programming Language :: Python :: 3.11',
181
- 'Programming Language :: Python :: 3.12',
182
- 'Programming Language :: Python :: 3.13',
183
- 'License :: OSI Approved :: Apache Software License',
184
- 'Operating System :: OS Independent',
185
- 'Topic :: Software Development :: Libraries :: Python Modules',
186
- 'Topic :: System :: Distributed Computing',
187
- ],
188
- project_urls={
189
- 'Homepage': 'https://github.com/skypilot-org/skypilot',
190
- 'Issues': 'https://github.com/skypilot-org/skypilot/issues',
191
- 'Discussion': 'https://github.com/skypilot-org/skypilot/discussions',
192
- 'Documentation': 'https://docs.skypilot.co/',
193
- },
194
- )
151
+ if __name__ == '__main__':
152
+ atexit.register(revert_commit_hash)
153
+ replace_commit_hash()
154
+ setuptools.setup(
155
+ # NOTE: this affects the package.whl wheel name. When changing this (if
156
+ # ever), you must grep for '.whl' and change all corresponding wheel paths
157
+ # (templates/*.j2 and wheel_utils.py).
158
+ name='skypilot-nightly',
159
+ version=find_version(),
160
+ packages=setuptools.find_packages(),
161
+ author='SkyPilot Team',
162
+ license='Apache 2.0',
163
+ readme='README.md',
164
+ description='SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.',
165
+ long_description=long_description,
166
+ long_description_content_type='text/markdown',
167
+ setup_requires=['wheel'],
168
+ requires_python='>=3.7',
169
+ install_requires=dependencies['install_requires'],
170
+ extras_require=dependencies['extras_require'],
171
+ entry_points={
172
+ 'console_scripts': ['sky = sky.cli:cli'],
173
+ },
174
+ include_package_data=True,
175
+ classifiers=[
176
+ 'Programming Language :: Python :: 3.7',
177
+ 'Programming Language :: Python :: 3.8',
178
+ 'Programming Language :: Python :: 3.9',
179
+ 'Programming Language :: Python :: 3.10',
180
+ 'Programming Language :: Python :: 3.11',
181
+ 'Programming Language :: Python :: 3.12',
182
+ 'Programming Language :: Python :: 3.13',
183
+ 'License :: OSI Approved :: Apache Software License',
184
+ 'Operating System :: OS Independent',
185
+ 'Topic :: Software Development :: Libraries :: Python Modules',
186
+ 'Topic :: System :: Distributed Computing',
187
+ ],
188
+ project_urls={
189
+ 'Homepage': 'https://github.com/skypilot-org/skypilot',
190
+ 'Issues': 'https://github.com/skypilot-org/skypilot/issues',
191
+ 'Discussion': 'https://github.com/skypilot-org/skypilot/discussions',
192
+ 'Documentation': 'https://docs.skypilot.co/',
193
+ },
194
+ )
sky/sky_logging.py CHANGED
@@ -85,7 +85,7 @@ class EnvAwareHandler(rich_utils.RichSafeStreamHandler):
85
85
  @level.setter
86
86
  def level(self, level):
87
87
  # pylint: disable=protected-access
88
- self._level = logging._checkLevel(level)
88
+ self._level = logging._checkLevel(level) # type: ignore[attr-defined]
89
89
 
90
90
 
91
91
  _root_logger = logging.getLogger('sky')
@@ -109,7 +109,6 @@ def _setup_logger():
109
109
  global _default_handler
110
110
  if _default_handler is None:
111
111
  _default_handler = EnvAwareHandler(sys.stdout)
112
- _default_handler.flush = sys.stdout.flush # type: ignore
113
112
  if env_options.Options.SHOW_DEBUG_INFO.get():
114
113
  _default_handler.setLevel(logging.DEBUG)
115
114
  else:
@@ -129,7 +128,6 @@ def _setup_logger():
129
128
  for logger_name in _SENSITIVE_LOGGER:
130
129
  logger = logging.getLogger(logger_name)
131
130
  handler_to_logger = EnvAwareHandler(sys.stdout, sensitive=True)
132
- handler_to_logger.flush = sys.stdout.flush # type: ignore
133
131
  logger.addHandler(handler_to_logger)
134
132
  logger.setLevel(logging.INFO)
135
133
  if _show_logging_prefix():
@@ -148,7 +146,8 @@ def reload_logger():
148
146
  such as SKYPILOT_DEBUG.
149
147
  """
150
148
  global _default_handler
151
- _root_logger.removeHandler(_default_handler)
149
+ if _default_handler is not None:
150
+ _root_logger.removeHandler(_default_handler)
152
151
  _default_handler = None
153
152
  _setup_logger()
154
153
 
@@ -212,12 +211,21 @@ def logging_enabled(logger: logging.Logger, level: int) -> bool:
212
211
 
213
212
 
214
213
  @contextlib.contextmanager
215
- def silent():
214
+ def silent(should_silence: bool = True):
216
215
  """Make all sky_logging.print() and logger.{info, warning...} silent.
217
216
 
218
217
  We preserve the ERROR level logging, so that errors are
219
218
  still printed.
219
+
220
+ Args:
221
+ should_silence: Whether to actually suppress the logging. If False, this
222
+ is a no-op context manager. Provided for convenience when we want to
223
+ suppress logging conditionally.
220
224
  """
225
+ if not should_silence:
226
+ yield
227
+ return
228
+
221
229
  global print
222
230
  previous_level = _root_logger.level
223
231
  previous_is_silent = is_silent()
@@ -1,51 +1,143 @@
1
1
  """Restarts skylet if version does not match"""
2
2
 
3
3
  import os
4
+ import signal
4
5
  import subprocess
6
+ from typing import List, Optional, Tuple
7
+
8
+ import psutil
5
9
 
6
10
  from sky.skylet import constants
11
+ from sky.skylet import runtime_utils
12
+ from sky.utils import common_utils
13
+
14
+ VERSION_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_VERSION_FILE)
15
+ SKYLET_LOG_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_LOG_FILE)
16
+ PID_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_PID_FILE)
17
+ PORT_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_PORT_FILE)
18
+
19
+
20
+ def _is_running_skylet_process(pid: int) -> bool:
21
+ if pid <= 0:
22
+ return False
23
+ try:
24
+ process = psutil.Process(pid)
25
+ if not process.is_running():
26
+ return False
27
+ # Check if command line contains the skylet module identifier
28
+ cmdline = process.cmdline()
29
+ return any('sky.skylet.skylet' in arg for arg in cmdline)
30
+ except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess,
31
+ OSError) as e:
32
+ print(f'Error checking if skylet process {pid} is running: {e}')
33
+ return False
34
+
35
+
36
def _find_running_skylet_pids() -> List[int]:
    """Return the PIDs of skylet processes that appear to be running.

    If PID_FILE exists, only the PID recorded there is considered: the
    result is `[pid]` when that process is a running skylet, else `[]`.
    If PID_FILE does not exist, the PIDs are scraped from `ps aux` output
    for backward compatibility.

    Returns:
        A possibly empty list of PIDs.
    """
    if not os.path.exists(PID_FILE):
        # Fall back to grep-based detection for backward compatibility.
        # We use -m to grep instead of {constants.SKY_PYTHON_CMD} -m to grep
        # because need to handle the backward compatibility of the old skylet
        # started before #3326, which does not use the full path to python.
        ps_result = subprocess.run(
            'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | grep " -m"',
            shell=True,
            check=False,
            capture_output=True,
            text=True)
        found: List[int] = []
        if ps_result.returncode == 0:
            # The PID is the second whitespace-separated column of `ps aux`.
            for row in ps_result.stdout.strip().split('\n'):
                fields = row.split()
                if len(fields) < 2:
                    continue
                try:
                    found.append(int(fields[1]))
                except ValueError:
                    continue
        return found

    try:
        with open(PID_FILE, 'r', encoding='utf-8') as pid_file:
            pid = int(pid_file.read().strip())
        if _is_running_skylet_process(pid):
            return [pid]
    except (OSError, ValueError, IOError) as e:
        # Don't fallback to grep-based detection as the existence of the
        # PID file implies that we are on the new version, and there is
        # possibility of there being multiple skylet processes running,
        # and we don't want to accidentally kill the wrong skylet(s).
        print(f'Error reading PID file {PID_FILE}: {e}')
    return []
73
+
74
+
75
def _check_version_match() -> Tuple[bool, Optional[str]]:
    """Compare the version recorded in VERSION_FILE with the current one.

    Returns:
        A `(version_match, version)` tuple. `version` is the stripped
        content of VERSION_FILE, or None when the file is missing or cannot
        be opened/read; in that case `version_match` is False.
    """
    try:
        with open(VERSION_FILE, 'r', encoding='utf-8') as version_file:
            recorded = version_file.read().strip()
    except OSError:
        # Missing or unreadable file: nothing recorded, so no match.
        return False, None
    return recorded == constants.SKYLET_VERSION, recorded
9
90
 
10
91
 
11
92
def restart_skylet():
    """Kill any running skylet, start a new one, and record its state.

    Steps, in order:
      1. SIGKILL every PID reported by `_find_running_skylet_pids()` and
         wait up to 5 seconds for each to exit.
      2. Remove PID_FILE (best effort).
      3. Pick a port via `common_utils.find_free_port`, starting from
         `constants.SKYLET_GRPC_PORT`.
      4. Launch skylet in the background through the shell, appending its
         output to SKYLET_LOG_FILE and writing its PID to PID_FILE.
      5. Write the chosen port to PORT_FILE and the current skylet version
         to VERSION_FILE.

    Raises:
        subprocess.CalledProcessError: If the launch shell command exits
            with a non-zero status.
    """
    # Kills old skylet if it is running.
    # TODO(zhwu): make the killing graceful, e.g., use a signal to tell
    # skylet to exit, instead of directly killing it.
    for stale_pid in _find_running_skylet_pids():
        try:
            os.kill(stale_pid, signal.SIGKILL)
            # Wait until process fully terminates so its socket gets released.
            # Without this, find_free_port may race with the kernel closing the
            # socket and fail to bind to the port that's supposed to be free.
            psutil.Process(stale_pid).wait(timeout=5)
        except (OSError, ProcessLookupError, psutil.NoSuchProcess,
                psutil.TimeoutExpired):
            # Process died between detection and kill, or timeout waiting
            pass

    # Clean up the PID file
    try:
        os.remove(PID_FILE)
    except OSError:
        pass  # Best effort cleanup

    # TODO(kevin): Handle race conditions here. Race conditions can only
    # happen on Slurm, where there could be multiple clusters running in
    # one network namespace. For other clouds, the behaviour will be that
    # it always gets port 46590 (default port).
    port = common_utils.find_free_port(constants.SKYLET_GRPC_PORT)

    # We have made sure that `attempt_skylet.py` is executed with the
    # skypilot runtime env activated, so that skylet can access the cloud
    # CLI tools.
    launch_cmd = (f'nohup {constants.SKY_PYTHON_CMD} -m sky.skylet.skylet '
                  f'--port={port} '
                  f'>> {SKYLET_LOG_FILE} 2>&1 & echo $! > {PID_FILE}')
    subprocess.run(launch_cmd, shell=True, check=True)

    with open(PORT_FILE, 'w', encoding='utf-8') as port_file:
        port_file.write(str(port))

    with open(VERSION_FILE, 'w', encoding='utf-8') as version_file:
        version_file.write(constants.SKYLET_VERSION)
33
135
 
34
136
 
35
- proc = subprocess.run(
36
- 'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | grep " -m"',
37
- shell=True,
38
- check=False)
39
-
40
- running = (proc.returncode == 0)
137
+ # Check if our skylet is running
138
+ running = bool(_find_running_skylet_pids())
41
139
 
42
- version_match = False
43
- found_version = None
44
- if os.path.exists(VERSION_FILE):
45
- with open(VERSION_FILE, 'r', encoding='utf-8') as f:
46
- found_version = f.read().strip()
47
- if found_version == constants.SKYLET_VERSION:
48
- version_match = True
140
+ version_match, found_version = _check_version_match()
49
141
 
50
142
  version_string = (f' (found version {found_version}, new version '
51
143
  f'{constants.SKYLET_VERSION})')
sky/skylet/configs.py CHANGED
@@ -5,6 +5,7 @@ import pathlib
5
5
  import threading
6
6
  from typing import Callable, Optional, Union
7
7
 
8
+ from sky.skylet import runtime_utils
8
9
  from sky.utils.db import db_utils
9
10
 
10
11
  _DB_PATH = None
@@ -29,7 +30,8 @@ def init_db(func: Callable):
29
30
 
30
31
  with _db_init_lock:
31
32
  if _DB_PATH is None:
32
- _DB_PATH = os.path.expanduser('~/.sky/skylet_config.db')
33
+ _DB_PATH = runtime_utils.get_runtime_dir_path(
34
+ '.sky/skylet_config.db')
33
35
  os.makedirs(pathlib.Path(_DB_PATH).parents[0], exist_ok=True)
34
36
  with db_utils.safe_cursor(
35
37
  _DB_PATH