skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/skylet/log_lib.py CHANGED
@@ -8,11 +8,13 @@ import functools
8
8
  import io
9
9
  import multiprocessing.pool
10
10
  import os
11
+ import queue as queue_lib
11
12
  import shlex
12
13
  import subprocess
13
14
  import sys
14
15
  import tempfile
15
16
  import textwrap
17
+ import threading
16
18
  import time
17
19
  from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
18
20
  Tuple, Union)
@@ -39,6 +41,11 @@ logger = sky_logging.init_logger(__name__)
39
41
 
40
42
  LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '
41
43
 
44
+ # 16-64KiB seems to be the sweet spot:
45
+ # https://github.com/grpc/grpc.github.io/issues/371
46
+ # TODO(kevin): Benchmark this ourselves and verify.
47
+ DEFAULT_LOG_CHUNK_SIZE = 16 * 1024 # 16KiB
48
+
42
49
 
43
50
  class _ProcessingArgs:
44
51
  """Arguments for processing logs."""
@@ -165,7 +172,7 @@ def run_with_log(
165
172
  streaming_prefix: Optional[str] = None,
166
173
  log_cmd: bool = False,
167
174
  **kwargs,
168
- ) -> Union[int, Tuple[int, str, str]]:
175
+ ) -> Union[int, Tuple[int, str, str], Tuple[int, int]]:
169
176
  """Runs a command and logs its output to a file.
170
177
 
171
178
  Args:
@@ -176,6 +183,8 @@ def run_with_log(
176
183
  process_stream: Whether to post-process the stdout/stderr of the
177
184
  command, such as replacing or skipping lines on the fly. If
178
185
  enabled, lines are printed only when '\r' or '\n' is found.
186
+ streaming_prefix: Optional prefix for each log line. Can contain {pid}
187
+ placeholder which will be replaced with the subprocess PID.
179
188
 
180
189
  Returns the returncode or returncode, stdout and stderr of the command.
181
190
  Note that the stdout and stderr is already decoded.
@@ -213,7 +222,21 @@ def run_with_log(
213
222
  stdin=stdin,
214
223
  **kwargs) as proc:
215
224
  try:
216
- subprocess_utils.kill_process_daemon(proc.pid)
225
+ if ctx is not None:
226
+ # When runs in coroutine, use kill_pg if available to avoid
227
+ # the overhead of refreshing the process tree in the daemon.
228
+ subprocess_utils.kill_process_daemon(proc.pid, use_kill_pg=True)
229
+ else:
230
+ # For backward compatibility, do not specify use_kill_pg by
231
+ # default.
232
+ subprocess_utils.kill_process_daemon(proc.pid)
233
+
234
+ # Format streaming_prefix with subprocess PID if it contains {pid}
235
+ formatted_streaming_prefix = streaming_prefix
236
+ if streaming_prefix and '{pid}' in streaming_prefix:
237
+ formatted_streaming_prefix = streaming_prefix.format(
238
+ pid=proc.pid)
239
+
217
240
  stdout = ''
218
241
  stderr = ''
219
242
  stdout_stream_handler = None
@@ -242,7 +265,7 @@ def run_with_log(
242
265
  line_processor=line_processor,
243
266
  # Replace CRLF when the output is logged to driver by ray.
244
267
  replace_crlf=with_ray,
245
- streaming_prefix=streaming_prefix,
268
+ streaming_prefix=formatted_streaming_prefix,
246
269
  )
247
270
  stdout_stream_handler = functools.partial(
248
271
  _handle_io_stream,
@@ -264,7 +287,6 @@ def run_with_log(
264
287
  stdout, stderr = context_utils.pipe_and_wait_process(
265
288
  ctx,
266
289
  proc,
267
- cancel_callback=subprocess_utils.kill_children_processes,
268
290
  stdout_stream_handler=stdout_stream_handler,
269
291
  stderr_stream_handler=stderr_stream_handler)
270
292
  elif process_stream:
@@ -336,7 +358,8 @@ def run_bash_command_with_log(bash_command: str,
336
358
  log_path: str,
337
359
  env_vars: Optional[Dict[str, str]] = None,
338
360
  stream_logs: bool = False,
339
- with_ray: bool = False):
361
+ with_ray: bool = False,
362
+ streaming_prefix: Optional[str] = None):
340
363
  with tempfile.NamedTemporaryFile('w', prefix='sky_app_',
341
364
  delete=False) as fp:
342
365
  bash_command = make_task_bash_script(bash_command, env_vars=env_vars)
@@ -351,9 +374,26 @@ def run_bash_command_with_log(bash_command: str,
351
374
  log_path,
352
375
  stream_logs=stream_logs,
353
376
  with_ray=with_ray,
377
+ streaming_prefix=streaming_prefix,
354
378
  shell=True)
355
379
 
356
380
 
381
+ def run_bash_command_with_log_and_return_pid(
382
+ bash_command: str,
383
+ log_path: str,
384
+ env_vars: Optional[Dict[str, str]] = None,
385
+ stream_logs: bool = False,
386
+ with_ray: bool = False,
387
+ streaming_prefix: Optional[str] = None):
388
+ return_code = run_bash_command_with_log(bash_command,
389
+ log_path,
390
+ env_vars,
391
+ stream_logs,
392
+ with_ray,
393
+ streaming_prefix=streaming_prefix)
394
+ return {'return_code': return_code, 'pid': os.getpid()}
395
+
396
+
357
397
  def _follow_job_logs(file,
358
398
  job_id: int,
359
399
  start_streaming: bool,
@@ -395,9 +435,9 @@ def _follow_job_logs(file,
395
435
  wait_last_logs = False
396
436
  continue
397
437
  status_str = status.value if status is not None else 'None'
398
- print(ux_utils.finishing_message(
399
- f'Job finished (status: {status_str}).'),
400
- flush=True)
438
+ finish = ux_utils.finishing_message(
439
+ f'Job finished (status: {status_str}).')
440
+ yield finish + '\n'
401
441
  return
402
442
 
403
443
  time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
@@ -552,3 +592,207 @@ def tail_logs(job_id: Optional[int],
552
592
  except FileNotFoundError:
553
593
  print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
554
594
  f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
595
+
596
+
597
+ def tail_logs_iter(job_id: Optional[int],
598
+ log_dir: Optional[str],
599
+ managed_job_id: Optional[int] = None,
600
+ follow: bool = True,
601
+ tail: int = 0) -> Iterator[str]:
602
+ """Tail the logs of a job. This is mostly the same as tail_logs, but
603
+ returns an iterator instead of printing to stdout/stderr."""
604
+ if job_id is None:
605
+ # This only happens when job_lib.get_latest_job_id() returns None,
606
+ # which means no job has been submitted to this cluster. See
607
+ # sky.skylet.job_lib.JobLibCodeGen.tail_logs for more details.
608
+ logger.info('Skip streaming logs as no job has been submitted.')
609
+ return
610
+ job_str = f'job {job_id}'
611
+ if managed_job_id is not None:
612
+ job_str = f'managed job {managed_job_id}'
613
+ if log_dir is None:
614
+ msg = f'{job_str.capitalize()} not found (see `sky queue`).'
615
+ yield msg + '\n'
616
+ return
617
+ logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
618
+ f'{managed_job_id}.')
619
+ log_path = os.path.join(log_dir, 'run.log')
620
+ log_path = os.path.expanduser(log_path)
621
+
622
+ status = job_lib.update_job_status([job_id], silent=True)[0]
623
+
624
+ # Wait for the log to be written. This is needed due to the `ray submit`
625
+ # will take some time to start the job and write the log.
626
+ retry_cnt = 0
627
+ while status is not None and not status.is_terminal():
628
+ retry_cnt += 1
629
+ if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
630
+ break
631
+ if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
632
+ err = (f'{colorama.Fore.RED}ERROR: Logs for '
633
+ f'{job_str} (status: {status.value}) does not exist '
634
+ f'after retrying {retry_cnt} times.'
635
+ f'{colorama.Style.RESET_ALL}')
636
+ yield err + '\n'
637
+ return
638
+ waiting = (f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
639
+ 'to be written...')
640
+ yield waiting + '\n'
641
+ time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
642
+ status = job_lib.update_job_status([job_id], silent=True)[0]
643
+
644
+ start_stream_at = LOG_FILE_START_STREAMING_AT
645
+ # Explicitly declare the type to avoid mypy warning.
646
+ lines: Iterable[str] = []
647
+ if follow and status in [
648
+ job_lib.JobStatus.SETTING_UP,
649
+ job_lib.JobStatus.PENDING,
650
+ job_lib.JobStatus.RUNNING,
651
+ ]:
652
+ # Not using `ray job logs` because it will put progress bar in
653
+ # multiple lines.
654
+ with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
655
+ # Using `_follow` instead of `tail -f` to streaming the whole
656
+ # log and creating a new process for tail.
657
+ start_streaming = False
658
+ if tail > 0:
659
+ head_lines_of_log_file = _peek_head_lines(log_file)
660
+ lines = collections.deque(log_file, maxlen=tail)
661
+ start_streaming = _should_stream_the_whole_tail_lines(
662
+ head_lines_of_log_file, lines, start_stream_at)
663
+ for line in lines:
664
+ if start_stream_at in line:
665
+ start_streaming = True
666
+ if start_streaming:
667
+ yield line
668
+ # Now, the cursor is at the end of the last lines
669
+ # if tail > 0
670
+ for line in _follow_job_logs(log_file,
671
+ job_id=job_id,
672
+ start_streaming=start_streaming,
673
+ start_streaming_at=start_stream_at):
674
+ yield line
675
+ else:
676
+ try:
677
+ start_streaming = False
678
+ with open(log_path, 'r', encoding='utf-8') as log_file:
679
+ if tail > 0:
680
+ # If tail > 0, we need to read the last n lines.
681
+ # We use double ended queue to rotate the last n lines.
682
+ head_lines_of_log_file = _peek_head_lines(log_file)
683
+ lines = collections.deque(log_file, maxlen=tail)
684
+ start_streaming = _should_stream_the_whole_tail_lines(
685
+ head_lines_of_log_file, lines, start_stream_at)
686
+ else:
687
+ lines = log_file
688
+ for line in lines:
689
+ if start_stream_at in line:
690
+ start_streaming = True
691
+ if start_streaming:
692
+ yield line
693
+ status_str = status.value if status is not None else 'None'
694
+ # Only show "Job finished" for actually terminal states
695
+ if status is not None and status.is_terminal():
696
+ finish = ux_utils.finishing_message(
697
+ f'Job finished (status: {status_str}).')
698
+ yield finish + '\n'
699
+ return
700
+ except FileNotFoundError:
701
+ err = (
702
+ f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
703
+ f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
704
+ yield err + '\n'
705
+
706
+
707
+ class LogBuffer:
708
+ """In-memory buffer for chunking log lines for streaming."""
709
+
710
+ def __init__(self, max_chars: int = DEFAULT_LOG_CHUNK_SIZE):
711
+ """Initialize the log buffer.
712
+
713
+ Args:
714
+ max_chars: Maximum buffer size (in characters, not bytes) before
715
+ flushing. The actual amount of bytes (UTF-8 encoding)
716
+ could be more than this, depending on the characters,
717
+ i.e. ASCII characters take 1 byte, while others
718
+ may take 2-4 bytes. But this is fine as our default
719
+ chunk size is well below the default value of
720
+ grpc.max_receive_message_length which is 4MB.
721
+ """
722
+ self.max_chars = max_chars
723
+ self._buffer = io.StringIO()
724
+
725
+ def _should_flush(self) -> bool:
726
+ return self._buffer.tell() >= self.max_chars
727
+
728
+ def flush(self) -> str:
729
+ """Get the current buffered content and clear the buffer.
730
+
731
+ Returns:
732
+ The buffered log lines as a single string
733
+ """
734
+ if not self._buffer.tell():
735
+ return ''
736
+ chunk = self._buffer.getvalue()
737
+ self._buffer.truncate(0)
738
+ self._buffer.seek(0)
739
+ return chunk
740
+
741
+ def write(self, line: str) -> bool:
742
+ """Add a line to the buffer.
743
+
744
+ Args:
745
+ line: The log line to add
746
+
747
+ Returns:
748
+ True if buffer should be flushed after adding the line
749
+ """
750
+ self._buffer.write(line)
751
+ return self._should_flush()
752
+
753
+ def close(self):
754
+ self._buffer.close()
755
+
756
+
757
+ def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
758
+ timeout: float) -> Iterable[str]:
759
+ """Iterates over an iterable, writing each item to a buffer,
760
+ and flushing the buffer when it is full or no item is
761
+ yielded within the timeout duration."""
762
+ # TODO(kevin): Simplify this using asyncio.timeout, once we move
763
+ # the skylet event loop and gRPC server to asyncio.
764
+ # https://docs.python.org/3/library/asyncio-task.html#timeouts
765
+
766
+ queue: queue_lib.Queue = queue_lib.Queue()
767
+ sentinel = object()
768
+
769
+ def producer():
770
+ try:
771
+ for item in iterable:
772
+ queue.put(item)
773
+ finally:
774
+ queue.put(sentinel)
775
+
776
+ thread = threading.Thread(target=producer, daemon=True)
777
+ thread.start()
778
+
779
+ while True:
780
+ try:
781
+ item = queue.get(timeout=timeout)
782
+ except queue_lib.Empty:
783
+ out = buffer.flush()
784
+ if out:
785
+ yield out
786
+ continue
787
+
788
+ if item is sentinel:
789
+ thread.join()
790
+ out = buffer.flush()
791
+ if out:
792
+ yield out
793
+ return
794
+
795
+ if buffer.write(item):
796
+ out = buffer.flush()
797
+ if out:
798
+ yield out
sky/skylet/log_lib.pyi CHANGED
@@ -4,7 +4,7 @@ overloaded type hints for run_with_log(), as we need to determine
4
4
  the return type based on the value of require_outputs.
5
5
  """
6
6
  import typing
7
- from typing import Dict, List, Optional, Tuple, Union
7
+ from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union
8
8
 
9
9
  from typing_extensions import Literal
10
10
 
@@ -42,7 +42,7 @@ class _ProcessingArgs:
42
42
  ...
43
43
 
44
44
 
45
- def _get_context() -> Optional[context.Context]:
45
+ def _get_context() -> Optional[context.SkyPilotContext]:
46
46
  ...
47
47
 
48
48
 
@@ -68,7 +68,7 @@ def run_with_log(cmd: Union[List[str], str],
68
68
  process_stream: bool = ...,
69
69
  line_processor: Optional[log_utils.LineProcessor] = ...,
70
70
  streaming_prefix: Optional[str] = ...,
71
- ray_job_id: Optional[str] = ...,
71
+ log_cmd: bool = ...,
72
72
  **kwargs) -> int:
73
73
  ...
74
74
 
@@ -87,7 +87,7 @@ def run_with_log(cmd: Union[List[str], str],
87
87
  process_stream: bool = ...,
88
88
  line_processor: Optional[log_utils.LineProcessor] = ...,
89
89
  streaming_prefix: Optional[str] = ...,
90
- ray_job_id: Optional[str] = ...,
90
+ log_cmd: bool = ...,
91
91
  **kwargs) -> Tuple[int, str, str]:
92
92
  ...
93
93
 
@@ -106,8 +106,8 @@ def run_with_log(cmd: Union[List[str], str],
106
106
  process_stream: bool = ...,
107
107
  line_processor: Optional[log_utils.LineProcessor] = ...,
108
108
  streaming_prefix: Optional[str] = ...,
109
- ray_job_id: Optional[str] = ...,
110
- **kwargs) -> Union[int, Tuple[int, str, str]]:
109
+ log_cmd: bool = ...,
110
+ **kwargs) -> Tuple[int, int]:
111
111
  ...
112
112
 
113
113
 
@@ -125,7 +125,18 @@ def run_bash_command_with_log(bash_command: str,
125
125
  log_path: str,
126
126
  env_vars: Optional[Dict[str, str]] = ...,
127
127
  stream_logs: bool = ...,
128
- with_ray: bool = ...):
128
+ with_ray: bool = ...,
129
+ streaming_prefix: Optional[str] = ...) -> int:
130
+ ...
131
+
132
+
133
+ def run_bash_command_with_log_and_return_pid(
134
+ bash_command: str,
135
+ log_path: str,
136
+ env_vars: Optional[Dict[str, str]] = ...,
137
+ stream_logs: bool = ...,
138
+ with_ray: bool = ...,
139
+ streaming_prefix: Optional[str] = ...) -> Dict[str, Union[int, str]]:
129
140
  ...
130
141
 
131
142
 
@@ -134,3 +145,32 @@ def tail_logs(job_id: int,
134
145
  managed_job_id: Optional[int] = ...,
135
146
  follow: bool = ...) -> None:
136
147
  ...
148
+
149
+
150
+ def tail_logs_iter(job_id: Optional[int],
151
+ log_dir: Optional[str],
152
+ managed_job_id: Optional[int] = ...,
153
+ follow: bool = ...,
154
+ tail: int = ...) -> Iterator[str]:
155
+ ...
156
+
157
+
158
+ class LogBuffer:
159
+ max_chars: int
160
+
161
+ def __init__(self, max_chars: int = ...):
162
+ ...
163
+
164
+ def flush(self) -> str:
165
+ ...
166
+
167
+ def write(self, line: str) -> bool:
168
+ ...
169
+
170
+ def close(self):
171
+ ...
172
+
173
+
174
+ def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
175
+ timeout: float) -> Iterable[str]:
176
+ ...
@@ -24,7 +24,7 @@ import socket
24
24
  import threading
25
25
  import time
26
26
  from pathlib import Path
27
- from pprint import pprint
27
+ from pprint import pformat, pprint
28
28
  from typing import Any, Dict, List, Optional
29
29
  from uuid import uuid4
30
30
 
@@ -67,13 +67,13 @@ def log_in_out(func):
67
67
  logger.debug(
68
68
  f"\n\nEnter {name} from {inspect.stack()[0][3]} "
69
69
  f"{inspect.stack()[1][3]} {inspect.stack()[2][3]} with args: "
70
- f"entered with args:\n{pprint(args)} and kwargs {pprint(kwargs)}"
70
+ f"entered with args:\n{pformat(args)} and kwargs {pformat(kwargs)}"
71
71
  )
72
72
  try:
73
73
  result = func(*args, **kwargs)
74
74
  logger.debug(
75
75
  f"Leave {name} from {inspect.stack()[1][3]} with result "
76
- f"Func Result:{pprint(result)}\n\n"
76
+ f"Func Result:{pformat(result)}\n\n"
77
77
  )
78
78
  except Exception:
79
79
  cli_logger.error(f"Error in {name}")
@@ -445,7 +445,7 @@ class IBMVPCNodeProvider(NodeProvider):
445
445
  """returns the worker's node private ip address"""
446
446
  node = self._get_cached_node(node_id)
447
447
 
448
- # if a bug ocurred, or node data was fetched before primary_ip
448
+ # if a bug occurred, or node data was fetched before primary_ip
449
449
  # was assigned, refetch node data from cloud.
450
450
  try:
451
451
  primary_ip = node["network_interfaces"][0].get("primary_ip")["address"]
@@ -502,8 +502,12 @@ class IBMVPCNodeProvider(NodeProvider):
502
502
 
503
503
  logger.info(f"Creating new VM instance {name}")
504
504
 
505
- security_group_identity_model = {"id": self.vpc_tags["security_group_id"]}
506
- subnet_identity_model = {"id": self.vpc_tags["subnet_id"]}
505
+ if self.vpc_tags is None:
506
+ raise ValueError("vpc_tags must be initialized before creating instances")
507
+ vpc_tags = self.vpc_tags # Help mypy with type narrowing
508
+
509
+ security_group_identity_model = {"id": vpc_tags["security_group_id"]}
510
+ subnet_identity_model = {"id": vpc_tags["subnet_id"]}
507
511
  primary_network_interface = {
508
512
  "name": "eth0",
509
513
  "subnet": subnet_identity_model,
@@ -536,7 +540,7 @@ class IBMVPCNodeProvider(NodeProvider):
536
540
  instance_prototype["keys"] = [key_identity_model]
537
541
  instance_prototype["profile"] = {"name": profile_name}
538
542
  instance_prototype["resource_group"] = {"id": self.resource_group_id}
539
- instance_prototype["vpc"] = {"id": self.vpc_tags["vpc_id"]}
543
+ instance_prototype["vpc"] = {"id": vpc_tags["vpc_id"]}
540
544
  instance_prototype["image"] = {"id": base_config["image_id"]}
541
545
 
542
546
  instance_prototype["zone"] = {"name": self.zone}
@@ -584,7 +588,7 @@ class IBMVPCNodeProvider(NodeProvider):
584
588
  floating_ip_name = f"{RAY_RECYCLABLE}-{uuid4().hex[:4]}"
585
589
  # create a new floating ip
586
590
  logger.debug(f"Creating floating IP {floating_ip_name}")
587
- floating_ip_prototype = {}
591
+ floating_ip_prototype: Dict[str, Any] = {}
588
592
  floating_ip_prototype["name"] = floating_ip_name
589
593
  floating_ip_prototype["zone"] = {"name": self.zone}
590
594
  floating_ip_prototype["resource_group"] = {"id": self.resource_group_id}
@@ -10,6 +10,7 @@ import textwrap
10
10
  import time
11
11
  import uuid
12
12
  from concurrent.futures import ThreadPoolExecutor
13
+ from typing import Any, Dict
13
14
 
14
15
  import requests
15
16
 
@@ -173,7 +174,7 @@ class IBMVPCProvider:
173
174
  "a subnet"
174
175
  )
175
176
 
176
- subnet_prototype = {}
177
+ subnet_prototype: Dict[str, Any] = {}
177
178
  subnet_prototype["zone"] = {"name": zone_name}
178
179
  subnet_prototype["ip_version"] = "ipv4"
179
180
  subnet_prototype["name"] = subnet_name
@@ -186,7 +187,7 @@ class IBMVPCProvider:
186
187
 
187
188
  def create_public_gateway(self, vpc_id, zone_name, subnet_data):
188
189
 
189
- gateway_prototype = {}
190
+ gateway_prototype: Dict[str, Any] = {}
190
191
  gateway_prototype["vpc"] = {"id": vpc_id}
191
192
  gateway_prototype["zone"] = {"name": zone_name}
192
193
  gateway_prototype["name"] = f"{subnet_data['name']}-gw"
@@ -345,7 +346,7 @@ class IBMVPCProvider:
345
346
  return True
346
347
  tries -= 1
347
348
  time.sleep(sleep_interval)
348
- logger.error("Failed to delete instance within the alloted time\n")
349
+ logger.error("Failed to delete instance within the allotted time\n")
349
350
  return False
350
351
 
351
352
  for subnet_id in self.get_vpc_subnets(vpc_data, region, field="id"):
@@ -522,7 +523,7 @@ class ClusterCleaner:
522
523
  if e.code == 404:
523
524
  print(("VPC doesn't exist."))
524
525
  return None
525
- else: raise
526
+ else: raise
526
527
 
527
528
  def delete_subnets(vpc_data):
528
529
  def _poll_subnet_exists(subnet_id):
@@ -560,12 +561,12 @@ class ClusterCleaner:
560
561
  deleting_resource = False
561
562
  except ibm_cloud_sdk_core.ApiException as e:
562
563
  if e.code == 404:
563
- print("gateway doesn't exist.")
564
+ print("gateway doesn't exist.")
564
565
  deleting_resource = False
565
566
  if e.code == 409:
566
567
  print("gateway still in use.")
567
- # will retry until cloud functions timeout.
568
- time.sleep(5)
568
+ # will retry until cloud functions timeout.
569
+ time.sleep(5)
569
570
 
570
571
  def delete_vms(vpc_id):
571
572
  def _poll_vpc_contains_vms(vpc_id):
@@ -586,7 +587,7 @@ class ClusterCleaner:
586
587
  )
587
588
 
588
589
  def _del_instance(vm_data):
589
- # first delete ips created by node_provider
590
+ # first delete ips created by node_provider
590
591
  nic_id = vm_data["network_interfaces"][0]["id"]
591
592
  res = ibm_vpc_client.list_instance_network_interface_floating_ips(
592
593
  vm_data["id"], nic_id
@@ -598,7 +599,7 @@ class ClusterCleaner:
598
599
  ibm_vpc_client.delete_floating_ip(ip["id"])
599
600
  print(f"Deleting VM: {vm_data['id']}")
600
601
  ibm_vpc_client.delete_instance(id=vm_data["id"])
601
-
602
+
602
603
  res = ibm_vpc_client.list_instances(vpc_id=vpc_id).get_result()
603
604
  num_instances = res["total_count"]
604
605
 
@@ -619,12 +620,12 @@ class ClusterCleaner:
619
620
  deleting_resource = False
620
621
  except ibm_cloud_sdk_core.ApiException as e:
621
622
  if e.code == 404:
622
- print("VPC doesn't exist.")
623
+ print("VPC doesn't exist.")
623
624
  deleting_resource = False
624
625
  if e.code == 409:
625
626
  print("VPC still in use.")
626
- # will retry until cloud functions timeout.
627
- time.sleep(5)
627
+ # will retry until cloud functions timeout.
628
+ time.sleep(5)
628
629
 
629
630
  def delete_vpc(vpc_id):
630
631
  vpc_data = get_vpc_data(vpc_id)
@@ -0,0 +1,21 @@
1
+ """Runtime utilities for SkyPilot."""
2
+ import os
3
+
4
+ from sky.skylet import constants
5
+
6
+
7
+ def get_runtime_dir_path(path_suffix: str = '') -> str:
8
+ """Get an expanded path within the SkyPilot runtime directory.
9
+
10
+ Args:
11
+ path_suffix: Path suffix to join with the runtime dir
12
+ (e.g., '.sky/jobs.db').
13
+
14
+ Returns:
15
+ The full expanded path.
16
+ """
17
+ runtime_dir = os.path.expanduser(
18
+ os.environ.get(constants.SKY_RUNTIME_DIR_ENV_VAR_KEY, '~'))
19
+ if path_suffix:
20
+ return os.path.join(runtime_dir, path_suffix)
21
+ return runtime_dir