skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,171 @@
1
+ cluster_name: {{ cluster_name_on_cloud }}
2
+
3
+ max_workers: {{ num_nodes - 1 }}
4
+ upscaling_speed: {{ num_nodes - 1 }}
5
+ idle_timeout_minutes: 5
6
+
7
+ {%- if docker_image is not none %}
8
+ docker:
9
+ image: {{docker_image}}
10
+ container_name: {{docker_container_name}}
11
+ run_options:
12
+ - --ulimit nofile=1048576:1048576
13
+ {%- for run_option in docker_run_options %}
14
+ - {{run_option}}
15
+ {%- endfor %}
16
+ {%- if docker_login_config is not none %}
17
+ docker_login_config:
18
+ username: |-
19
+ {{docker_login_config.username}}
20
+ password: |-
21
+ {{docker_login_config.password | indent(6) }}
22
+ server: |-
23
+ {{docker_login_config.server}}
24
+ {%- endif %}
25
+ {%- endif %}
26
+
27
+ provider:
28
+ type: external
29
+ module: sky.provision.seeweb
30
+ region: "{{ region }}"
31
+
32
+ auth:
33
+ ssh_user: ecuser
34
+ ssh_private_key: {{ ssh_private_key }}
35
+
36
+ available_node_types:
37
+ ray_head_default:
38
+ resources: {}
39
+ node_config:
40
+ plan: {{ instance_type }}
41
+ image: {{ image_id }}
42
+ location: {{ region }}
43
+ {% if seeweb_gpu_config is not none %}
44
+ gpu: {{ seeweb_gpu_config.gpu }}
45
+ gpu_label: "{{ seeweb_gpu_config.gpu_label }}"
46
+ {% endif %}
47
+ disk: {{ disk_size }}
48
+ {% if docker_image is not none %}
49
+ user_customize: |
50
+ #!/bin/bash
51
+ # Auto-generated Docker installation script for Seeweb
52
+ LOG_FILE=/var/log/user_customize.log
53
+ sudo mkdir -p "$(dirname "$LOG_FILE")"
54
+ {
55
+ echo "[$(date -Is)] Cloud script: start"
56
+ sudo apt-get update
57
+ sudo apt-get install -y \
58
+ apt-transport-https \
59
+ ca-certificates \
60
+ curl \
61
+ gnupg-agent \
62
+ lsb-release \
63
+ software-properties-common
64
+ sudo mkdir -p /usr/share/keyrings
65
+ curl -fsSL https://download.docker.com/linux/ubuntu/gpg | \
66
+ sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
67
+ UBU_CODENAME="$(. /etc/os-release && echo "$VERSION_CODENAME")"
68
+ echo "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu ${UBU_CODENAME} stable" | \
69
+ sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
70
+ sudo apt-get update
71
+ sudo apt-get install -y docker-ce docker-ce-cli containerd.io
72
+ echo "[$(date -Is)] Cloud script: docker installed"
73
+ sudo usermod -aG docker ecuser || true
74
+ sudo systemctl enable docker || true
75
+ sudo systemctl start docker || true
76
+ command -v docker && docker --version || echo "[$(date -Is)] docker still missing"
77
+ echo "[$(date -Is)] Cloud script: complete"
78
+ } | sudo tee -a "$LOG_FILE"
79
+ sudo touch /var/log/docker_install_done
80
+ {% endif %}
81
+
82
+ head_node_type: ray_head_default
83
+
84
+ # Format: `REMOTE_PATH : LOCAL_PATH`
85
+ file_mounts: {
86
+ "~/.seeweb_cloud/seeweb_keys": "~/.seeweb_cloud/seeweb_keys",
87
+ "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
88
+ "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
89
+ {%- for remote_path, local_path in credentials.items() %}
90
+ "{{remote_path}}": "{{local_path}}",
91
+ {%- endfor %}
92
+ "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
93
+ }
94
+
95
+ rsync_exclude: []
96
+
97
+ setup_commands:
98
+ - |
99
+ {%- for initial_setup_command in initial_setup_commands %}
100
+ {{ initial_setup_command }}
101
+ {%- endfor %}
102
+ touch ~/.bashrc;
103
+ echo "127.0.0.1 $(hostname)" | sudo tee -a /etc/hosts || true;
104
+ echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts || true;
105
+ sudo systemctl stop unattended-upgrades || true;
106
+ sudo systemctl disable unattended-upgrades || true;
107
+ sudo apt update && sudo apt install -y patch || sudo yum install -y patch || true;
108
+
109
+ {%- if docker_image is not none %}
110
+ # Docker installed via cloud-init; ensure service will be started by cloud-init
111
+ {%- endif %}
112
+
113
+ {{ conda_installation_commands }}
114
+ {{ ray_skypilot_installation_commands }}
115
+ {{ copy_skypilot_templates_commands }}
116
+
117
+ head_start_ray_commands:
118
+ - |
119
+ retry_ray() {
120
+ local n=0; local max=30
121
+ until [ $n -ge $max ]; do
122
+ export SKYPILOT_NUM_GPUS=0
123
+ command -v nvidia-smi >/dev/null 2>&1 && \
124
+ SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
125
+
126
+ ray stop || true
127
+ RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
128
+ ray start --disable-usage-stats --head \
129
+ --port={{ ray_port }} --dashboard-port={{ ray_dashboard_port }} \
130
+ --object-manager-port=8076 \
131
+ --autoscaling-config=~/ray_bootstrap_config.yaml \
132
+ --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break
133
+
134
+ echo "[head] Ray failed to start ($((++n))/$max), retrying in 5s..."
135
+ sleep 5
136
+ done
137
+ [ $n -eq $max ] && { echo "Ray head failed"; exit 1; }
138
+ }
139
+ retry_ray
140
+
141
+ worker_start_ray_commands:
142
+ - |
143
+ retry_ray() {
144
+ local n=0; local max=30
145
+ until [ $n -ge $max ]; do
146
+ SKYPILOT_NUM_GPUS=0
147
+ command -v nvidia-smi >/dev/null 2>&1 && \
148
+ SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
149
+
150
+ ray stop || true
151
+ RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
152
+ ray start --disable-usage-stats \
153
+ --address=$RAY_HEAD_IP:{{ ray_port }} \
154
+ --object-manager-port=8076 \
155
+ --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break
156
+
157
+ echo "[worker] Ray failed to start ($((++n))/$max), retrying in 5s..."
158
+ sleep 5
159
+ done
160
+ [ $n -eq $max ] && { echo "Ray worker failed"; exit 1; }
161
+ }
162
+ retry_ray
163
+
164
+ head_node: {}
165
+ worker_nodes: {}
166
+
167
+ head_setup_commands: []
168
+ worker_setup_commands: []
169
+
170
+ cluster_synced_files: []
171
+ file_mounts_sync_continuously: False
@@ -0,0 +1,73 @@
1
+ cluster_name: {{cluster_name_on_cloud}}
2
+
3
+ # The maximum number of worker nodes to launch in addition to the head node.
4
+ max_workers: {{num_nodes - 1}}
5
+ upscaling_speed: {{num_nodes - 1}}
6
+ idle_timeout_minutes: 60
7
+
8
+ provider:
9
+ type: external
10
+ module: sky.provision.shadeform
11
+ region: "{{region}}"
12
+ disable_launch_config_check: true
13
+
14
+ auth:
15
+ ssh_user: shadeform
16
+ ssh_private_key: {{ssh_private_key}}
17
+ ssh_key_id: {{ssh_key_id}}
18
+
19
+ available_node_types:
20
+ ray_head_default:
21
+ {%- if custom_resources %}
22
+ resources: {{custom_resources}}
23
+ {%- else %}
24
+ resources: {}
25
+ {%- endif %}
26
+ node_config:
27
+ InstanceType: {{instance_type}}
28
+ PublicKey: |-
29
+ skypilot:ssh_public_key_content
30
+
31
+ head_node_type: ray_head_default
32
+
33
+ # Format: `REMOTE_PATH : LOCAL_PATH`
34
+ file_mounts: {
35
+ "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
36
+ "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
37
+ {%- for remote_path, local_path in credentials.items() %}
38
+ "{{remote_path}}": "{{local_path}}",
39
+ {%- endfor %}
40
+ }
41
+
42
+ rsync_exclude: []
43
+
44
+ initialization_commands: []
45
+
46
+ # List of shell commands to run to set up nodes.
47
+ # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
48
+ # connection, which is expensive. Try your best to co-locate commands into fewer
49
+ # items!
50
+ #
51
+ # Increment the following to make performance bugs easier to catch:
52
+ # current num items (num SSH connections): 1
53
+ setup_commands:
54
+ # Create ~/.ssh/config file in case the file does not exist in the image.
55
+ # Line 'rm ..': there is another installation of pip.
56
+ # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
57
+ # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
58
+ # Line 'mkdir -p ..': disable host key check
59
+ # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
60
+ - {%- for initial_setup_command in initial_setup_commands %}
61
+ {{ initial_setup_command }}
62
+ {%- endfor %}
63
+ mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
64
+ {{ conda_installation_commands }}
65
+ {{ ray_skypilot_installation_commands }}
66
+ {{ copy_skypilot_templates_commands }}
67
+ sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
68
+ sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
69
+ (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
70
+ {{ ssh_max_sessions_config }}
71
+
72
+ # Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
73
+ # We do not need to list it here anymore.
@@ -0,0 +1,85 @@
1
+ cluster_name: {{cluster_name_on_cloud}}
2
+
3
+ # The maximum number of worker nodes to launch in addition to the head node.
4
+ max_workers: {{num_nodes - 1}}
5
+ upscaling_speed: {{num_nodes - 1}}
6
+ idle_timeout_minutes: 60
7
+
8
+ provider:
9
+ type: external
10
+ module: sky.provision.slurm
11
+
12
+ cluster: {{slurm_cluster}}
13
+ partition: {{slurm_partition}}
14
+
15
+ ssh:
16
+ hostname: {{ssh_hostname}}
17
+ port: {{ssh_port}}
18
+ user: {{ssh_user}}
19
+ private_key: {{slurm_private_key}}
20
+ {% if slurm_proxy_command is not none %}
21
+ proxycommand: {{slurm_proxy_command | tojson }}
22
+ {% endif %}
23
+
24
+ auth:
25
+ ssh_user: {{ssh_user}}
26
+ # TODO(jwj): Modify this tmp workaround.
27
+ # ssh_private_key: {{ssh_private_key}}
28
+ ssh_private_key: {{slurm_private_key}}
29
+ ssh_proxy_command: {{slurm_proxy_command | tojson }}
30
+
31
+ available_node_types:
32
+ ray_head_default:
33
+ resources: {}
34
+ node_config:
35
+ # From clouds/slurm.py::Slurm.make_deploy_resources_variables.
36
+ instance_type: {{instance_type}}
37
+ disk_size: {{disk_size}}
38
+ cpus: {{cpus}}
39
+ memory: {{memory}}
40
+ accelerator_type: {{accelerator_type}}
41
+ accelerator_count: {{accelerator_count}}
42
+
43
+ # TODO: more configs that are required by the provisioner to create new
44
+ # instances on the FluffyCloud:
45
+ # sky/provision/fluffycloud/instance.py::run_instances
46
+
47
+ head_node_type: ray_head_default
48
+
49
+ # Format: `REMOTE_PATH : LOCAL_PATH`
50
+ file_mounts: {
51
+ "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
52
+ "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
53
+ {%- for remote_path, local_path in credentials.items() %}
54
+ "{{remote_path}}": "{{local_path}}",
55
+ {%- endfor %}
56
+ }
57
+
58
+ rsync_exclude: []
59
+
60
+ initialization_commands: []
61
+
62
+ # List of shell commands to run to set up nodes.
63
+ # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
64
+ # connection, which is expensive. Try your best to co-locate commands into fewer
65
+ # items!
66
+ #
67
+ # Increment the following for catching performance bugs easier:
68
+ # current num items (num SSH connections): 1
69
+ setup_commands:
70
+ - {%- for initial_setup_command in initial_setup_commands %}
71
+ {{ initial_setup_command }}
72
+ {%- endfor %}
73
+ {{ setup_sky_dirs_commands }}
74
+ {{ conda_installation_commands }}
75
+ {{ skypilot_wheel_installation_commands }}
76
+ {{ copy_skypilot_templates_commands }}
77
+
78
+ head_node: {}
79
+ worker_nodes: {}
80
+
81
+ # These fields are required for external cloud providers.
82
+ head_setup_commands: []
83
+ worker_setup_commands: []
84
+ cluster_synced_files: []
85
+ file_mounts_sync_continuously: False
@@ -10,6 +10,7 @@ provider:
10
10
  module: sky.provision.vast
11
11
  region: "{{region}}"
12
12
  disable_launch_config_check: true
13
+ secure_only: {{secure_only}}
13
14
 
14
15
  auth:
15
16
  ssh_user: root
@@ -61,6 +62,7 @@ setup_commands:
61
62
  mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
62
63
  {{ conda_installation_commands }}
63
64
  {{ ray_skypilot_installation_commands }}
65
+ {{ copy_skypilot_templates_commands }}
64
66
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
65
67
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
66
68
  (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
@@ -67,6 +67,7 @@ setup_commands:
67
67
  pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc);
68
68
  {{ conda_installation_commands }}
69
69
  {{ ray_skypilot_installation_commands }}
70
+ {{ copy_skypilot_templates_commands }}
70
71
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
71
72
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
72
73
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
@@ -11,15 +11,25 @@ This script is useful for users who do not have local Kubernetes credentials.
11
11
  import asyncio
12
12
  from http.cookiejar import MozillaCookieJar
13
13
  import os
14
+ import struct
14
15
  import sys
15
- from typing import Dict
16
+ import time
17
+ from typing import Dict, Optional
16
18
  from urllib.request import Request
17
19
 
20
+ import requests
18
21
  import websockets
19
22
  from websockets.asyncio.client import ClientConnection
20
23
  from websockets.asyncio.client import connect
21
24
 
25
+ from sky import exceptions
26
+ from sky.client import service_account_auth
27
+ from sky.server import constants
28
+ from sky.server.server import KubernetesSSHMessageType
29
+ from sky.skylet import constants as skylet_constants
30
+
22
31
  BUFFER_SIZE = 2**16 # 64KB
32
+ HEARTBEAT_INTERVAL_SECONDS = 10
23
33
 
24
34
  # Environment variable for a file path to the API cookie file.
25
35
  # Keep in sync with server/constants.py
@@ -28,6 +38,8 @@ API_COOKIE_FILE_ENV_VAR = 'SKYPILOT_API_COOKIE_FILE'
28
38
  # Keep in sync with server/constants.py
29
39
  API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'
30
40
 
41
+ MAX_UNANSWERED_PINGS = 100
42
+
31
43
 
32
44
  def _get_cookie_header(url: str) -> Dict[str, str]:
33
45
  """Extract Cookie header value from a cookie jar for a specific URL"""
@@ -49,85 +61,218 @@ def _get_cookie_header(url: str) -> Dict[str, str]:
49
61
  return {'Cookie': cookie_header}
50
62
 
51
63
 
52
- async def main(url: str) -> None:
53
- cookie_header = _get_cookie_header(url)
54
- async with connect(url,
55
- ping_interval=None,
56
- additional_headers=cookie_header) as websocket:
57
- if os.isatty(sys.stdin.fileno()):
58
- # pylint: disable=import-outside-toplevel
59
- import termios
60
- import tty
61
- old_settings = termios.tcgetattr(sys.stdin.fileno())
62
- tty.setraw(sys.stdin.fileno())
64
+ async def main(url: str, timestamps_supported: bool, login_url: str) -> None:
65
+ headers = {}
66
+ headers.update(_get_cookie_header(url))
67
+ headers.update(service_account_auth.get_service_account_headers())
68
+ try:
69
+ async with connect(url, ping_interval=None,
70
+ additional_headers=headers) as websocket:
71
+ await run_websocket_proxy(websocket, timestamps_supported)
72
+ except websockets.exceptions.InvalidStatus as e:
73
+ if e.response.status_code == 403:
74
+ print(str(exceptions.ApiServerAuthenticationError(login_url)),
75
+ file=sys.stderr)
63
76
  else:
64
- old_settings = None
77
+ print(f'Error ssh into cluster: {e}', file=sys.stderr)
78
+ sys.exit(1)
79
+
80
+
81
+ async def run_websocket_proxy(websocket: ClientConnection,
82
+ timestamps_supported: bool) -> None:
83
+ if os.isatty(sys.stdin.fileno()):
84
+ # pylint: disable=import-outside-toplevel
85
+ import termios
86
+ import tty
87
+ old_settings = termios.tcgetattr(sys.stdin.fileno())
88
+ tty.setraw(sys.stdin.fileno())
89
+ else:
90
+ old_settings = None
91
+
92
+ try:
93
+ loop = asyncio.get_running_loop()
94
+ # Use asyncio.Stream primitives to wrap stdin and stdout, this is to
95
+ # avoid creating a new thread for each read/write operation
96
+ # excessively.
97
+ stdin_reader = asyncio.StreamReader()
98
+ protocol = asyncio.StreamReaderProtocol(stdin_reader)
99
+ await loop.connect_read_pipe(lambda: protocol, sys.stdin)
100
+ transport, protocol = await loop.connect_write_pipe(
101
+ asyncio.streams.FlowControlMixin, sys.stdout) # type: ignore
102
+ stdout_writer = asyncio.StreamWriter(transport, protocol, None, loop)
103
+ # Dictionary to store last ping time for latency measurement
104
+ last_ping_time_dict: Optional[Dict[int, float]] = None
105
+ if timestamps_supported:
106
+ last_ping_time_dict = {}
107
+
108
+ # Use an Event to signal when websocket is closed
109
+ websocket_closed_event = asyncio.Event()
110
+ websocket_lock = asyncio.Lock()
111
+
112
+ await asyncio.gather(
113
+ stdin_to_websocket(stdin_reader, websocket, timestamps_supported,
114
+ websocket_closed_event, websocket_lock),
115
+ websocket_to_stdout(websocket, stdout_writer, timestamps_supported,
116
+ last_ping_time_dict, websocket_closed_event,
117
+ websocket_lock),
118
+ latency_monitor(websocket, last_ping_time_dict,
119
+ websocket_closed_event, websocket_lock),
120
+ return_exceptions=True)
121
+ finally:
122
+ if old_settings:
123
+ termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN,
124
+ old_settings)
125
+
65
126
 
127
+ async def latency_monitor(websocket: ClientConnection,
128
+ last_ping_time_dict: Optional[dict],
129
+ websocket_closed_event: asyncio.Event,
130
+ websocket_lock: asyncio.Lock):
131
+ """Periodically send PING messages (type 1) to measure latency."""
132
+ if last_ping_time_dict is None:
133
+ return
134
+ next_id = 0
135
+ while not websocket_closed_event.is_set():
66
136
  try:
67
- loop = asyncio.get_running_loop()
68
- # Use asyncio.Stream primitives to wrap stdin and stdout, this is to
69
- # avoid creating a new thread for each read/write operation
70
- # excessively.
71
- stdin_reader = asyncio.StreamReader()
72
- protocol = asyncio.StreamReaderProtocol(stdin_reader)
73
- await loop.connect_read_pipe(lambda: protocol, sys.stdin)
74
- transport, protocol = await loop.connect_write_pipe(
75
- asyncio.streams.FlowControlMixin, sys.stdout) # type: ignore
76
- stdout_writer = asyncio.StreamWriter(transport, protocol, None,
77
- loop)
78
-
79
- await asyncio.gather(stdin_to_websocket(stdin_reader, websocket),
80
- websocket_to_stdout(websocket, stdout_writer))
81
- finally:
82
- if old_settings:
83
- termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN,
84
- old_settings)
137
+ await asyncio.sleep(HEARTBEAT_INTERVAL_SECONDS)
138
+ if len(last_ping_time_dict) >= MAX_UNANSWERED_PINGS:
139
+ # We are not getting responses, clear the dictionary so
140
+ # as not to grow unbounded.
141
+ last_ping_time_dict.clear()
142
+ ping_time = time.time()
143
+ next_id += 1
144
+ last_ping_time_dict[next_id] = ping_time
145
+ message_header_bytes = struct.pack(
146
+ '!BI', KubernetesSSHMessageType.PINGPONG.value, next_id)
147
+ try:
148
+ async with websocket_lock:
149
+ await websocket.send(message_header_bytes)
150
+ except websockets.exceptions.ConnectionClosed as e:
151
+ # Websocket is already closed.
152
+ print(f'Failed to send PING message: {e}', file=sys.stderr)
153
+ break
154
+ except Exception as e:
155
+ print(f'Error in latency_monitor: {e}', file=sys.stderr)
156
+ websocket_closed_event.set()
157
+ raise e
85
158
 
86
159
 
87
160
  async def stdin_to_websocket(reader: asyncio.StreamReader,
88
- websocket: ClientConnection):
161
+ websocket: ClientConnection,
162
+ timestamps_supported: bool,
163
+ websocket_closed_event: asyncio.Event,
164
+ websocket_lock: asyncio.Lock):
89
165
  try:
90
- while True:
166
+ while not websocket_closed_event.is_set():
91
167
  # Read at most BUFFER_SIZE bytes, this does not affect
92
168
  # responsiveness since it will return as soon as
93
169
  # there is at least one byte.
94
170
  # The BUFFER_SIZE is chosen to be large enough to improve
95
171
  # throughput.
96
172
  data = await reader.read(BUFFER_SIZE)
173
+
97
174
  if not data:
98
175
  break
99
- await websocket.send(data)
176
+ if timestamps_supported:
177
+ # Send message with type 0 to indicate data.
178
+ message_type_bytes = struct.pack(
179
+ '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
180
+ data = message_type_bytes + data
181
+ async with websocket_lock:
182
+ await websocket.send(data)
183
+
100
184
  except Exception as e: # pylint: disable=broad-except
101
185
  print(f'Error in stdin_to_websocket: {e}', file=sys.stderr)
102
186
  finally:
103
- await websocket.close()
187
+ async with websocket_lock:
188
+ await websocket.close()
189
+ websocket_closed_event.set()
104
190
 
105
191
 
106
192
  async def websocket_to_stdout(websocket: ClientConnection,
107
- writer: asyncio.StreamWriter):
193
+ writer: asyncio.StreamWriter,
194
+ timestamps_supported: bool,
195
+ last_ping_time_dict: Optional[dict],
196
+ websocket_closed_event: asyncio.Event,
197
+ websocket_lock: asyncio.Lock):
108
198
  try:
109
- while True:
199
+ while not websocket_closed_event.is_set():
110
200
  message = await websocket.recv()
201
+ if (timestamps_supported and len(message) > 0 and
202
+ last_ping_time_dict is not None):
203
+ message_type = struct.unpack('!B', message[:1])[0]
204
+ if message_type == KubernetesSSHMessageType.REGULAR_DATA.value:
205
+ # Regular data - strip type byte and write to stdout
206
+ message = message[1:]
207
+ elif message_type == KubernetesSSHMessageType.PINGPONG.value:
208
+ # PONG response - calculate latency and send measurement
209
+ if not len(message) == struct.calcsize('!BI'):
210
+ raise ValueError(
211
+ f'Invalid PONG message length: {len(message)}')
212
+ pong_id = struct.unpack('!I', message[1:5])[0]
213
+ pong_time = time.time()
214
+
215
+ ping_time = last_ping_time_dict.pop(pong_id, None)
216
+
217
+ if ping_time is None:
218
+ continue
219
+
220
+ latency_seconds = pong_time - ping_time
221
+ latency_ms = int(latency_seconds * 1000)
222
+
223
+ # Send latency measurement (type 2)
224
+ message_type_bytes = struct.pack(
225
+ '!B',
226
+ KubernetesSSHMessageType.LATENCY_MEASUREMENT.value)
227
+ latency_bytes = struct.pack('!Q', latency_ms)
228
+ message = message_type_bytes + latency_bytes
229
+ # Send to server.
230
+ async with websocket_lock:
231
+ await websocket.send(message)
232
+ continue
233
+ # No timestamps support, write directly
111
234
  writer.write(message)
112
235
  await writer.drain()
113
236
  except websockets.exceptions.ConnectionClosed:
114
237
  print('WebSocket connection closed', file=sys.stderr)
115
238
  except Exception as e: # pylint: disable=broad-except
116
239
  print(f'Error in websocket_to_stdout: {e}', file=sys.stderr)
240
+ raise e
241
+ finally:
242
+ async with websocket_lock:
243
+ await websocket.close()
244
+ websocket_closed_event.set()
117
245
 
118
246
 
119
247
  if __name__ == '__main__':
120
248
  server_url = sys.argv[1].strip('/')
121
- if '://' not in server_url:
122
- # Keep backward compatibility for legacy server URLs without protocol
123
- # TODO(aylei): Remove this after 0.10.0
124
- server_url = f'http://{server_url}'
125
249
 
250
+ disable_latency_measurement = os.environ.get(
251
+ skylet_constants.SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR, '0') == '1'
252
+ if disable_latency_measurement:
253
+ timestamps_are_supported = False
254
+ else:
255
+ # TODO(aylei): remove the separate /api/health call and use the header
256
+ # during websocket handshake to determine the server version.
257
+ health_url = f'{server_url}/api/health'
258
+ cookie_hdr = _get_cookie_header(health_url)
259
+ health_response = requests.get(health_url, headers=cookie_hdr)
260
+ health_data = health_response.json()
261
+ timestamps_are_supported = int(health_data.get('api_version', 0)) > 21
262
+
263
+ # Capture the original API server URL for login hint if authentication
264
+ # is required.
265
+ _login_url = server_url
126
266
  server_proto, server_fqdn = server_url.split('://')
127
267
  websocket_proto = 'ws'
128
268
  if server_proto == 'https':
129
269
  websocket_proto = 'wss'
130
270
  server_url = f'{websocket_proto}://{server_fqdn}'
271
+
272
+ client_version_str = (f'&client_version={constants.API_VERSION}'
273
+ if timestamps_are_supported else '')
274
+
131
275
  websocket_url = (f'{server_url}/kubernetes-pod-ssh-proxy'
132
- f'?cluster_name={sys.argv[2]}')
133
- asyncio.run(main(websocket_url))
276
+ f'?cluster_name={sys.argv[2]}'
277
+ f'{client_version_str}')
278
+ asyncio.run(main(websocket_url, timestamps_are_supported, _login_url))