skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,400 @@
1
+ """ Shadeform Cloud. """
2
+
3
+ import json
4
+ import os
5
+ import typing
6
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
7
+
8
+ from sky import catalog
9
+ from sky import clouds
10
+ from sky.adaptors import common as adaptors_common
11
+ from sky.catalog import shadeform_catalog
12
+ from sky.utils import registry
13
+ from sky.utils import resources_utils
14
+ from sky.utils import status_lib
15
+
16
+ if typing.TYPE_CHECKING:
17
+ from sky import resources as resources_lib
18
+ from sky.utils import volume as volume_lib
19
+ else:
20
+ requests = adaptors_common.LazyImport('requests')
21
+
22
+ # Minimum set of files under ~/.shadeform that grant Shadeform access.
23
+ _CREDENTIAL_FILES = [
24
+ 'api_key',
25
+ ]
26
+
27
+
28
+ @registry.CLOUD_REGISTRY.register
29
+ class Shadeform(clouds.Cloud):
30
+ """Shadeform GPU Cloud
31
+
32
+ Shadeform is a unified API for deploying and managing cloud GPUs across
33
+ multiple cloud providers.
34
+ """
35
+
36
+ # Shadeform doesn't have explicit cluster name limits, so use a conservative one.
37
+ _MAX_CLUSTER_NAME_LEN_LIMIT = 120
38
+
39
+ # Features not currently supported by Shadeform
40
+ # yapf: disable
41
+ _CLOUD_UNSUPPORTED_FEATURES = {
42
+ clouds.CloudImplementationFeatures.STOP:
43
+ 'Stopping instances not supported on Shadeform.',
44
+ clouds.CloudImplementationFeatures.MULTI_NODE:
45
+ 'Multi-node clusters not supported on Shadeform.',
46
+ clouds.CloudImplementationFeatures.SPOT_INSTANCE:
47
+ 'Spot instances not supported on Shadeform.',
48
+ clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
49
+ 'Custom disk tiers not supported on Shadeform.',
50
+ clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
51
+ 'Custom network tiers not supported on Shadeform.',
52
+ clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
53
+ 'Object storage mounting not supported on Shadeform.',
54
+ clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
55
+ 'Host controllers not supported on Shadeform.',
56
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
57
+ 'High availability controllers not supported.',
58
+ clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
59
+ 'Disk cloning not supported on Shadeform.',
60
+ clouds.CloudImplementationFeatures.IMAGE_ID:
61
+ 'Custom image IDs not supported on Shadeform.',
62
+ clouds.CloudImplementationFeatures.DOCKER_IMAGE:
63
+ 'Docker images not supported on Shadeform yet.',
64
+ clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
65
+ 'Custom multiple network interfaces not supported.',
66
+ }
67
+ # yapf: enable
68
+
69
+ _regions: List[clouds.Region] = []
70
+
71
+ PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
72
+ STATUS_VERSION = clouds.StatusVersion.SKYPILOT
73
+ OPEN_PORTS_VERSION = clouds.OpenPortsVersion.LAUNCH_ONLY
74
+
75
@classmethod
def _unsupported_features_for_resources(
    cls,
    resources: 'resources_lib.Resources',
    region: Optional[str] = None,
) -> Dict[clouds.CloudImplementationFeatures, str]:
    """Return the feature -> reason mapping of unsupported features.

    The class-level ``_CLOUD_UNSUPPORTED_FEATURES`` mapping is returned as-is;
    neither ``resources`` nor ``region`` is consulted.
    """
    unsupported = cls._CLOUD_UNSUPPORTED_FEATURES
    del resources  # unused
    return unsupported
84
+
85
+ @classmethod
86
+ def _max_cluster_name_length(cls) -> Optional[int]:
87
+ return cls._MAX_CLUSTER_NAME_LEN_LIMIT
88
+
89
@classmethod
def regions_with_offering(
    cls,
    instance_type: str,
    accelerators: Optional[Dict[str, int]],
    use_spot: bool,
    region: Optional[str],
    zone: Optional[str],
    resources: Optional['resources_lib.Resources'] = None,
) -> List[clouds.Region]:
    """List the regions in which ``instance_type`` is offered.

    ``zone`` must be None (Shadeform does not support zones). A spot request
    always yields an empty list. When ``region`` is given, only the region
    with that exact name is kept.
    """
    assert zone is None, 'Shadeform does not support zones.'
    del zone  # unused
    if use_spot:
        # No spot support
        return []

    # NOTE: instance_type is the specific Shadeform instance type
    # (like 'massedcompute_A6000_base'), not an accelerator name, so only
    # regions carrying this exact instance type are looked up.
    offered = shadeform_catalog.get_region_zones_for_instance_type(
        instance_type, use_spot)

    if region is None:
        return offered
    return [candidate for candidate in offered if candidate.name == region]
114
+
115
@classmethod
def zones_provision_loop(
    cls,
    *,
    region: str,
    num_nodes: int,
    instance_type: str,
    accelerators: Optional[Dict[str, int]] = None,
    use_spot: bool = False,
) -> Iterator[None]:
    """Yield one provisioning attempt per region offering ``instance_type``.

    Every yielded value is the region's ``zones`` attribute, which is
    asserted to be None. Nothing is yielded for spot requests.
    """
    del num_nodes  # unused
    if not use_spot:
        for offered_region in cls.regions_with_offering(
                instance_type, accelerators, use_spot, region, None):
            assert offered_region.zones is None, offered_region
            yield offered_region.zones
135
+
136
@classmethod
def get_vcpus_mem_from_instance_type(
    cls,
    instance_type: str,
) -> Tuple[Optional[float], Optional[float]]:
    """Look up vCPUs and memory for ``instance_type`` in the catalog.

    Delegates to ``catalog.get_vcpus_mem_from_instance_type`` with
    ``clouds='shadeform'`` and returns its result unchanged.
    """
    vcpus_and_memory = catalog.get_vcpus_mem_from_instance_type(
        instance_type, clouds='shadeform')
    return vcpus_and_memory
144
+
145
@classmethod
def get_accelerators_from_instance_type(
    cls,
    instance_type: str,
) -> Optional[Dict[str, Union[int, float]]]:
    """Look up the accelerators of ``instance_type`` in the catalog.

    Delegates to ``catalog.get_accelerators_from_instance_type`` with
    ``clouds='shadeform'`` and returns its result unchanged.
    """
    accelerator_info = catalog.get_accelerators_from_instance_type(
        instance_type, clouds='shadeform')
    return accelerator_info
153
+
154
@classmethod
def get_default_instance_type(
    cls,
    cpus: Optional[str] = None,
    memory: Optional[str] = None,
    disk_tier: Optional[resources_utils.DiskTier] = None,
    region: Optional[str] = None,
    zone: Optional[str] = None,
) -> Optional[str]:
    """Ask the catalog for the default Shadeform instance type.

    ``disk_tier`` is discarded and ``None`` is always forwarded in its place;
    the remaining filters are passed through unchanged.
    """
    del disk_tier  # Not supported
    lookup_filters = {
        'cpus': cpus,
        'memory': memory,
        'disk_tier': None,
        'region': region,
        'zone': zone,
        'clouds': 'shadeform',
    }
    return catalog.get_default_instance_type(**lookup_filters)
171
+
172
@classmethod
def get_zone_shell_cmd(cls) -> Optional[str]:
    """Return the shell command that reports the instance's zone.

    Always None for Shadeform.
    """
    zone_command: Optional[str] = None
    return zone_command
176
+
177
@classmethod
def get_user_identities(cls) -> Optional[List[List[str]]]:
    """Return the user identities for Shadeform.

    Always None: no user identity support is needed.
    """
    identities: Optional[List[List[str]]] = None
    return identities
182
+
183
def instance_type_exists(self, instance_type: str) -> bool:
    """Return the catalog's answer on whether ``instance_type`` exists."""
    found = catalog.instance_type_exists(instance_type, 'shadeform')
    return found
185
+
186
def instance_type_to_hourly_cost(self,
                                 instance_type: str,
                                 use_spot: bool,
                                 region: Optional[str] = None,
                                 zone: Optional[str] = None) -> float:
    """Return the catalog's hourly cost for ``instance_type``.

    Raises:
        ValueError: if ``use_spot`` is set; spot instances are not
            supported on Shadeform.
    """
    if not use_spot:
        return catalog.get_hourly_cost(instance_type,
                                       use_spot=use_spot,
                                       region=region,
                                       zone=zone,
                                       clouds='shadeform')
    raise ValueError('Spot instances are not supported on Shadeform')
199
+
200
def accelerators_to_hourly_cost(self,
                                accelerators: Dict[str, int],
                                use_spot: bool,
                                region: Optional[str] = None,
                                zone: Optional[str] = None) -> float:
    """Return the hourly cost attributed to accelerators.

    Always 0.0; none of the arguments are consulted.
    """
    del accelerators, use_spot, region, zone  # unused
    accelerator_cost = 0.0
    return accelerator_cost
207
+
208
def get_egress_cost(self, num_gigabytes: float) -> float:
    """Return the egress cost for ``num_gigabytes``.

    Always 0.0: there is no explicit egress pricing from the Shadeform API.
    """
    del num_gigabytes  # unused
    egress_cost = 0.0
    return egress_cost
212
+
213
+ def __repr__(self):
214
+ return 'Shadeform'
215
+
216
@classmethod
def get_current_user_identity(cls) -> Optional[str]:
    """Return the current user identity; always None for Shadeform."""
    identity: Optional[str] = None
    return identity
220
+
221
def make_deploy_resources_variables(
    self,
    resources: 'resources_lib.Resources',
    cluster_name: resources_utils.ClusterName,
    region: 'clouds.Region',
    zones: Optional[List['clouds.Zone']],
    num_nodes: int,
    dryrun: bool = False,
    volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
) -> Dict[str, Any]:
    """Build the variables consumed by the deployment template.

    The instance type is taken from the first feasible launchable resource
    computed for ``resources`` with its accelerators cleared. When it is set,
    the result carries ``instance_type``, ``region`` (the region's name) and
    ``cloud`` (the instance type's text before its first underscore). When
    ``resources.accelerators`` is set, it is added as compact JSON under
    ``custom_resources``.
    """
    del zones, num_nodes, dryrun, volume_mounts  # unused for Shadeform

    # Resolve the instance type from the accelerator-less request.
    accelerator_free = resources.copy(accelerators=None)
    feasible = self._get_feasible_launchable_resources(accelerator_free)
    instance_type = feasible.resources_list[0].instance_type

    resources_vars = {}
    if instance_type is not None:
        # The leading '_'-separated component of the instance type is
        # exposed to the template as 'cloud'.
        provider_prefix, _, _ = instance_type.partition('_')
        resources_vars['instance_type'] = instance_type
        resources_vars['region'] = region.name
        resources_vars['cloud'] = provider_prefix

    # Add accelerator resources for Ray
    requested_accelerators = resources.accelerators
    if requested_accelerators is not None:
        resources_vars['custom_resources'] = json.dumps(
            requested_accelerators, separators=(',', ':'))

    return resources_vars
257
+
258
def get_credential_file_mounts(self) -> Dict[str, str]:
    """Map each credential file under ~/.shadeform to the same path.

    One entry is produced per name in ``_CREDENTIAL_FILES``; key and value
    are identical.
    """
    mounts: Dict[str, str] = {}
    for filename in _CREDENTIAL_FILES:
        credential_path = f'~/.shadeform/{filename}'
        mounts[credential_path] = credential_path
    return mounts
263
+
264
@classmethod
def get_current_user_identity_str(cls) -> Optional[str]:
    """Return the current user identity string; always None for Shadeform."""
    identity_str: Optional[str] = None
    return identity_str
268
+
269
@classmethod
def check_credentials(
    cls, cloud_capability: 'clouds.CloudCapability'
) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
    """Check if Shadeform credentials are properly configured.

    The check is purely local: it verifies that ``~/.shadeform/api_key``
    exists, can be read as UTF-8 text, and is non-empty after stripping
    whitespace. The key itself is not validated.

    Args:
        cloud_capability: Ignored; the same check applies to every
            capability.

    Returns:
        ``(True, None)`` when a non-empty key was read, otherwise
        ``(False, reason)`` with a human-readable explanation.
    """
    del cloud_capability  # unused for Shadeform
    api_key_path = os.path.expanduser('~/.shadeform/api_key')
    try:
        if not os.path.exists(api_key_path):
            return False, ('Shadeform API key not found. '
                           f'Please save your API key to {api_key_path}')

        # Try to read the API key
        with open(api_key_path, 'r', encoding='utf-8') as f:
            api_key = f.read().strip()
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is not an OSError: a key file holding bytes
        # that are not valid UTF-8 must be reported as a failed check
        # rather than raised to the caller.
        return False, f'Error checking Shadeform credentials: {e}'

    if not api_key:
        return False, f'Shadeform API key is empty in {api_key_path}'

    return True, None
292
+
293
def _get_feasible_launchable_resources(
    self, resources: 'resources_lib.Resources'
) -> 'resources_utils.FeasibleResources':
    """Get feasible launchable resources.

    Resolution order:
      1. Spot requests are rejected with a hint.
      2. An explicit ``instance_type`` is returned unchanged.
      3. When accelerators are requested, the first accelerator entry is
         looked up in the Shadeform catalog and one resource is produced
         per matching instance type.
      4. Otherwise a default instance type is selected from the requested
         cpus/memory/disk tier/region/zone.

    Args:
        resources: The requested resources.

    Returns:
        A ``FeasibleResources`` holding the launchable candidates, the
        candidate instance type names, and an optional hint explaining
        why nothing is feasible.
    """
    if resources.use_spot:
        return resources_utils.FeasibleResources(
            [], [], 'Spot instances are not supported on Shadeform.')

    if resources.instance_type is not None:
        # Instance type is already specified, validate it
        assert resources.is_launchable(), resources
        fuzzy_candidate_list = [resources.instance_type]
        return resources_utils.FeasibleResources([resources],
                                                 fuzzy_candidate_list, None)

    def _make_resources(instance_type_list):
        # One resource per instance type, so each is only considered for
        # regions where that specific instance type is available. The
        # original accelerators are kept; cpus/memory are cleared.
        return [
            resources.copy(
                cloud=Shadeform(),
                instance_type=instance_type,
                accelerators=resources.accelerators,
                cpus=None,
                memory=None,
            ) for instance_type in instance_type_list
        ]

    accelerators = resources.accelerators
    # An empty mapping is treated like "no accelerators requested";
    # previously it fell through to an index into an empty key list and
    # raised IndexError.
    if accelerators:
        # Only the first accelerator entry is considered.
        accelerator_name, accelerator_count = next(
            iter(accelerators.items()))
        func = shadeform_catalog.get_instance_type_for_accelerator
        instance_types, errors = func(accelerator_name,
                                      accelerator_count,
                                      use_spot=resources.use_spot)

        if instance_types:
            candidate_names = list(instance_types)
            return resources_utils.FeasibleResources(
                _make_resources(candidate_names), candidate_names, None)

        error_msg = (f'No instances available for accelerator '
                     f'{accelerator_name}')
        if errors:
            error_msg += f': {"; ".join(errors)}'
        return resources_utils.FeasibleResources([], [], error_msg)

    # No accelerators specified, return a default instance type
    default_instance_type = Shadeform.get_default_instance_type(
        cpus=resources.cpus,
        memory=resources.memory,
        disk_tier=resources.disk_tier,
        region=resources.region,
        zone=resources.zone)
    if default_instance_type is None:
        # TODO: Add hints to all return values in this method to help
        # users understand why the resources are not launchable.
        return resources_utils.FeasibleResources([], [], None)
    return resources_utils.FeasibleResources(
        _make_resources([default_instance_type]), [], None)
383
+
384
@classmethod
def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
    """Check compute credentials via ``check_credentials``.

    Returns:
        The ``(success, message)`` pair from ``check_credentials`` for the
        COMPUTE capability, with a dict message converted to its string
        form so the result matches this method's signature.
    """
    ok, reason = cls.check_credentials(clouds.CloudCapability.COMPUTE)
    # Convert return type to match expected signature
    normalized = str(reason) if isinstance(reason, dict) else reason
    return ok, normalized
392
+
393
@classmethod
def query_status(cls, name: str, tag_filters: Dict[str, str],
                 region: Optional[str], zone: Optional[str],
                 **kwargs) -> List[status_lib.ClusterStatus]:
    """Query cluster status.

    All arguments are ignored and an empty list is always returned.
    """
    # For validation purposes, report no existing clusters.
    # Actual status querying is handled by the provisioner
    statuses: List[status_lib.ClusterStatus] = []
    return statuses