skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,329 @@
1
+ """A script that generates the Seeweb catalog.
2
+
3
+ Usage:
4
+ python fetch_seeweb.py [-h] [--api-key API_KEY]
5
+ [--api-key-path API_KEY_PATH]
6
+
7
+ If neither --api-key nor --api-key-path are provided, this script will parse
8
+ `~/.seeweb_cloud/seeweb_keys` to look for Seeweb API key.
9
+ """
10
+ import argparse
11
+ import configparser
12
+ import csv
13
+ import json
14
+ import os
15
+ from typing import Any, Dict, List, Optional
16
+
17
+ from sky.adaptors.seeweb import ecsapi
18
+
19
# GPU name mapping from Seeweb to SkyPilot canonical names
# normalize_gpu_name() looks labels up here with an exact (case- and
# whitespace-sensitive) match; a label missing from this table is passed
# through unchanged with a warning.
SEEWEB_GPU_NAME_TO_SKYPILOT_GPU_NAME = {
    'H200 141GB': 'H200',
    'RTX A6000 48GB': 'RTXA6000',
    'A100 80GB': 'A100',
    'L4 24GB': 'L4',
    'L40s 48GB': 'L40S',
    'H100 80GB': 'H100',
    'MI300X': 'MI300X',
    'A30': 'A30',
    'RTX 6000 24GB': 'RTX6000',
    'Tenstorrent Grayskull e75': 'GRAYSKULL-E75',
    'Tenstorrent Grayskull e150': 'GRAYSKULL-E150',
}
33
+
34
# GPU VRAM mapping in MB
# Keyed by the SkyPilot canonical GPU name (the values of
# SEEWEB_GPU_NAME_TO_SKYPILOT_GPU_NAME). parse_plan_info() reads the size
# from here and get_gpu_info() emits it under 'SizeInMiB'.
# NOTE(review): most entries equal GB * 1024, but RTXA6000 (48384) and
# MI300X (192000) do not -- confirm those two values are intended.
VRAM = {
    'RTXA6000': 48384,  # 48GB
    'H200': 144384,  # 141GB
    'A100': 81920,  # 80GB
    'L4': 24576,  # 24GB
    'L40S': 49152,  # 48GB
    'H100': 81920,  # 80GB
    'MI300X': 192000,  # 192GB
    'A30': 24576,  # 24GB
    'RTX6000': 24576,  # 24GB
    'GRAYSKULL-E75': 8192,  # 8GB
    'GRAYSKULL-E150': 8192,  # 8GB
}
48
+
49
+
50
def is_tenstorrent_gpu_name(gpu_name: Optional[str]) -> bool:
    """Tell whether a GPU name denotes Tenstorrent hardware.

    The name is upper-cased and searched for either the vendor name
    (``TENSTORRENT``) or the product family (``GRAYSKULL``). ``None`` and
    empty names never match.
    """
    if gpu_name:
        normalized = str(gpu_name).upper()
        return any(marker in normalized
                   for marker in ('TENSTORRENT', 'GRAYSKULL'))
    return False
60
+
61
+
62
def is_mi300x_gpu_name(gpu_name: Optional[str]) -> bool:
    """Tell whether a GPU name denotes an AMD MI300X.

    The match is a case-insensitive substring test; ``None`` and empty
    names never match.
    """
    return bool(gpu_name) and 'MI300X' in str(gpu_name).upper()
67
+
68
+
69
def get_api_key(path: Optional[str] = None) -> str:
    """Return the Seeweb API key from a key file or the environment.

    The key file is tried first; the ``SEEWEB_API_KEY`` environment variable
    is the fallback. A file that is missing, malformed, or holds a blank key
    counts as "no key in file" and falls through to the environment.

    Args:
        path: Key file in INI format with ``api_key`` under ``[DEFAULT]``.
            ``~`` is expanded. Defaults to ``~/.seeweb_cloud/seeweb_keys``.

    Returns:
        The API key with surrounding whitespace removed.

    Raises:
        ValueError: If neither the file nor the environment yields a
            non-empty key.
    """
    key_path = os.path.expanduser(
        '~/.seeweb_cloud/seeweb_keys' if path is None else path)

    # Step 1: Try to get from config file
    cause = None
    try:
        # Interpolation is disabled so a key containing '%' is read verbatim.
        parser = configparser.ConfigParser(interpolation=None)
        # read() silently skips files that do not exist; the lookup below
        # then raises KeyError.
        parser.read(key_path)
        file_key = parser['DEFAULT']['api_key'].strip()
        if file_key:
            return file_key
    except (KeyError, FileNotFoundError, configparser.Error) as exc:
        # configparser.Error covers a file that is not valid INI, e.g. one
        # with no section header.
        cause = exc

    # Step 2: Try environment variable
    env_key = os.environ.get('SEEWEB_API_KEY', '').strip()
    if env_key:
        return env_key

    # If neither found, raise error
    raise ValueError(
        f'API key not found in {key_path} or ENV variable SEEWEB_API_KEY'
    ) from cause
91
+
92
+
93
def normalize_gpu_name(gpu_name: str) -> str:
    """Translate a Seeweb GPU label into SkyPilot's canonical GPU name.

    Args:
        gpu_name: GPU label as reported by the Seeweb API.

    Returns:
        The canonical name from ``SEEWEB_GPU_NAME_TO_SKYPILOT_GPU_NAME``; the
        label itself when it has no mapping (a warning is printed); or ``''``
        for an empty or ``None`` label.
    """
    if not gpu_name:
        return ''

    # Map to canonical name if available
    canonical_name = SEEWEB_GPU_NAME_TO_SKYPILOT_GPU_NAME.get(gpu_name)
    if canonical_name:
        return canonical_name

    # Unknown labels are passed through unchanged.
    print(f'Warning: GPU name "{gpu_name}" not found in mapping, '
          f'using original name')
    return gpu_name
107
+
108
+
109
def parse_plan_info(plan: Any) -> Dict[str, Any]:
    """Flatten a Seeweb plan object into the fields the catalog needs.

    Every numeric attribute is converted defensively: a missing or
    unparsable value becomes ``0`` (``0.0`` for the price) instead of
    raising.

    Args:
        plan: Plan object returned by the Seeweb API. It must expose a
            ``name`` attribute; ``cpu``, ``ram``, ``hourly_price``, ``gpu``
            and ``gpu_label`` are read when present.

    Returns:
        A dict with keys ``plan_name``, ``vcpus``, ``memory_gb``,
        ``gpu_name``, ``gpu_count``, ``gpu_vram_mb`` and ``price``.
        ``gpu_name`` is ``None`` when the plan has no GPU label and its name
        does not contain ``GPU``.

    Raises:
        ValueError: If ``plan`` has no ``name`` attribute.
    """
    if not hasattr(plan, 'name'):
        raise ValueError(f'Unsupported plan format: {type(plan)}')

    plan_name = plan.name

    # Guarded like the other numeric fields so one bad value does not
    # discard the whole plan.
    try:
        vcpus = int(getattr(plan, 'cpu', 0))
    except (ValueError, TypeError):
        vcpus = 0

    # 'ram' is divided by 1024 to produce the GB figure.
    memory_mb = getattr(plan, 'ram', 0)
    try:
        memory_gb = int(memory_mb) / 1024 if memory_mb else 0
    except (ValueError, TypeError):
        memory_gb = 0

    try:
        price = float(getattr(plan, 'hourly_price', 0.0))
    except (ValueError, TypeError):
        price = 0.0

    try:
        gpu_count = int(getattr(plan, 'gpu', 0))
    except (ValueError, TypeError):
        gpu_count = 0

    # Prefer the explicit GPU label; otherwise infer a name from the plan
    # name (e.g. ECS1GPU11 -> GPU11).
    gpu_label = getattr(plan, 'gpu_label', None)
    if gpu_label:
        gpu_name = normalize_gpu_name(gpu_label)
    elif 'GPU' in plan_name:
        # split() yields at least two parts whenever 'GPU' is present.
        gpu_name = 'GPU' + plan_name.split('GPU')[1]
    else:
        gpu_name = None

    # Get GPU VRAM from mapping using the normalized name
    gpu_vram_mb = VRAM.get(gpu_name, 0) if gpu_name else 0

    return {
        'plan_name': plan_name,
        'vcpus': vcpus,
        'memory_gb': memory_gb,
        'gpu_name': gpu_name,
        'gpu_count': gpu_count,
        'gpu_vram_mb': gpu_vram_mb,
        'price': price,
    }
170
+
171
+
172
def get_gpu_info(gpu_count: int, gpu_name: str, gpu_vram_mb: int = 0) -> str:
    """Render the ``GpuInfo`` catalog column for one plan.

    Args:
        gpu_count: Number of GPUs in the plan.
        gpu_name: GPU name to report; also used to pick the manufacturer.
        gpu_vram_mb: Per-GPU memory, emitted as ``SizeInMiB``.

    Returns:
        A JSON-like string with every double quote replaced by a single
        quote, or ``''`` when there is no GPU name or the count is zero.
    """
    if not gpu_name or gpu_count == 0:
        return ''

    # Manufacturer is inferred from the GPU name; NVIDIA is the default.
    name_upper = str(gpu_name).upper()
    if 'MI300' in name_upper:
        vendor = 'AMD'
    elif 'GRAYSKULL' in name_upper:
        vendor = 'TENSTORRENT'
    else:
        vendor = 'NVIDIA'

    per_gpu = {
        'Name': gpu_name,
        'Manufacturer': vendor,
        'Count': float(gpu_count),
        'MemoryInfo': {
            'SizeInMiB': gpu_vram_mb
        },
    }
    total_memory = gpu_vram_mb * gpu_count if gpu_vram_mb else 0
    payload = {'Gpus': [per_gpu], 'TotalGpuMemoryInMiB': total_memory}

    return json.dumps(payload).replace('"', '\'')
199
+
200
+
201
def fetch_seeweb_data(api_key: str) -> List[Dict]:
    """Download every usable Seeweb plan together with its regions.

    Plans whose GPU is Tenstorrent or MI300X are skipped before the
    per-plan region lookup. A plan whose parsing or region lookup fails is
    reported on stdout and left out of the result.

    Args:
        api_key: Token passed to ``ecsapi.Api``.

    Returns:
        One dict per retained plan: the output of ``parse_plan_info`` plus a
        ``regions_available`` entry.

    Raises:
        ImportError: If ``ecsapi`` is ``None``.
        Exception: Wraps any other failure, including an empty plan list.
    """
    if ecsapi is None:
        raise ImportError('ecsapi not available')

    try:
        api = ecsapi.Api(token=api_key)

        print('Fetching plans from Seeweb API...')
        raw_plans = api.fetch_plans()
        if not raw_plans:
            raise ValueError('No plans returned from API')
        print(f'Successfully fetched {len(raw_plans)} plans from API')

        kept = []
        for plan in raw_plans:
            try:
                # Parse first so filtered plans cost no extra API call.
                info = parse_plan_info(plan)
                accelerator = info.get('gpu_name')

                if is_tenstorrent_gpu_name(accelerator):
                    print(f'Skipping Tenstorrent plan {plan.name}')
                elif is_mi300x_gpu_name(accelerator):
                    print(f'Skipping MI300X plan {plan.name}')
                else:
                    print(f'Fetching regions available for {plan.name}')
                    info['regions_available'] = (
                        api.fetch_regions_available(plan.name))
                    kept.append(info)
            except Exception as plan_err:  # pylint: disable=broad-except
                print(f'Error parsing plan {plan.name}: {plan_err}')

        print(f'Successfully parsed {len(kept)} plans')
        return kept

    except Exception as err:  # pylint: disable=broad-except
        raise Exception(
            f'Error fetching data from Seeweb API: {err}') from err
246
+
247
+
248
def create_catalog(api_key: str, output_path: str) -> None:
    """Fetch Seeweb plans and write them to a catalog CSV.

    One row is written per (plan, region) pair. A plan whose region list is
    empty or not a list still gets a single row with an empty ``Region``
    column. A plan that fails while being written is reported on stdout and
    skipped.

    Args:
        api_key: Seeweb API token, forwarded to ``fetch_seeweb_data``.
        output_path: Destination CSV file; overwritten if it exists.
    """
    plans = fetch_seeweb_data(api_key)

    # Create CSV catalog
    print(f'Writing catalog to {output_path}')
    # newline='' is what the csv module asks for: the writer emits its own
    # line terminators, and text-mode newline translation would otherwise
    # double the carriage return on Windows.
    with open(output_path, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter=',', quotechar='"')
        writer.writerow([
            'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs',
            'MemoryGiB', 'Price', 'Region', 'GpuInfo', 'SpotPrice'
        ])

        for plan in plans:
            try:
                gpu_info_str = ''
                if plan['gpu_name'] and plan['gpu_count'] > 0:
                    gpu_info_str = get_gpu_info(plan['gpu_count'],
                                                plan['gpu_name'],
                                                plan.get('gpu_vram_mb', 0))

                # One row per region; a single empty region stands in when
                # no region list is available.
                regions = plan['regions_available']
                if not (isinstance(regions, list) and regions):
                    regions = ['']

                for region in regions:
                    writer.writerow([
                        plan['plan_name'],  # InstanceType
                        plan['gpu_name'],  # AcceleratorName (cleaned)
                        plan['gpu_count']
                        if plan['gpu_count'] > 0 else '',  # AcceleratorCount
                        plan['vcpus'],  # vCPUs
                        plan['memory_gb'],  # MemoryGiB
                        plan['price'],  # Price
                        region,  # Region (single region per row)
                        gpu_info_str,  # GpuInfo
                        ''  # SpotPrice (Seeweb doesn't support spot)
                    ])
            except Exception as e:  # pylint: disable=broad-except
                print(f'Error processing plan {plan["plan_name"]}: {e}')
                continue

    print(f'Seeweb catalog saved to {output_path}')
    print(f'Created {len(plans)} instance types')
307
+
308
+
309
def main() -> None:
    """Parse CLI arguments and write the Seeweb catalog to seeweb/vms.csv.

    The API key comes from ``--api-key`` when given; otherwise
    ``get_api_key`` resolves it, using ``--api-key-path`` if supplied.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--api-key', help='Seeweb API key')
    arg_parser.add_argument('--api-key-path',
                            help='Path to file containing Seeweb API key')
    cli_args = arg_parser.parse_args()

    # An explicit key on the command line wins over the key file.
    api_key = cli_args.api_key or get_api_key(cli_args.api_key_path)

    os.makedirs('seeweb', exist_ok=True)
    create_catalog(api_key, 'seeweb/vms.csv')
    print('Seeweb Service Catalog saved to seeweb/vms.csv')


if __name__ == '__main__':
    main()
@@ -0,0 +1,142 @@
1
+ """A script that generates the Shadeform catalog.
2
+
3
+ Usage:
4
+ python fetch_shadeform.py [-h] [--api-key API_KEY]
5
+ [--api-key-path API_KEY_PATH]
6
+
7
+ If neither --api-key nor --api-key-path is provided, this script will parse
8
+ `~/.shadeform/api_key` to look for Shadeform API key.
9
+ """
10
+ import argparse
11
+ import csv
12
+ import json
13
+ import os
14
+ from typing import Dict
15
+
16
+ import requests
17
+
18
+ ENDPOINT = 'https://api.shadeform.ai/v1/instances/types'
19
+ DEFAULT_SHADEFORM_API_KEY_PATH = os.path.expanduser('~/.shadeform/api_key')
20
+
21
+
22
def parse_gpu_info(gpu_type: str, num_gpus: int, ram_per_gpu: int) -> Dict:
    """Build the GpuInfo dictionary for one catalog entry.

    Args:
        gpu_type: GPU model name; also used to pick the manufacturer.
        num_gpus: Number of GPUs on the instance.
        ram_per_gpu: Per-GPU memory. It is stored unchanged under
            ``SizeInMiB`` and multiplied by ``num_gpus`` for
            ``TotalGpuMemoryInMiB``.
            NOTE(review): the caller in this file passes the API's
            ``vram_per_gpu_in_gb`` value here -- confirm the intended unit.

    Returns:
        A dict with a single-element ``'Gpus'`` list describing the GPU.
    """
    # Every model not listed here is reported as an NVIDIA part.
    non_nvidia_vendors = {'MI300X': 'AMD', 'GAUDI2': 'Intel'}
    vendor = non_nvidia_vendors.get(gpu_type, 'NVIDIA')

    gpu_entry = {
        'Name': gpu_type,
        'Manufacturer': vendor,
        'Count': float(num_gpus),
        'MemoryInfo': {
            'SizeInMiB': ram_per_gpu
        },
        'TotalGpuMemoryInMiB': ram_per_gpu * num_gpus,
    }
    return {'Gpus': [gpu_entry]}
42
+
43
+
44
def create_catalog(api_key: str, output_path: str) -> None:
    """Create the Shadeform catalog CSV by querying the Shadeform API.

    Sends a GET request to ``ENDPOINT`` with ``available=true`` and writes
    one row per (instance type, available region) pair to ``output_path``.
    Instance types without GPUs produce no rows.

    Args:
        api_key: Shadeform API key, sent in the ``X-API-KEY`` header.
        output_path: Path of the CSV file to create or overwrite.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    response = requests.get(ENDPOINT,
                            headers={'X-API-KEY': api_key},
                            params={'available': 'true'},
                            timeout=30)
    response.raise_for_status()

    instance_types = response.json().get('instance_types', [])

    # newline='' hands line-ending control to the csv module, as its
    # documentation requires; without it, platforms that translate '\n'
    # (e.g. Windows) emit '\r\r\n' row terminators.
    with open(output_path, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter=',', quotechar='"')
        writer.writerow([
            'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs',
            'MemoryGiB', 'Price', 'Region', 'GpuInfo', 'SpotPrice'
        ])

        for instance in instance_types:
            config = instance['configuration']

            cloud = instance['cloud']
            shade_instance_type = instance['shade_instance_type']
            instance_type = f'{cloud}_{shade_instance_type.replace("_", "-")}'
            gpu_type = config['gpu_type'].replace('_', '-')
            gpu_count = float(config['num_gpus'])
            vcpus = float(config['vcpus'])
            memory_gb = int(config['memory_in_gb'])

            # Append "B" to instance_type and gpu_type if they end with "G"
            if instance_type.endswith('G'):
                instance_type += 'B'
            if gpu_type.endswith('G'):
                gpu_type += 'B'

            # Replace "Gx" with "GBx" (case sensitive)
            instance_type = instance_type.replace('Gx', 'GBx')

            # Price is in cents per hour, convert to dollars
            price = float(instance['hourly_price']) / 100

            # Only GPU instances are written to the catalog; decide once per
            # instance instead of re-checking for every region.
            if gpu_count <= 0:
                continue

            # NOTE(review): 'vram_per_gpu_in_gb' is forwarded unchanged and
            # ends up under the 'SizeInMiB' key -- confirm the intended unit.
            gpuinfo_dict = parse_gpu_info(gpu_type, int(gpu_count),
                                          int(config['vram_per_gpu_in_gb']))
            # Swap double quotes for single quotes in the serialized JSON
            # before it is stored in the GpuInfo column.
            gpuinfo = json.dumps(gpuinfo_dict).replace('"', '\'')

            # Write entry for each available region
            for availability in instance.get('availability', []):
                if not availability['available']:
                    continue
                writer.writerow([
                    instance_type,
                    gpu_type,
                    gpu_count,
                    vcpus,
                    memory_gb,
                    price,
                    availability['region'],
                    gpuinfo,
                    ''  # No spot pricing info available
                ])
112
+
113
+
114
def get_api_key(cmdline_args: argparse.Namespace) -> str:
    """Get the Shadeform API key from the command line or a key file.

    Resolution order: ``--api-key``, then the file given by
    ``--api-key-path``, then ``DEFAULT_SHADEFORM_API_KEY_PATH``.

    Args:
        cmdline_args: Parsed arguments exposing ``api_key`` and
            ``api_key_path`` attributes (either may be ``None``).

    Returns:
        The API key; surrounding whitespace is stripped when the key is
        read from a file.

    Raises:
        FileNotFoundError: If the key file to read does not exist.
        ValueError: If the resolved API key is empty.
    """
    api_key = cmdline_args.api_key
    source = '--api-key'
    if api_key is None:
        key_path = cmdline_args.api_key_path
        if key_path is None:
            # Read from ~/.shadeform/api_key
            key_path = DEFAULT_SHADEFORM_API_KEY_PATH
        with open(key_path, mode='r', encoding='utf-8') as f:
            api_key = f.read().strip()
        source = key_path

    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and could never fire here anyway (a file read always yields a str).
    if not api_key:
        raise ValueError(
            f'API key from {source} is empty. Please provide via --api-key '
            f'or --api-key-path, or place it in the default key file.')
    return api_key
132
+
133
+
134
# Script entry point: parse the API-key options, make sure the output
# directory exists, then generate the catalog CSV.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--api-key', help='Shadeform API key.')
    parser.add_argument('--api-key-path',
                        help='path of file containing Shadeform API key.')
    args = parser.parse_args()
    # The catalog is written relative to the current working directory.
    os.makedirs('shadeform', exist_ok=True)
    create_catalog(get_api_key(args), 'shadeform/vms.csv')
    print('Shadeform catalog saved to shadeform/vms.csv')
@@ -3,6 +3,7 @@
3
3
  Kubernetes does not require a catalog of instances, but we need an image catalog
4
4
  mapping SkyPilot image tags to corresponding container image tags.
5
5
  """
6
+ import collections
6
7
  import re
7
8
  import typing
8
9
  from typing import Dict, List, Optional, Set, Tuple
@@ -167,12 +168,25 @@ def _list_accelerators(
167
168
  accelerators_qtys: Set[Tuple[str, int]] = set()
168
169
  keys = lf.get_label_keys()
169
170
  nodes = kubernetes_utils.get_kubernetes_nodes(context=context)
170
- pods = None
171
- if realtime:
172
- # Get the pods to get the real-time GPU usage
171
+
172
+ # Check if any nodes have accelerators before fetching pods
173
+ has_accelerator_nodes = False
174
+ for node in nodes:
175
+ for key in keys:
176
+ if key in node.metadata.labels:
177
+ has_accelerator_nodes = True
178
+ break
179
+ if has_accelerator_nodes:
180
+ break
181
+
182
+ # Only fetch pods if we have accelerator nodes and realtime is requested
183
+ allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
184
+ error_on_get_allocated_gpu_qty_by_node = False
185
+ if realtime and has_accelerator_nodes:
186
+ # Get the allocated GPU quantity by each node
173
187
  try:
174
- pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(
175
- context=context)
188
+ allocated_qty_by_node = (
189
+ kubernetes_utils.get_allocated_gpu_qty_by_node(context=context))
176
190
  except kubernetes.api_exception() as e:
177
191
  if e.status == 403:
178
192
  logger.warning(
@@ -180,6 +194,7 @@ def _list_accelerators(
180
194
  '(forbidden). Please check if your account has '
181
195
  'necessary permissions to list pods. Realtime GPU '
182
196
  'availability information may be incorrect.')
197
+ error_on_get_allocated_gpu_qty_by_node = True
183
198
  else:
184
199
  raise
185
200
  # Total number of GPUs in the cluster
@@ -189,9 +204,11 @@ def _list_accelerators(
189
204
  min_quantity_filter = quantity_filter if quantity_filter else 1
190
205
 
191
206
  for node in nodes:
207
+ # Check if node is ready
208
+ node_is_ready = node.is_ready()
209
+
192
210
  for key in keys:
193
211
  if key in node.metadata.labels:
194
- allocated_qty = 0
195
212
  accelerator_name = lf.get_accelerator_from_label_value(
196
213
  node.metadata.labels.get(key))
197
214
 
@@ -246,37 +263,24 @@ def _list_accelerators(
246
263
  total_accelerators_capacity[
247
264
  accelerator_name] += quantized_count
248
265
 
249
- if pods is None:
250
- # If we can't get the pods, we can't get the GPU usage
251
- total_accelerators_available[accelerator_name] = -1
252
- continue
253
-
254
- for pod in pods:
255
- # Get all the pods running on the node
256
- if (pod.spec.node_name == node.metadata.name and
257
- pod.status.phase in ['Running', 'Pending']):
258
- # Skip pods that should not count against GPU count
259
- if (kubernetes_utils.
260
- should_exclude_pod_from_gpu_allocation(pod)):
261
- logger.debug(
262
- f'Excluding pod '
263
- f'{pod.metadata.name} from GPU count '
264
- f'calculations on node {node.metadata.name}')
265
- continue
266
- # Iterate over all the containers in the pod and sum
267
- # the GPU requests
268
- for container in pod.spec.containers:
269
- if container.resources.requests:
270
- allocated_qty += (
271
- kubernetes_utils.get_node_accelerator_count(
272
- context, container.resources.requests))
273
-
274
- accelerators_available = accelerator_count - allocated_qty
275
266
  # Initialize the total_accelerators_available to make sure the
276
267
  # key exists in the dictionary.
277
268
  total_accelerators_available[accelerator_name] = (
278
269
  total_accelerators_available.get(accelerator_name, 0))
279
270
 
271
+ # Skip availability counting for not-ready nodes
272
+ if not node_is_ready:
273
+ continue
274
+
275
+ if error_on_get_allocated_gpu_qty_by_node:
276
+ # If we can't get the allocated GPU quantity by each node,
277
+ # we can't get the GPU usage.
278
+ total_accelerators_available[accelerator_name] = -1
279
+ continue
280
+
281
+ allocated_qty = allocated_qty_by_node[node.metadata.name]
282
+ accelerators_available = accelerator_count - allocated_qty
283
+
280
284
  if accelerators_available >= min_quantity_filter:
281
285
  quantized_availability = min_quantity_filter * (
282
286
  accelerators_available // min_quantity_filter)
@@ -0,0 +1,95 @@
1
+ """PrimeIntellect service catalog.
2
+
3
+ This module loads the service catalog file and can be used to
4
+ query instance types and pricing information for PrimeIntellect.
5
+ """
6
+
7
+ import typing
8
+ from typing import Dict, List, Optional, Tuple, Union
9
+
10
+ from sky.catalog import common
11
+
12
+ if typing.TYPE_CHECKING:
13
+ from sky.clouds import cloud
14
+
15
+ _df = common.read_catalog('primeintellect/vms.csv')
16
+
17
+
18
def instance_type_exists(instance_type: str) -> bool:
    """Check `instance_type` against the PrimeIntellect catalog.

    Delegates to `common.instance_type_exists_impl` with the module-level
    catalog `_df` (read from 'primeintellect/vms.csv').
    """
    return common.instance_type_exists_impl(_df, instance_type)
20
+
21
+
22
def validate_region_zone(
        region: Optional[str],
        zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Validate `region` and `zone` against the PrimeIntellect catalog.

    Delegates to `common.validate_region_zone_impl`, identifying the cloud
    as 'primeintellect', and returns its (region, zone) result.
    """
    return common.validate_region_zone_impl('primeintellect', _df, region, zone)
26
+
27
+
28
def get_hourly_cost(instance_type: str,
                    use_spot: bool = False,
                    region: Optional[str] = None,
                    zone: Optional[str] = None) -> float:
    """Returns the cost, or the cheapest cost among all zones for spot.

    Delegates to `common.get_hourly_cost_impl` with the module-level
    catalog `_df`; all arguments are forwarded unchanged.
    """
    return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
                                       zone)
35
+
36
+
37
def get_vcpus_mem_from_instance_type(
        instance_type: str) -> Tuple[Optional[float], Optional[float]]:
    """Look up the (vCPUs, memory) pair for `instance_type`.

    Delegates to `common.get_vcpus_mem_from_instance_type_impl` with the
    module-level catalog `_df`.
    """
    return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
40
+
41
+
42
def get_default_instance_type(cpus: Optional[str] = None,
                              memory: Optional[str] = None,
                              disk_tier: Optional[str] = None,
                              region: Optional[str] = None,
                              zone: Optional[str] = None) -> Optional[str]:
    """Pick an instance type for the requested CPU/memory constraints.

    `disk_tier` is accepted but ignored. The remaining arguments are
    forwarded to `common.get_instance_type_for_cpus_mem_impl` together with
    the module-level catalog `_df`.
    """
    del disk_tier  # no disk tiers
    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
                                                      zone)
50
+
51
+
52
def get_accelerators_from_instance_type(
        instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
    """Look up the accelerators attached to `instance_type`.

    Delegates to `common.get_accelerators_from_instance_type_impl` with the
    module-level catalog `_df`.
    """
    return common.get_accelerators_from_instance_type_impl(_df, instance_type)
55
+
56
+
57
def get_instance_type_for_accelerator(
        acc_name: str,
        acc_count: int,
        cpus: Optional[str] = None,
        memory: Optional[str] = None,
        use_spot: bool = False,
        region: Optional[str] = None,
        zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
    """Returns a list of instance types that have the given accelerator.

    All arguments are forwarded by keyword to
    `common.get_instance_type_for_accelerator_impl`, together with the
    module-level catalog `_df`; its two-element result is returned as is.
    """
    return common.get_instance_type_for_accelerator_impl(df=_df,
                                                         acc_name=acc_name,
                                                         acc_count=acc_count,
                                                         cpus=cpus,
                                                         memory=memory,
                                                         use_spot=use_spot,
                                                         region=region,
                                                         zone=zone)
74
+
75
+
76
def get_region_zones_for_instance_type(instance_type: str,
                                       use_spot: bool) -> List['cloud.Region']:
    """List the regions (with zones) for `instance_type`.

    Restricts the catalog to rows whose 'InstanceType' equals
    `instance_type` and passes them to `common.get_region_zones`.
    """
    matches_instance = _df['InstanceType'] == instance_type
    return common.get_region_zones(_df[matches_instance], use_spot)
80
+
81
+
82
def list_accelerators(
        gpus_only: bool,
        name_filter: Optional[str],
        region_filter: Optional[str],
        quantity_filter: Optional[int],
        case_sensitive: bool = True,
        all_regions: bool = False,
        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
    """Returns all instance types in Prime Intellect offering GPUs.

    `require_price` is accepted but ignored. The remaining arguments are
    forwarded to `common.list_accelerators_impl` with the cloud name
    'PrimeIntellect' and the module-level catalog `_df`.
    """
    # Part of the signature, but unused by this implementation.
    del require_price
    return common.list_accelerators_impl('PrimeIntellect', _df, gpus_only,
                                         name_filter, region_filter,
                                         quantity_filter, case_sensitive,
                                         all_regions)
@@ -12,7 +12,11 @@ from sky.catalog import common
12
12
  if typing.TYPE_CHECKING:
13
13
  from sky.clouds import cloud
14
14
 
15
- _df = common.read_catalog('runpod/vms.csv')
15
+ # Runpod has no set update schedule for their catalog. We pull the catalog
16
+ # every 7 hours to make sure we have the latest information.
17
+ _PULL_FREQUENCY_HOURS = 7
18
+ _df = common.read_catalog('runpod/vms.csv',
19
+ pull_frequency_hours=_PULL_FREQUENCY_HOURS)
16
20
 
17
21
 
18
22
  def instance_type_exists(instance_type: str) -> bool: