skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
"""Utilities for formatting tables for CLI output."""
|
|
2
|
+
import abc
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Any, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
import prettytable
|
|
7
|
+
|
|
8
|
+
from sky import sky_logging
|
|
9
|
+
from sky.jobs import utils as managed_jobs
|
|
10
|
+
from sky.schemas.api import responses
|
|
11
|
+
from sky.skylet import constants
|
|
12
|
+
from sky.utils import common_utils
|
|
13
|
+
from sky.utils import log_utils
|
|
14
|
+
from sky.utils import volume
|
|
15
|
+
|
|
16
|
+
logger = sky_logging.init_logger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def format_job_queue(jobs: List[responses.ClusterJobRecord]):
    """Build a table describing the jobs queued on a cluster.

    One row is added per record, in the order given. Timestamps are rendered
    through ``log_utils.readable_time_duration``; the DURATION column is
    computed from ``start_at`` to ``end_at`` with ``absolute=True``.

    Usage:
        jobs = get_job_queue()
        print(format_job_queue(jobs))

    Args:
        jobs: The cluster job records to display.

    Returns:
        The table object produced by ``log_utils.create_table`` (not a
        string); print it or pass it to ``str()`` to render it.
    """
    headers = [
        'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
        'STATUS', 'LOG', 'GIT COMMIT'
    ]
    table = log_utils.create_table(headers)
    for record in jobs:
        submitted = log_utils.readable_time_duration(record.submitted_at)
        started = log_utils.readable_time_duration(record.start_at)
        duration = log_utils.readable_time_duration(record.start_at,
                                                    record.end_at,
                                                    absolute=True)
        # Records without a recorded commit show a dash instead.
        git_commit = record.metadata.get('git_commit', '-')
        table.add_row([
            record.job_id,
            record.job_name,
            record.username,
            submitted,
            started,
            duration,
            record.resources,
            record.status.colored_str(),
            record.log_path,
            git_commit,
        ])
    return table
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def format_storage_table(storages: List[responses.StorageRecord],
                         show_all: bool = False) -> str:
    """Render storage records as a table string for CLI display.

    Args:
        storages: The storage records to display.
        show_all: If True, show the full ``last_use`` command; otherwise it
            is shortened to ``constants.LAST_USE_TRUNC_LENGTH`` via
            ``common_utils.truncate_long_string``.

    Returns:
        str: The formatted table, or 'No existing storage.' when
        ``storages`` is empty.
    """
    table = log_utils.create_table(
        ['NAME', 'UPDATED', 'STORE', 'COMMAND', 'STATUS'])

    for record in storages:
        updated = record.launched_at
        last_command = record.last_use
        if not show_all:
            last_command = common_utils.truncate_long_string(
                last_command, constants.LAST_USE_TRUNC_LENGTH)
        table.add_row([
            # NAME
            record.name,
            # UPDATED (taken from the record's launched_at)
            log_utils.readable_time_duration(updated),
            # STORE (all store values, comma separated)
            ', '.join(store.value for store in record.store),
            # COMMAND
            last_command,
            # STATUS
            record.status.value,
        ])

    return str(table) if storages else 'No existing storage.'
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def format_job_table(
    jobs: List[responses.ManagedJobRecord],
    show_all: bool,
    show_user: bool,
    pool_status: Optional[List[Dict[str, Any]]] = None,
    max_jobs: Optional[int] = None,
    status_counts: Optional[Dict[str, int]] = None,
):
    """Format managed job records by delegating to ``sky.jobs.utils``.

    Each record is converted to a plain dict with ``model_dump()`` and the
    resulting list is handed to ``managed_jobs.format_job_table``.

    Args:
        jobs: The managed job records to display.
        show_all: Forwarded unchanged as ``show_all``.
        show_user: Forwarded unchanged as ``show_user``.
        pool_status: Forwarded unchanged as ``pool_status``.
        max_jobs: Forwarded unchanged as ``max_jobs``.
        status_counts: Forwarded under the keyword ``job_status_counts``.

    Returns:
        Whatever ``managed_jobs.format_job_table`` returns.
    """
    job_dicts = [record.model_dump() for record in jobs]
    return managed_jobs.format_job_table(
        job_dicts,
        pool_status=pool_status,
        show_all=show_all,
        show_user=show_user,
        max_jobs=max_jobs,
        job_status_counts=status_counts,
    )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
_BASIC_COLUMNS = [
|
|
111
|
+
'NAME',
|
|
112
|
+
'TYPE',
|
|
113
|
+
'INFRA',
|
|
114
|
+
'SIZE',
|
|
115
|
+
'USER',
|
|
116
|
+
'WORKSPACE',
|
|
117
|
+
'AGE',
|
|
118
|
+
'STATUS',
|
|
119
|
+
'LAST_USE',
|
|
120
|
+
'USED_BY',
|
|
121
|
+
]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _get_infra_str(cloud: Optional[str], region: Optional[str],
|
|
125
|
+
zone: Optional[str]) -> str:
|
|
126
|
+
"""Get the infrastructure string for the volume."""
|
|
127
|
+
infra = ''
|
|
128
|
+
if cloud:
|
|
129
|
+
infra += cloud
|
|
130
|
+
if region:
|
|
131
|
+
infra += f'/{region}'
|
|
132
|
+
if zone:
|
|
133
|
+
infra += f'/{zone}'
|
|
134
|
+
return infra
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class VolumeTable(abc.ABC):
    """Base class for rendering volume records as a CLI table.

    The constructor builds the table via ``_create_table`` and fills it via
    ``_add_rows``; subclasses provide both hooks plus ``format``.
    """

    def __init__(self,
                 volumes: List[responses.VolumeRecord],
                 show_all: bool = False):
        super().__init__()
        # The table must exist before rows are added to it.
        self.table = self._create_table(show_all)
        self._add_rows(volumes, show_all)

    def _get_row_base_columns(self,
                              row: responses.VolumeRecord,
                              show_all: bool = False) -> List[str]:
        """Get the cells for the columns listed in ``_BASIC_COLUMNS``.

        Args:
            row: The volume record to read values from.
            show_all: If False, the USED_BY cell is shortened to
                ``constants.USED_BY_TRUNC_LENGTH``.

        Returns:
            The cells in ``_BASIC_COLUMNS`` order.
        """
        # LAST_USE: the attach timestamp rendered in local time, or a dash
        # when the volume has no recorded attachment.
        attached_ts = row.get('last_attached_at')
        if attached_ts is None:
            last_use_cell = '-'
        else:
            last_use_cell = datetime.fromtimestamp(attached_ts).strftime(
                '%Y-%m-%d %H:%M:%S')

        # SIZE: a truthy size gets a 'Gi' suffix; anything else is shown
        # as-is.
        raw_size = row.get('size', '')
        size_cell = f'{raw_size}Gi' if raw_size else raw_size

        # USED_BY: clusters take precedence over pods; a dash when neither
        # is set.
        clusters = row.get('usedby_clusters')
        pods = row.get('usedby_pods')
        consumers = clusters or pods
        used_by_cell = ', '.join(consumers) if consumers else '-'
        if not show_all:
            used_by_cell = common_utils.truncate_long_string(
                used_by_cell, constants.USED_BY_TRUNC_LENGTH)

        infra_cell = _get_infra_str(row.get('cloud'), row.get('region'),
                                    row.get('zone'))
        return [
            row.get('name', ''),
            row.get('type', ''),
            infra_cell,
            size_cell,
            row.get('user_name', '-'),
            row.get('workspace', '-'),
            log_utils.human_duration(row.get('launched_at', 0)),
            row.get('status', ''),
            last_use_cell,
            used_by_cell,
        ]

    def _create_table(self, show_all: bool = False) -> prettytable.PrettyTable:
        """Create the (empty) volume table; subclasses must override."""
        raise NotImplementedError

    def _add_rows(self,
                  volumes: List[responses.VolumeRecord],
                  show_all: bool = False) -> None:
        """Add one row per volume to the table; subclasses must override."""
        raise NotImplementedError

    @abc.abstractmethod
    def format(self) -> str:
        """Return the volume table formatted for display."""
        raise NotImplementedError
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class PVCVolumeTable(VolumeTable):
    """Table of Kubernetes PVC volumes."""

    def _create_table(self, show_all: bool = False) -> prettytable.PrettyTable:
        """Build the empty PVC table.

        Default header:
            NAME, TYPE, INFRA, SIZE, USER, WORKSPACE,
            AGE, STATUS, LAST_USE, USED_BY, IS_EPHEMERAL
        With show_all the header additionally ends with:
            NAME_ON_CLOUD, STORAGE_CLASS, ACCESS_MODE
        """
        extra_columns = ['IS_EPHEMERAL']
        if show_all:
            extra_columns += ['NAME_ON_CLOUD', 'STORAGE_CLASS', 'ACCESS_MODE']
        return log_utils.create_table(_BASIC_COLUMNS + extra_columns)

    def _add_rows(self,
                  volumes: List[responses.VolumeRecord],
                  show_all: bool = False) -> None:
        """Append one table row per PVC volume record."""
        for record in volumes:
            cells = self._get_row_base_columns(record, show_all)
            cells.append(record.get('is_ephemeral', False))
            if show_all:
                # Same order as the extra header columns.
                cells.extend([
                    record.get('name_on_cloud', ''),
                    record.get('config', {}).get('storage_class_name', '-'),
                    record.get('config', {}).get('access_mode', ''),
                ])
            self.table.add_row(cells)

    def format(self) -> str:
        """Return the titled PVC table as a string."""
        title = 'Kubernetes PVCs:\n'
        return title + str(self.table)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
class RunPodVolumeTable(VolumeTable):
    """Table of RunPod network volumes."""

    def _create_table(self, show_all: bool = False) -> prettytable.PrettyTable:
        """Build the empty RunPod table.

        Default header:
            NAME, TYPE, INFRA, SIZE, USER, WORKSPACE,
            AGE, STATUS, LAST_USE, USED_BY
        With show_all the header additionally ends with NAME_ON_CLOUD.
        """
        columns = (_BASIC_COLUMNS +
                   ['NAME_ON_CLOUD'] if show_all else _BASIC_COLUMNS)
        return log_utils.create_table(columns)

    def _add_rows(self,
                  volumes: List[responses.VolumeRecord],
                  show_all: bool = False) -> None:
        """Append one table row per RunPod volume record."""
        for record in volumes:
            cells = self._get_row_base_columns(record, show_all)
            if show_all:
                cells.append(record.get('name_on_cloud', ''))
            self.table.add_row(cells)

    def format(self) -> str:
        """Return the titled RunPod table as a string."""
        title = 'RunPod Network Volumes:\n'
        return title + str(self.table)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def format_volume_table(volumes: List[responses.VolumeRecord],
                        show_all: bool = False) -> str:
    """Format volume records for display, one table per volume type.

    Records are grouped by their 'type' field, in the order each type is
    first seen. Records whose type is not a ``volume.VolumeType`` value are
    reported with a warning and left out.

    Args:
        volumes: The volume records to display.
        show_all: Forwarded to each table; selects the extended column set
            and untruncated cells.

    Returns:
        str: The formatted tables separated by a blank line, or
        'No existing volumes.' if there is nothing to show.
    """
    # Table implementation for each volume type that can be rendered.
    table_classes = {
        volume.VolumeType.PVC.value: PVCVolumeTable,
        volume.VolumeType.RUNPOD_NETWORK_VOLUME.value: RunPodVolumeTable,
    }
    supported_volume_types = [
        volume_type.value for volume_type in volume.VolumeType
    ]

    volumes_per_type: Dict[str, List[responses.VolumeRecord]] = {}
    for row in volumes:
        volume_type = row.get('type', '')
        if volume_type not in supported_volume_types:
            logger.warning(f'Unknown volume type: {volume_type}')
            continue
        volumes_per_type.setdefault(volume_type, []).append(row)

    sections = []
    for volume_type, volume_list in volumes_per_type.items():
        table_cls = table_classes.get(volume_type)
        if table_cls is None:
            # A recognised type without a table implementation renders
            # nothing; skipping it here keeps it from leaving a dangling
            # blank-line separator in the output.
            continue
        sections.append(table_cls(volume_list, show_all).format())

    if not sections:
        return 'No existing volumes.'
    return '\n\n'.join(sections)
|
sky/client/cli/utils.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Utility functions for the CLI."""
|
|
2
|
+
import enum
|
|
3
|
+
import typing
|
|
4
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
5
|
+
|
|
6
|
+
from sky import exceptions
|
|
7
|
+
from sky import jobs as managed_jobs
|
|
8
|
+
from sky.schemas.api import responses
|
|
9
|
+
from sky.server import common as server_common
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class QueueResultVersion(enum.Enum):
    """Shape of the result returned by a managed job queue request.

    V1 (legacy result):
        - job_records (List[responses.ManagedJobRecord]): A list of dicts,
          one per job, holding that job's information.
    V2 (current result):
        - job_records (List[responses.ManagedJobRecord]): A list of dicts,
          one per job, holding that job's information.
        - total (int): Number of jobs after filtering.
        - status_counts (Dict[str, int]): Per-status counts after filtering.
        - total_no_filter (int): Number of jobs before filtering.
    """
    V1 = 'v1'
    V2 = 'v2'

    def v2(self) -> bool:
        """Return True if this member is V2."""
        # Enum members are singletons, so identity matches equality here.
        return self is QueueResultVersion.V2
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_managed_job_queue(
    refresh: bool,
    skip_finished: bool = False,
    all_users: bool = False,
    job_ids: Optional[List[int]] = None,
    limit: Optional[int] = None,
    fields: Optional[List[str]] = None,
) -> Tuple[server_common.RequestId[Union[List[responses.ManagedJobRecord],
                                         Tuple[List[responses.ManagedJobRecord],
                                               int, Dict[str, int], int]]],
           QueueResultVersion]:
    """Submits a managed job queue request, preferring the v2 API.

    ``managed_jobs.queue_v2`` is tried first. If it raises
    ``exceptions.APINotSupportedError``, the request is re-issued through
    ``managed_jobs.queue``; that legacy call is not given ``limit`` or
    ``fields``.

    Please refer to sky.cli.job_queue for documentation.

    Args:
        refresh: Whether to restart the jobs controller if it is stopped.
        skip_finished: Whether to skip finished jobs.
        all_users: Whether to show all users' jobs.
        job_ids: IDs of the managed jobs to show.
        limit: Number of jobs to show.
        fields: Fields to get for the managed jobs.

    Returns:
        - the request ID of the queue request
        - the version of the queue result

    Request Raises:
        sky.exceptions.ClusterNotUpError: the jobs controller is not up or
            does not exist.
        RuntimeError: if failed to get the managed jobs with ssh.
    """
    try:
        request_id = typing.cast(
            server_common.RequestId[
                Union[List[responses.ManagedJobRecord],
                      Tuple[List[responses.ManagedJobRecord], int,
                            Dict[str, int], int]]],
            managed_jobs.queue_v2(refresh, skip_finished, all_users, job_ids,
                                  limit, fields))
        result_version = QueueResultVersion.V2
    except exceptions.APINotSupportedError:
        # Fall back to the legacy queue call.
        request_id = typing.cast(
            server_common.RequestId[
                Union[List[responses.ManagedJobRecord],
                      Tuple[List[responses.ManagedJobRecord], int,
                            Dict[str, int], int]]],
            managed_jobs.queue(refresh, skip_finished, all_users, job_ids))
        result_version = QueueResultVersion.V1
    return request_id, result_version
|
sky/client/common.py
CHANGED
|
@@ -16,8 +16,10 @@ import zipfile
|
|
|
16
16
|
|
|
17
17
|
from sky import sky_logging
|
|
18
18
|
from sky.adaptors import common as adaptors_common
|
|
19
|
+
from sky.client import service_account_auth
|
|
19
20
|
from sky.data import data_utils
|
|
20
21
|
from sky.data import storage_utils
|
|
22
|
+
from sky.schemas.api import responses as api_responses
|
|
21
23
|
from sky.server import common as server_common
|
|
22
24
|
from sky.server.requests import payloads
|
|
23
25
|
from sky.skylet import constants
|
|
@@ -31,7 +33,7 @@ if typing.TYPE_CHECKING:
|
|
|
31
33
|
import requests
|
|
32
34
|
|
|
33
35
|
import sky
|
|
34
|
-
import
|
|
36
|
+
from sky import dag as dag_lib
|
|
35
37
|
else:
|
|
36
38
|
httpx = adaptors_common.LazyImport('httpx')
|
|
37
39
|
requests = adaptors_common.LazyImport('requests')
|
|
@@ -42,8 +44,10 @@ logger = sky_logging.init_logger(__name__)
|
|
|
42
44
|
_DOWNLOAD_CHUNK_BYTES = 8192
|
|
43
45
|
# The chunk size for the zip file to be uploaded to the API server. We split
|
|
44
46
|
# the zip file into chunks to avoid network issues for large request body that
|
|
45
|
-
# can be caused by NGINX's client_max_body_size.
|
|
46
|
-
|
|
47
|
+
# can be caused by NGINX's client_max_body_size or Cloudflare's upload limit.
|
|
48
|
+
# As of 09/25/2025, the upload limit for Cloudflare's free plan is 100MiB:
|
|
49
|
+
# https://developers.cloudflare.com/support/troubleshooting/http-status-codes/4xx-client-error/error-413/
|
|
50
|
+
_UPLOAD_CHUNK_BYTES = 100 * 1024 * 1024
|
|
47
51
|
|
|
48
52
|
FILE_UPLOAD_LOGS_DIR = os.path.join(constants.SKY_LOGS_DIRECTORY,
|
|
49
53
|
'file_uploads')
|
|
@@ -79,11 +83,20 @@ def download_logs_from_api_server(
|
|
|
79
83
|
remote_machine_prefix,
|
|
80
84
|
local_machine_prefix) for remote_path in paths_on_api_server
|
|
81
85
|
}
|
|
86
|
+
# Check if any local log directories already exist before downloading
|
|
87
|
+
for local_path in remote2local_path_dict.values():
|
|
88
|
+
expanded_path = os.path.expanduser(local_path)
|
|
89
|
+
if os.path.exists(expanded_path):
|
|
90
|
+
logger.warning(
|
|
91
|
+
f'Log directory {local_path} already exists. '
|
|
92
|
+
f'This may overwrite logs from a previous cluster with the '
|
|
93
|
+
f'same name and job ID.')
|
|
82
94
|
body = payloads.DownloadBody(folder_paths=list(paths_on_api_server),)
|
|
83
|
-
response =
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
95
|
+
response = server_common.make_authenticated_request(
|
|
96
|
+
'POST',
|
|
97
|
+
'/download',
|
|
98
|
+
json=json.loads(body.model_dump_json()),
|
|
99
|
+
stream=True)
|
|
87
100
|
if response.status_code == 200:
|
|
88
101
|
remote_home_path = response.headers.get('X-Home-Path')
|
|
89
102
|
assert remote_home_path is not None, response.headers
|
|
@@ -164,14 +177,19 @@ class UploadChunkParams:
|
|
|
164
177
|
log_file: str
|
|
165
178
|
|
|
166
179
|
|
|
167
|
-
def _upload_chunk_with_retry(params: UploadChunkParams) ->
|
|
168
|
-
"""Uploads a chunk of a zip file to the API server.
|
|
180
|
+
def _upload_chunk_with_retry(params: UploadChunkParams) -> str:
|
|
181
|
+
"""Uploads a chunk of a zip file to the API server.
|
|
182
|
+
|
|
183
|
+
Returns:
|
|
184
|
+
Status of the upload.
|
|
185
|
+
"""
|
|
169
186
|
upload_logger = params.upload_logger
|
|
170
187
|
upload_logger.info(
|
|
171
188
|
f'Uploading chunk: {params.chunk_index + 1} / {params.total_chunks}')
|
|
172
189
|
|
|
173
190
|
server_url = server_common.get_server_url()
|
|
174
191
|
max_attempts = 3
|
|
192
|
+
sa_headers = service_account_auth.get_service_account_headers()
|
|
175
193
|
with open(params.file_path, 'rb') as f:
|
|
176
194
|
for attempt in range(max_attempts):
|
|
177
195
|
response = params.client.post(
|
|
@@ -184,19 +202,23 @@ def _upload_chunk_with_retry(params: UploadChunkParams) -> None:
|
|
|
184
202
|
},
|
|
185
203
|
content=FileChunkIterator(f, _UPLOAD_CHUNK_BYTES,
|
|
186
204
|
params.chunk_index),
|
|
187
|
-
headers={
|
|
205
|
+
headers={
|
|
206
|
+
'Content-Type': 'application/octet-stream',
|
|
207
|
+
**sa_headers,
|
|
208
|
+
},
|
|
188
209
|
cookies=server_common.get_api_cookie_jar())
|
|
189
210
|
if response.status_code == 200:
|
|
190
211
|
data = response.json()
|
|
191
212
|
status = data.get('status')
|
|
192
213
|
msg = ('Uploaded chunk: '
|
|
193
|
-
f'{params.chunk_index + 1} / {params.total_chunks}'
|
|
194
|
-
|
|
214
|
+
f'{params.chunk_index + 1} / {params.total_chunks} '
|
|
215
|
+
f'(Status: {status})')
|
|
216
|
+
if status == api_responses.UploadStatus.UPLOADING.value:
|
|
195
217
|
missing_chunks = data.get('missing_chunks')
|
|
196
218
|
if missing_chunks:
|
|
197
219
|
msg += f' - Waiting for chunks: {missing_chunks}'
|
|
198
220
|
upload_logger.info(msg)
|
|
199
|
-
return
|
|
221
|
+
return status
|
|
200
222
|
elif attempt < max_attempts - 1:
|
|
201
223
|
upload_logger.error(
|
|
202
224
|
f'Failed to upload chunk: '
|
|
@@ -204,17 +226,29 @@ def _upload_chunk_with_retry(params: UploadChunkParams) -> None:
|
|
|
204
226
|
f'{response.content.decode("utf-8")}')
|
|
205
227
|
upload_logger.info(
|
|
206
228
|
f'Retrying... ({attempt + 1} / {max_attempts})')
|
|
207
|
-
|
|
229
|
+
if response.status_code == 503:
|
|
230
|
+
# If the server is temporarily unavailable,
|
|
231
|
+
# wait a little longer before retrying.
|
|
232
|
+
time.sleep(10)
|
|
233
|
+
else:
|
|
234
|
+
time.sleep(1)
|
|
208
235
|
else:
|
|
236
|
+
try:
|
|
237
|
+
response_details = response.json().get('detail')
|
|
238
|
+
except Exception: # pylint: disable=broad-except
|
|
239
|
+
response_details = response.content
|
|
209
240
|
error_msg = (
|
|
210
241
|
f'Failed to upload chunk: {params.chunk_index + 1} / '
|
|
211
|
-
f'{params.total_chunks}: {
|
|
242
|
+
f'{params.total_chunks}: {response_details} '
|
|
243
|
+
f'(Status code: {response.status_code})')
|
|
212
244
|
upload_logger.error(error_msg)
|
|
213
245
|
with ux_utils.print_exception_no_traceback():
|
|
214
246
|
raise RuntimeError(
|
|
215
247
|
ux_utils.error_message(error_msg + '\n',
|
|
216
248
|
params.log_file,
|
|
217
249
|
is_local=True))
|
|
250
|
+
# If we reach here, the upload failed.
|
|
251
|
+
return 'failed'
|
|
218
252
|
|
|
219
253
|
|
|
220
254
|
@contextlib.contextmanager
|
|
@@ -267,7 +301,7 @@ def upload_mounts_to_api_server(dag: 'sky.Dag',
|
|
|
267
301
|
upload_list = []
|
|
268
302
|
for task_ in dag.tasks:
|
|
269
303
|
task_.file_mounts_mapping = {}
|
|
270
|
-
if task_.workdir:
|
|
304
|
+
if task_.workdir and isinstance(task_.workdir, str):
|
|
271
305
|
workdir = task_.workdir
|
|
272
306
|
assert os.path.isabs(workdir)
|
|
273
307
|
upload_list.append(workdir)
|
|
@@ -299,14 +333,12 @@ def upload_mounts_to_api_server(dag: 'sky.Dag',
|
|
|
299
333
|
task_.file_mounts_mapping[src] = _full_path(src)
|
|
300
334
|
if (task_.service is not None and
|
|
301
335
|
task_.service.tls_credential is not None):
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
task_.file_mounts_mapping[
|
|
308
|
-
task_.service.tls_credential.
|
|
309
|
-
certfile] = task_.service.tls_credential.certfile
|
|
336
|
+
keyfile = task_.service.tls_credential.keyfile
|
|
337
|
+
certfile = task_.service.tls_credential.certfile
|
|
338
|
+
upload_list.append(_full_path(keyfile))
|
|
339
|
+
upload_list.append(_full_path(certfile))
|
|
340
|
+
task_.file_mounts_mapping[keyfile] = _full_path(keyfile)
|
|
341
|
+
task_.file_mounts_mapping[certfile] = _full_path(certfile)
|
|
310
342
|
|
|
311
343
|
if upload_list:
|
|
312
344
|
os.makedirs(os.path.expanduser(FILE_UPLOAD_LOGS_DIR), exist_ok=True)
|
|
@@ -339,15 +371,29 @@ def upload_mounts_to_api_server(dag: 'sky.Dag',
|
|
|
339
371
|
log_file,
|
|
340
372
|
is_local=True))
|
|
341
373
|
|
|
374
|
+
upload_completed = False
|
|
342
375
|
with httpx.Client(timeout=timeout) as client:
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
376
|
+
total_retries = 3
|
|
377
|
+
for retry in range(total_retries):
|
|
378
|
+
chunk_params = [
|
|
379
|
+
UploadChunkParams(client, upload_id, chunk_index,
|
|
380
|
+
total_chunks, temp_zip_file.name,
|
|
381
|
+
upload_logger, log_file)
|
|
382
|
+
for chunk_index in range(total_chunks)
|
|
383
|
+
]
|
|
384
|
+
statuses = subprocess_utils.run_in_parallel(
|
|
385
|
+
_upload_chunk_with_retry, chunk_params)
|
|
386
|
+
if any(status == api_responses.UploadStatus.COMPLETED.value
|
|
387
|
+
for status in statuses):
|
|
388
|
+
upload_completed = True
|
|
389
|
+
break
|
|
390
|
+
else:
|
|
391
|
+
upload_logger.info(
|
|
392
|
+
f'No chunk upload returned completed status. '
|
|
393
|
+
'Retrying entire upload... '
|
|
394
|
+
f'({retry + 1} / {total_retries})')
|
|
395
|
+
if not upload_completed:
|
|
396
|
+
raise RuntimeError('Failed to upload files to API server.')
|
|
351
397
|
os.unlink(temp_zip_file.name)
|
|
352
398
|
upload_logger.info(f'Uploaded files: {upload_list}')
|
|
353
399
|
logger.info(
|
sky/client/oauth.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Client-side OAuth module."""
|
|
2
|
+
from http.server import BaseHTTPRequestHandler
|
|
3
|
+
from http.server import HTTPServer
|
|
4
|
+
import threading
|
|
5
|
+
import time
|
|
6
|
+
from typing import Dict, Optional
|
|
7
|
+
|
|
8
|
+
AUTH_TIMEOUT = 300 # 5 minutes
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class _AuthCallbackHandler(BaseHTTPRequestHandler):
    """HTTP request handler for OAuth callback.

    A POST whose body is a non-empty UTF-8 string is taken as the token: it
    is stored under ``token_container['token']`` and answered with 200. Any
    other POST is answered with 400 and leaves the container untouched.
    """

    def __init__(self, token_container: Dict[str, Optional[str]],
                 remote_endpoint: str, *args, **kwargs):
        # These must be assigned before super().__init__(), because the base
        # class constructor processes the request (and so calls do_POST).
        self.token_container = token_container
        self.remote_endpoint = remote_endpoint
        super().__init__(*args, **kwargs)

    def _read_token(self) -> Optional[str]:
        """Read the token from the request body.

        Returns:
            The decoded token, or None if the Content-Length header is
            missing, not a positive integer, or the body is empty or not
            valid UTF-8.
        """
        try:
            length = int(self.headers.get('Content-Length'))
        except (TypeError, ValueError):
            # Header missing (None) or not a number.
            return None
        if length <= 0:
            # A negative size would make read() consume the stream until
            # EOF instead of reading a bounded body.
            return None
        data = self.rfile.read(length)
        if not data:
            return None
        try:
            return data.decode('utf-8')
        except UnicodeDecodeError:
            return None

    def do_POST(self):  # pylint: disable=invalid-name
        """Handle POST request for OAuth callback."""
        token = self._read_token()
        if token is not None:
            self.token_container['token'] = token
            self.send_response(200)
        else:
            self.send_response(400)
        self.send_header('Content-type', 'text/html')
        # Sent on success and failure alike, echoing the remote endpoint.
        self.send_header('Access-Control-Allow-Origin', self.remote_endpoint)
        self.end_headers()

    def log_message(self, *args):  # pylint: disable=unused-argument
        """Suppress default HTTP server logging."""
        pass
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def start_local_auth_server(port: int,
                            token_store: Dict[str, Optional[str]],
                            remote_endpoint: str,
                            timeout: int = AUTH_TIMEOUT) -> HTTPServer:
    """Start a local HTTP server to handle OAuth callback.

    The server is bound to localhost and served from a daemon thread, which
    stops handling requests once ``token_store['token']`` is set or
    ``timeout`` seconds have elapsed in total. The server socket is not
    closed here.

    Args:
        port: Port to bind the server to.
        token_store: Dict to store the received token, under the 'token'
            key. A missing key is treated as "no token yet".
        remote_endpoint: The endpoint of the SkyPilot API server that will send
            the token, needed for CORS.
        timeout: Timeout in seconds to wait for the callback.

    Returns:
        The HTTP server instance.
    """

    def handler_factory(*args, **kwargs):
        return _AuthCallbackHandler(token_store, remote_endpoint, *args,
                                    **kwargs)

    server = HTTPServer(('localhost', port), handler_factory)
    server.timeout = timeout

    def serve_until_token():
        """Serve requests until token is received or timeout."""
        # Monotonic clock: the deadline must not move if the wall clock is
        # adjusted while waiting.
        deadline = time.monotonic() + timeout
        while token_store.get('token') is None:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Wait only for the time that is left, so that a request which
            # does not deliver a token cannot extend the overall wait by
            # another full timeout.
            server.timeout = remaining
            server.handle_request()

    # Start server in a separate thread
    server_thread = threading.Thread(target=serve_until_token, daemon=True)
    server_thread.start()

    return server
|