skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/utils/log_utils.py
CHANGED
|
@@ -47,13 +47,16 @@ class RayUpLineProcessor(LineProcessor):
|
|
|
47
47
|
RUNTIME_SETUP = 1
|
|
48
48
|
PULLING_DOCKER_IMAGES = 2
|
|
49
49
|
|
|
50
|
-
def __init__(self, log_path: str):
|
|
50
|
+
def __init__(self, log_path: str, cluster_name: Optional[str] = None):
|
|
51
51
|
self.log_path = log_path
|
|
52
|
+
self.cluster_name = cluster_name
|
|
52
53
|
|
|
53
54
|
def __enter__(self) -> None:
|
|
54
55
|
self.state = self.ProvisionStatus.LAUNCH
|
|
55
56
|
self.status_display = rich_utils.safe_status(
|
|
56
|
-
ux_utils.spinner_message('Launching',
|
|
57
|
+
ux_utils.spinner_message('Launching',
|
|
58
|
+
self.log_path,
|
|
59
|
+
cluster_name=self.cluster_name))
|
|
57
60
|
self.status_display.start()
|
|
58
61
|
|
|
59
62
|
def process_line(self, log_line: str) -> None:
|
|
@@ -62,19 +65,25 @@ class RayUpLineProcessor(LineProcessor):
|
|
|
62
65
|
logger.info(' Head VM is up.')
|
|
63
66
|
self.status_display.update(
|
|
64
67
|
ux_utils.spinner_message(
|
|
65
|
-
'Launching - Preparing SkyPilot runtime',
|
|
68
|
+
'Launching - Preparing SkyPilot runtime',
|
|
69
|
+
self.log_path,
|
|
70
|
+
cluster_name=self.cluster_name))
|
|
66
71
|
self.state = self.ProvisionStatus.RUNTIME_SETUP
|
|
67
72
|
if ('Pulling from' in log_line and
|
|
68
73
|
self.state == self.ProvisionStatus.RUNTIME_SETUP):
|
|
69
74
|
self.status_display.update(
|
|
70
75
|
ux_utils.spinner_message(
|
|
71
|
-
'Launching - Initializing docker container',
|
|
76
|
+
'Launching - Initializing docker container',
|
|
77
|
+
self.log_path,
|
|
78
|
+
cluster_name=self.cluster_name))
|
|
72
79
|
self.state = self.ProvisionStatus.PULLING_DOCKER_IMAGES
|
|
73
80
|
if ('Status: Downloaded newer image' in log_line and
|
|
74
81
|
self.state == self.ProvisionStatus.PULLING_DOCKER_IMAGES):
|
|
75
82
|
self.status_display.update(
|
|
76
83
|
ux_utils.spinner_message(
|
|
77
|
-
'Launching - Preparing SkyPilot runtime',
|
|
84
|
+
'Launching - Preparing SkyPilot runtime',
|
|
85
|
+
self.log_path,
|
|
86
|
+
cluster_name=self.cluster_name))
|
|
78
87
|
self.state = self.ProvisionStatus.RUNTIME_SETUP
|
|
79
88
|
|
|
80
89
|
def __exit__(self, except_type: Optional[Type[BaseException]],
|
|
@@ -189,108 +198,6 @@ class SkyLocalUpLineProcessor(LineProcessor):
|
|
|
189
198
|
self.status_display.stop()
|
|
190
199
|
|
|
191
200
|
|
|
192
|
-
class SkyRemoteUpLineProcessor(LineProcessor):
|
|
193
|
-
"""A processor for deploy_remote_cluster.sh log lines."""
|
|
194
|
-
|
|
195
|
-
def __init__(self, log_path: str, is_local: bool):
|
|
196
|
-
self.log_path = log_path
|
|
197
|
-
self.is_local = is_local
|
|
198
|
-
|
|
199
|
-
def __enter__(self) -> None:
|
|
200
|
-
# TODO(romilb): Use ux_utils.INDENT_SYMBOL to be consistent with other
|
|
201
|
-
# messages.
|
|
202
|
-
status = rich_utils.safe_status(
|
|
203
|
-
ux_utils.spinner_message('Creating remote cluster',
|
|
204
|
-
log_path=self.log_path,
|
|
205
|
-
is_local=self.is_local))
|
|
206
|
-
self.status_display = status
|
|
207
|
-
self.status_display.start()
|
|
208
|
-
|
|
209
|
-
def process_line(self, log_line: str) -> None:
|
|
210
|
-
# Pre-flight checks
|
|
211
|
-
if 'SSH connection successful' in log_line:
|
|
212
|
-
logger.info(f'{colorama.Fore.GREEN}SSH connection established.'
|
|
213
|
-
f'{colorama.Style.RESET_ALL}')
|
|
214
|
-
|
|
215
|
-
# Kubernetes installation steps
|
|
216
|
-
if 'Deploying Kubernetes on head node' in log_line:
|
|
217
|
-
self.status_display.update(
|
|
218
|
-
ux_utils.spinner_message(
|
|
219
|
-
'Creating remote cluster - '
|
|
220
|
-
'deploying Kubernetes on head node',
|
|
221
|
-
log_path=self.log_path,
|
|
222
|
-
is_local=self.is_local))
|
|
223
|
-
if 'K3s deployed on head node.' in log_line:
|
|
224
|
-
logger.info(f'{colorama.Fore.GREEN}'
|
|
225
|
-
'✔ K3s successfully deployed on head node.'
|
|
226
|
-
f'{colorama.Style.RESET_ALL}')
|
|
227
|
-
|
|
228
|
-
# Worker nodes
|
|
229
|
-
if 'Deploying Kubernetes on worker node' in log_line:
|
|
230
|
-
self.status_display.update(
|
|
231
|
-
ux_utils.spinner_message(
|
|
232
|
-
'Creating remote cluster - '
|
|
233
|
-
'deploying Kubernetes on worker nodes',
|
|
234
|
-
log_path=self.log_path,
|
|
235
|
-
is_local=self.is_local))
|
|
236
|
-
if 'Kubernetes deployed on worker node' in log_line:
|
|
237
|
-
logger.info(f'{colorama.Fore.GREEN}'
|
|
238
|
-
'✔ K3s successfully deployed on worker node.'
|
|
239
|
-
f'{colorama.Style.RESET_ALL}')
|
|
240
|
-
|
|
241
|
-
# Cluster configuration
|
|
242
|
-
if 'Configuring local kubectl to connect to the cluster...' in log_line:
|
|
243
|
-
self.status_display.update(
|
|
244
|
-
ux_utils.spinner_message(
|
|
245
|
-
'Creating remote cluster - '
|
|
246
|
-
'configuring local kubectl',
|
|
247
|
-
log_path=self.log_path,
|
|
248
|
-
is_local=self.is_local))
|
|
249
|
-
if 'kubectl configured to connect to the cluster.' in log_line:
|
|
250
|
-
logger.info(f'{colorama.Fore.GREEN}'
|
|
251
|
-
'✔ kubectl configured for the remote cluster.'
|
|
252
|
-
f'{colorama.Style.RESET_ALL}')
|
|
253
|
-
|
|
254
|
-
# GPU operator installation
|
|
255
|
-
if 'Installing Nvidia GPU Operator...' in log_line:
|
|
256
|
-
self.status_display.update(
|
|
257
|
-
ux_utils.spinner_message(
|
|
258
|
-
'Creating remote cluster - '
|
|
259
|
-
'installing Nvidia GPU Operator',
|
|
260
|
-
log_path=self.log_path,
|
|
261
|
-
is_local=self.is_local))
|
|
262
|
-
if 'GPU Operator installed.' in log_line:
|
|
263
|
-
logger.info(f'{colorama.Fore.GREEN}'
|
|
264
|
-
'✔ Nvidia GPU Operator installed successfully.'
|
|
265
|
-
f'{colorama.Style.RESET_ALL}')
|
|
266
|
-
|
|
267
|
-
# Cleanup steps
|
|
268
|
-
if 'Cleaning up head node' in log_line:
|
|
269
|
-
self.status_display.update(
|
|
270
|
-
ux_utils.spinner_message('Cleaning up head node',
|
|
271
|
-
log_path=self.log_path,
|
|
272
|
-
is_local=self.is_local))
|
|
273
|
-
if 'Cleaning up node' in log_line:
|
|
274
|
-
self.status_display.update(
|
|
275
|
-
ux_utils.spinner_message('Cleaning up worker node',
|
|
276
|
-
log_path=self.log_path,
|
|
277
|
-
is_local=self.is_local))
|
|
278
|
-
if 'cleaned up successfully' in log_line:
|
|
279
|
-
logger.info(f'{colorama.Fore.GREEN}'
|
|
280
|
-
f'{log_line.strip()}{colorama.Style.RESET_ALL}')
|
|
281
|
-
|
|
282
|
-
# Final status
|
|
283
|
-
if 'Cluster deployment completed.' in log_line:
|
|
284
|
-
logger.info(f'{colorama.Fore.GREEN}✔ Remote k3s is running.'
|
|
285
|
-
f'{colorama.Style.RESET_ALL}')
|
|
286
|
-
|
|
287
|
-
def __exit__(self, except_type: Optional[Type[BaseException]],
|
|
288
|
-
except_value: Optional[BaseException],
|
|
289
|
-
traceback: Optional[types.TracebackType]) -> None:
|
|
290
|
-
del except_type, except_value, traceback # unused
|
|
291
|
-
self.status_display.stop()
|
|
292
|
-
|
|
293
|
-
|
|
294
201
|
def create_table(field_names: List[str], **kwargs) -> prettytable.PrettyTable:
|
|
295
202
|
"""Creates table with default style."""
|
|
296
203
|
border = kwargs.pop('border', False)
|
|
@@ -356,6 +263,74 @@ def readable_time_duration(start: Optional[float],
|
|
|
356
263
|
return diff
|
|
357
264
|
|
|
358
265
|
|
|
266
|
+
def human_duration(start: int, end: Optional[int] = None) -> str:
    """Render the elapsed time between two Unix timestamps.

    Mirrors Kubernetes' HumanDuration formatting: the result keeps at most
    two units of precision, e.g. "2d3h", "15m", "30s".

    Args:
        start: Start time as seconds since the epoch.
        end: End time as seconds since the epoch; defaults to now.

    Returns:
        A human-readable duration string, or "0s" for zero, negative, or
        invalid timestamps.
    """
    # Treat a missing/zero/negative start timestamp as "no duration".
    if not start or start <= 0:
        return '0s'
    if end is None:
        end = int(time.time())
    elapsed = end - start

    if elapsed <= 0:
        return '0s'
    # Below two minutes, report raw seconds.
    if elapsed < 2 * 60:
        return f'{elapsed}s'

    total_minutes = int(elapsed / 60)
    if total_minutes < 10:
        leftover_secs = int(elapsed) % 60
        return (f'{total_minutes}m'
                if leftover_secs == 0 else f'{total_minutes}m{leftover_secs}s')
    if total_minutes < 3 * 60:
        return f'{total_minutes}m'

    total_hours = int(elapsed / (60 * 60))
    total_days = int(total_hours / 24)
    total_years = int(total_hours / 24 / 365)
    if total_hours < 8:
        leftover_mins = total_minutes % 60
        return (f'{total_hours}h'
                if leftover_mins == 0 else f'{total_hours}h{leftover_mins}m')
    if total_hours < 48:
        return f'{total_hours}h'
    if total_hours < 24 * 8:
        leftover_hours = total_hours % 24
        return (f'{total_days}d'
                if leftover_hours == 0 else f'{total_days}d{leftover_hours}h')
    if total_hours < 24 * 365 * 2:
        return f'{total_days}d'
    if total_hours < 24 * 365 * 8:
        leftover_days = total_days % 365
        return (f'{total_years}y'
                if leftover_days == 0 else f'{total_years}y{leftover_days}d')
    return f'{total_years}y'
|
332
|
+
|
|
333
|
+
|
|
359
334
|
def follow_logs(
|
|
360
335
|
file: TextIO,
|
|
361
336
|
*,
|
sky/utils/perf_utils.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Utility functions for performance monitoring."""
|
|
2
|
+
import os
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from sky import sky_logging
|
|
6
|
+
from sky.skylet import constants
|
|
7
|
+
|
|
8
|
+
logger = sky_logging.init_logger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_loop_lag_threshold() -> Optional[float]:
    """Read the event-loop lag threshold from the environment.

    The environment variable is specified in milliseconds; the returned
    value is converted to seconds.

    Returns:
        The threshold in seconds, or None when the variable is unset or
        cannot be parsed as a number.
    """
    raw = os.getenv(constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS, None)
    if raw is None:
        return None
    try:
        # Env var is in milliseconds; callers expect seconds.
        return float(raw) / 1000.0
    except ValueError:
        logger.warning(
            f'Invalid value for {constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS}:'
            f' {raw}')
        return None
|
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
"""Resource checking utilities for finding active clusters and managed jobs."""
|
|
2
|
+
|
|
3
|
+
import concurrent.futures
|
|
4
|
+
from typing import Any, Callable, Dict, List, Tuple
|
|
5
|
+
|
|
6
|
+
from sky import exceptions
|
|
7
|
+
from sky import global_user_state
|
|
8
|
+
from sky import sky_logging
|
|
9
|
+
from sky.skylet import constants
|
|
10
|
+
|
|
11
|
+
logger = sky_logging.init_logger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def check_no_active_resources_for_users(
        user_operations: List[Tuple[str, str]]) -> None:
    """Verify that none of the given users own active clusters or jobs.

    Args:
        user_operations: Tuples of (user_id, operation), where operation
            is 'update' or 'delete'.

    Raises:
        ValueError: If any user still has active clusters or managed jobs;
            the message aggregates every offending user.
    """
    if not user_operations:
        return

    def _make_user_filter(target_user: str):
        # Match clusters/jobs whose 'user_hash' equals the target user.
        def _matches(resource: Dict[str, Any]) -> bool:
            return resource.get('user_hash') == target_user

        return _matches

    _check_active_resources(user_operations, _make_user_filter, 'user')
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def check_no_active_resources_for_workspaces(
        workspace_operations: List[Tuple[str, str]]) -> None:
    """Verify that none of the given workspaces hold active resources.

    Args:
        workspace_operations: Tuples of (workspace_name, operation), where
            operation is 'update' or 'delete'.

    Raises:
        ValueError: If any workspace still has active clusters or managed
            jobs; the message aggregates every offending workspace.
    """
    if not workspace_operations:
        return

    def _make_workspace_filter(target_workspace: str):
        # Resources without an explicit workspace belong to the default
        # workspace.
        def _matches(resource: Dict[str, Any]) -> bool:
            workspace = resource.get('workspace',
                                     constants.SKYPILOT_DEFAULT_WORKSPACE)
            return workspace == target_workspace

        return _matches

    _check_active_resources(workspace_operations, _make_workspace_filter,
                            'workspace')
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _check_active_resources(resource_operations: List[Tuple[str, str]],
                            filter_factory: Callable[[str],
                                                     Callable[[Dict[str, Any]],
                                                              bool]],
                            resource_type: str) -> None:
    """Check if resource entities have active clusters or managed jobs.

    Shared implementation behind the per-user and per-workspace checks:
    fetches all active clusters and managed jobs once, then tests every
    (name, operation) pair against them using a caller-supplied filter.

    Args:
        resource_operations: List of tuples (resource_name, operation) where
            operation is 'update' or 'delete'.
        filter_factory: Function that takes a resource_name and returns a filter
            function for clusters/jobs.
        resource_type: Type of resource being checked ('user' or 'workspace').

    Raises:
        ValueError: If any resource has active clusters or managed jobs.
            All offending resources are reported in a single message.
    """

    # One global fetch shared by every resource check below.
    all_clusters, all_managed_jobs = _get_active_resources()

    # Collect all error messages instead of raising immediately
    error_messages = []

    # Check each resource against the fetched data
    for resource_name, operation in resource_operations:
        resource_filter = filter_factory(resource_name)

        # Filter clusters for this resource
        resource_clusters = [
            cluster for cluster in all_clusters if resource_filter(cluster)
        ]

        # Filter managed jobs for this resource
        resource_active_jobs = [
            job for job in all_managed_jobs if resource_filter(job)
        ]

        # Collect error messages for this resource
        resource_errors = []

        if resource_clusters:
            active_cluster_names = [
                cluster['name'] for cluster in resource_clusters
            ]
            cluster_list = ', '.join(active_cluster_names)
            resource_errors.append(
                f'{len(resource_clusters)} active cluster(s): {cluster_list}')

        if resource_active_jobs:
            job_names = [str(job['job_id']) for job in resource_active_jobs]
            job_list = ', '.join(job_names)
            resource_errors.append(
                f'{len(resource_active_jobs)} active managed job(s): '
                f'{job_list}')

        # If this resource has issues, add to overall error messages
        if resource_errors:
            resource_error_summary = ' and '.join(resource_errors)
            if resource_type == 'user':
                # resource_name is user_id; swap in the display name (if any)
                # so the error message is human-readable.
                user_info = global_user_state.get_user(resource_name)
                if user_info and user_info.name:
                    resource_name = user_info.name
            error_messages.append(
                f'Cannot {operation} {resource_type} {resource_name!r} '
                f'because it has {resource_error_summary}.')

    # If we collected any errors, raise them all together
    if error_messages:
        if len(error_messages) == 1:
            # Single resource error
            full_message = error_messages[
                0] + ' Please terminate these resources first.'
        else:
            # Multiple resource errors: one bullet per offending resource.
            full_message = (f'Cannot proceed due to active resources in '
                            f'{len(error_messages)} {resource_type}(s):\n' +
                            '\n'.join(f'• {msg}' for msg in error_messages) +
                            '\nPlease terminate these resources first.')
        raise ValueError(full_message)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def check_users_workspaces_active_resources(
    user_ids: List[str],
    workspace_names: List[str]) -> Tuple[str, List[str], Dict[str, str]]:
    """Check if all the active clusters or managed jobs in workspaces
    belong to the user_ids. If not, return the error message.

    Args:
        user_ids: List of user_id.
        workspace_names: List of workspace_name.

    Returns:
        resource_error_summary: str
        missed_users_names: List[str]
        missed_user_dict: Dict[str, str]
    """
    clusters, managed_jobs = _get_active_resources_for_workspaces(
        workspace_names)

    error_parts = []
    foreign_user_hashes = set()
    foreign_cluster_names = []
    foreign_job_ids = []

    # Clusters owned by someone outside user_ids.
    for cluster in clusters or []:
        owner = cluster.get('user_hash')
        if owner and owner not in user_ids:
            foreign_user_hashes.add(owner)
            foreign_cluster_names.append(cluster['name'])
    if foreign_cluster_names:
        cluster_list = ', '.join(foreign_cluster_names)
        error_parts.append(f'{len(foreign_cluster_names)} active cluster(s):'
                           f' {cluster_list}')

    # Managed jobs owned by someone outside user_ids.
    for job in managed_jobs or []:
        owner = job.get('user_hash')
        if owner and owner not in user_ids:
            foreign_user_hashes.add(owner)
            foreign_job_ids.append(str(job['job_id']))
    if foreign_job_ids:
        job_list = ', '.join(foreign_job_ids)
        error_parts.append(f'{len(foreign_job_ids)} active'
                           f' managed job(s): {job_list}')

    summary = ' and '.join(error_parts) if error_parts else ''

    # Resolve user hashes to display names where available.
    display_names: List[str] = []
    id_to_name: Dict[str, str] = {}
    if foreign_user_hashes:
        for user in global_user_state.get_all_users():
            if user.id in foreign_user_hashes:
                label = user.name if user.name else user.id
                display_names.append(label)
                id_to_name[user.id] = label
    return summary, display_names, id_to_name
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _get_active_resources_for_workspaces(
    workspace_names: List[str]
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Fetch active clusters and managed jobs scoped to given workspaces.

    Args:
        workspace_names: List of workspace_name.

    Returns:
        all_clusters: List[Dict[str, Any]]
        all_managed_jobs: List[Dict[str, Any]]
    """
    if not workspace_names:
        return [], []

    def _membership_filter(names: List[str]):
        # Resources without an explicit workspace fall back to the default
        # workspace before the membership test.
        def _in_workspaces(resource: Dict[str, Any]) -> bool:
            workspace = resource.get('workspace',
                                     constants.SKYPILOT_DEFAULT_WORKSPACE)
            return workspace in names

        return _in_workspaces

    return _get_active_resources_by_names(workspace_names, _membership_filter)
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _get_active_resources_by_names(
    resource_names: List[str],
    filter_factory: Callable[[List[str]], Callable[[Dict[str, Any]], bool]]
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Filter the globally active clusters/jobs down to the given names.

    Args:
        resource_names: List of resource_name.
        filter_factory: Function that takes the resource names and returns
            a filter function for clusters/jobs.

    Returns:
        all_clusters: List[Dict[str, Any]]
        all_managed_jobs: List[Dict[str, Any]]
    """
    clusters, managed_jobs = _get_active_resources()

    # One filter instance shared by both resource kinds.
    matches = filter_factory(resource_names)

    matched_clusters: List[Dict[str, Any]] = []
    matched_jobs: List[Dict[str, Any]] = []

    if clusters:
        matched_clusters = [c for c in clusters if matches(c)]
    if managed_jobs:
        matched_jobs = [j for j in managed_jobs if matches(j)]

    return matched_clusters, matched_jobs
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _get_active_resources(
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Fetch all active clusters and managed jobs, concurrently.

    Returns:
        all_clusters: List[Dict[str, Any]]
        all_managed_jobs: List[Dict[str, Any]]
    """

    def _fetch_clusters() -> List[Dict[str, Any]]:
        return global_user_state.get_clusters()

    def _fetch_managed_jobs() -> List[Dict[str, Any]]:
        # pylint: disable=import-outside-toplevel
        from sky.jobs.server import core as managed_jobs_core
        try:
            # Only request the fields the resource checks actually read.
            jobs, _, _, _ = managed_jobs_core.queue_v2(
                refresh=False,
                skip_finished=True,
                all_users=True,
                fields=['job_id', 'user_hash', 'workspace'])
            return jobs
        except exceptions.ClusterNotUpError:
            # Jobs controller is down: treat as no active jobs.
            logger.warning('All jobs should be finished.')
            return []

    # Overlap the two fetches; each may block on I/O independently.
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
        cluster_future = pool.submit(_fetch_clusters)
        jobs_future = pool.submit(_fetch_managed_jobs)
        return cluster_future.result(), jobs_future.result()
|