skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/utils/log_utils.py
CHANGED
@@ -47,13 +47,16 @@ class RayUpLineProcessor(LineProcessor):
         RUNTIME_SETUP = 1
         PULLING_DOCKER_IMAGES = 2
 
-    def __init__(self, log_path: str):
+    def __init__(self, log_path: str, cluster_name: Optional[str] = None):
         self.log_path = log_path
+        self.cluster_name = cluster_name
 
     def __enter__(self) -> None:
         self.state = self.ProvisionStatus.LAUNCH
         self.status_display = rich_utils.safe_status(
-            ux_utils.spinner_message('Launching',
+            ux_utils.spinner_message('Launching',
+                                     self.log_path,
+                                     cluster_name=self.cluster_name))
         self.status_display.start()
 
     def process_line(self, log_line: str) -> None:
@@ -62,19 +65,25 @@
             logger.info(' Head VM is up.')
             self.status_display.update(
                 ux_utils.spinner_message(
-                    'Launching - Preparing SkyPilot runtime',
+                    'Launching - Preparing SkyPilot runtime',
+                    self.log_path,
+                    cluster_name=self.cluster_name))
             self.state = self.ProvisionStatus.RUNTIME_SETUP
         if ('Pulling from' in log_line and
                 self.state == self.ProvisionStatus.RUNTIME_SETUP):
             self.status_display.update(
                 ux_utils.spinner_message(
-                    'Launching - Initializing docker container',
+                    'Launching - Initializing docker container',
+                    self.log_path,
+                    cluster_name=self.cluster_name))
             self.state = self.ProvisionStatus.PULLING_DOCKER_IMAGES
         if ('Status: Downloaded newer image' in log_line and
                 self.state == self.ProvisionStatus.PULLING_DOCKER_IMAGES):
             self.status_display.update(
                 ux_utils.spinner_message(
-                    'Launching - Preparing SkyPilot runtime',
+                    'Launching - Preparing SkyPilot runtime',
+                    self.log_path,
+                    cluster_name=self.cluster_name))
             self.state = self.ProvisionStatus.RUNTIME_SETUP
 
     def __exit__(self, except_type: Optional[Type[BaseException]],
@@ -190,7 +199,7 @@ class SkyLocalUpLineProcessor(LineProcessor):
 
 
 class SkyRemoteUpLineProcessor(LineProcessor):
-    """A processor for deploy_remote_cluster.
+    """A processor for deploy_remote_cluster.py log lines."""
 
     def __init__(self, log_path: str, is_local: bool):
         self.log_path = log_path
@@ -291,6 +300,223 @@ class SkyRemoteUpLineProcessor(LineProcessor):
         self.status_display.stop()
 
 
+class SkySSHUpLineProcessor(LineProcessor):
+    """A processor for deploy_remote_cluster.py log lines for SSH clusters"""
+
+    def __init__(self, log_path: str, is_local: bool):
+        self.log_path = log_path
+        self.is_local = is_local
+        self.current_cluster: Optional[str] = None
+        self.is_cleanup_mode = False
+
+    def __enter__(self) -> None:
+        status = rich_utils.safe_status(
+            ux_utils.spinner_message('Preparing to set up SSH Node Pools',
+                                     log_path=self.log_path,
+                                     is_local=self.is_local))
+        self.status_display = status
+        self.status_display.start()
+
+    def process_line(self, log_line: str) -> None:
+        # Detect cleanup mode
+        if 'SKYPILOT_CLEANUP_MODE:' in log_line:
+            self.is_cleanup_mode = True
+            if self.current_cluster:
+                self.status_display.update(
+                    ux_utils.spinner_message(
+                        f'Cleaning up Node Pool: \\[{self.current_cluster}]',
+                        log_path=self.log_path,
+                        is_local=self.is_local))
+
+        # Cluster detection message
+        if 'SKYPILOT_CLUSTER_INFO:' in log_line:
+            clusters_part = log_line.split('SKYPILOT_CLUSTER_INFO:',
+                                           1)[1].strip()
+            if clusters_part.startswith('Found'):
+                logger.info(f'{colorama.Style.RESET_ALL}'
+                            f'{colorama.Fore.CYAN}{clusters_part}'
+                            f'{colorama.Style.RESET_ALL}')
+
+        # Current cluster being operated on
+        if 'SKYPILOT_CURRENT_CLUSTER:' in log_line:
+            self.current_cluster = log_line.split('SKYPILOT_CURRENT_CLUSTER:',
+                                                  1)[1].strip()
+
+            if self.is_cleanup_mode:
+                self.status_display.update(
+                    ux_utils.spinner_message(
+                        f'Cleaning up Node Pool: {self.current_cluster}',
+                        log_path=self.log_path,
+                        is_local=self.is_local))
+                logger.info(f'{colorama.Fore.CYAN}\nCleaning up Node Pool: '
+                            f'{self.current_cluster}{colorama.Style.RESET_ALL}')
+            else:
+                self.status_display.update(
+                    ux_utils.spinner_message(
+                        f'Deploying SkyPilot \\[{self.current_cluster}]',
+                        log_path=self.log_path,
+                        is_local=self.is_local))
+                logger.info(f'{colorama.Style.RESET_ALL}'
+                            f'{colorama.Fore.CYAN}\nSetting up Node Pool: '
+                            f'{self.current_cluster}{colorama.Style.RESET_ALL}')
+
+        # Handle cluster completion marker
+        if 'SKYPILOT_CLUSTER_COMPLETED:' in log_line:
+            if self.is_cleanup_mode:
+                logger.info(
+                    f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
+                    f'✔ Node Pool {self.current_cluster} cleaned up '
+                    f'successfully.{colorama.Style.RESET_ALL}')
+            else:
+                logger.info(
+                    f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
+                    f'✔ Node Pool {self.current_cluster} deployed successfully.'
+                    f'{colorama.Style.RESET_ALL}')
+
+        # Pre-flight checks
+        if 'Checking SSH connection to head node' in log_line:
+            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
+                        'Checking SSH connection to head node...'
+                        f'{colorama.Style.RESET_ALL}')
+
+        if log_line.startswith('SSH connection successful'):
+            node_name = log_line.split('(')[-1].split(')')[0]
+            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
+                        '✔ SSH connection established to head node '
+                        f'{node_name}.{colorama.Style.RESET_ALL}')
+
+        # Kubernetes installation steps
+        if 'Deploying Kubernetes on head node' in log_line:
+            current_cluster_str = f' \\[{self.current_cluster}]' if (
+                self.current_cluster) else ''
+            self.status_display.update(
+                ux_utils.spinner_message(
+                    'Deploying SkyPilot runtime on head node'
+                    f'{current_cluster_str}',
+                    log_path=self.log_path,
+                    is_local=self.is_local))
+
+        if 'K3s deployed on head node' in log_line:
+            node_name = log_line.split('(')[-1].split(')')[0]
+            logger.info(
+                f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
+                f'✔ SkyPilot runtime successfully deployed on head node '
+                f'{node_name}.{colorama.Style.RESET_ALL}')
+
+        # Worker nodes
+        if 'Deploying Kubernetes on worker node' in log_line:
+            self.status_display.update(
+                ux_utils.spinner_message(
+                    'Deploying SkyPilot runtime on worker nodes' +
+                    (f' \\[{self.current_cluster}]'
+                     if self.current_cluster else ''),
+                    log_path=self.log_path,
+                    is_local=self.is_local))
+
+        if 'Kubernetes deployed on worker node' in log_line:
+            node_name = log_line.split('(')[-1].split(')')[0]
+            logger.info(
+                f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
+                '✔ SkyPilot runtime successfully deployed on worker node '
+                f'{node_name}.{colorama.Style.RESET_ALL}')
+
+        if 'Failed to deploy K3s on worker node' in log_line:
+            node_name = log_line.split('(')[-1].split(')')[0]
+            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.RED}'
+                        '✗ Failed to deploy K3s on worker node '
+                        f'{node_name}.{colorama.Style.RESET_ALL}')
+
+        # Cluster configuration
+        if 'Configuring local kubectl to connect to the cluster...' in log_line:
+            self.status_display.update(
+                ux_utils.spinner_message('Setting up SkyPilot configuration' +
+                                         (f' \\[{self.current_cluster}]'
+                                          if self.current_cluster else ''),
+                                         log_path=self.log_path,
+                                         is_local=self.is_local))
+
+        if 'kubectl configured to connect to the cluster.' in log_line:
+            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
+                        '✔ SkyPilot configuration complete.'
+                        f'{colorama.Style.RESET_ALL}')
+
+        # GPU operator installation
+        if 'Installing Nvidia GPU Operator...' in log_line:
+            self.status_display.update(
+                ux_utils.spinner_message('Configuring Nvidia GPUs' +
+                                         (f' \\[{self.current_cluster}]'
+                                          if self.current_cluster else ''),
+                                         log_path=self.log_path,
+                                         is_local=self.is_local))
+
+        if 'GPU Operator installed.' in log_line:
+            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
+                        '✔ Nvidia GPUs configured successfully.'
+                        f'{colorama.Style.RESET_ALL}')
+
+        # Cleanup steps
+        if 'Cleaning up head node' in log_line:
+            self.status_display.update(
+                ux_utils.spinner_message('Cleaning up head node' +
+                                         (f' \\[{self.current_cluster}]'
+                                          if self.current_cluster else ''),
+                                         log_path=self.log_path,
+                                         is_local=self.is_local))
+
+        if 'Cleaning up worker node' in log_line:
+            self.status_display.update(
+                ux_utils.spinner_message('Cleaning up worker nodes' +
+                                         (f' \\[{self.current_cluster}]'
+                                          if self.current_cluster else ''),
+                                         log_path=self.log_path,
+                                         is_local=self.is_local))
+
+        # Handle node cleanup success messages
+        if 'Node' in log_line and 'cleaned up successfully' in log_line:
+            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
+                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
+
+        if 'Node' in log_line and 'Failed to clean up' in log_line:
+            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.RED}'
+                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
+
+        if 'Failed to clean up worker node' in log_line:
+            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.RED}'
+                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
+
+        # Final status for the cluster deployment
+        if 'Cluster deployment completed.' in log_line:
+            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.GREEN}'
+                        '✔ SkyPilot runtime is up.'
+                        f'{colorama.Style.RESET_ALL}')
+
+        if 'Failed to deploy Kubernetes on the following nodes:' in log_line:
+            logger.info(log_line.strip())
+
+        if 'already exists in history. ' in log_line:
+            node_name = log_line.split('(')[-1].split(')')[0]
+            logger.info(f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.YELLOW}'
+                        '✔ SkyPilot runtime already deployed on worker node '
+                        f'{node_name}. Skipping.{colorama.Style.RESET_ALL}')
+
+        if 'Failed to setup TCP forwarding on head node' in log_line:
+            node_name = log_line.split('(')[-1].split(')')[0]
+            logger.info(
+                f'{ux_utils.INDENT_SYMBOL}{colorama.Fore.RED}'
+                f'✗ Failed to setup TCP forwarding on head node {node_name}.'
+                f'{colorama.Style.RESET_ALL}')
+
+        if 'Error in deploying SSH Target' in log_line:
+            logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.RED}'
+                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
+
+    def __exit__(self, except_type: Optional[Type[BaseException]],
+                 except_value: Optional[BaseException],
+                 traceback: Optional[types.TracebackType]) -> None:
+        del except_type, except_value, traceback  # unused
+        self.status_display.stop()
+
+
 def create_table(field_names: List[str], **kwargs) -> prettytable.PrettyTable:
     """Creates table with default style."""
     border = kwargs.pop('border', False)
@@ -356,6 +582,74 @@ def readable_time_duration(start: Optional[float],
     return diff
 
 
+def human_duration(start: int, end: Optional[int] = None) -> str:
+    """Calculates the time elapsed between two timestamps and returns
+    it as a human-readable string, similar to Kubernetes' duration format.
+
+    Args:
+        start: The start time as a Unix timestamp (seconds since epoch).
+        end: The end time as a Unix timestamp (seconds since epoch).
+            If None, current time is used.
+
+    Returns:
+        A string representing the duration, e.g., "2d3h", "15m", "30s".
+        Returns "0s" for zero, negative durations, or if the timestamp
+        is invalid.
+    """
+    if not start or start <= 0:
+        return '0s'
+
+    if end is None:
+        end = int(time.time())
+    duration_seconds = end - start
+
+    units = {
+        'y': 365 * 24 * 60 * 60,
+        'd': 60 * 60 * 24,
+        'h': 60 * 60,
+        'm': 60,
+        's': 1,
+    }
+
+    if duration_seconds <= 0:
+        return '0s'
+    elif duration_seconds < 60 * 2:
+        return f'{duration_seconds}s'
+
+    minutes = int(duration_seconds / units['m'])
+    if minutes < 10:
+        s = int(duration_seconds / units['s']) % 60
+        if s == 0:
+            return f'{minutes}m'
+        return f'{minutes}m{s}s'
+    elif minutes < 60 * 3:
+        return f'{minutes}m'
+
+    hours = int(duration_seconds / units['h'])
+    days = int(hours / 24)
+    years = int(hours / 24 / 365)
+    if hours < 8:
+        m = int(duration_seconds / units['m']) % 60
+        if m == 0:
+            return f'{hours}h'
+        return f'{hours}h{m}m'
+    elif hours < 48:
+        return f'{hours}h'
+    elif hours < 24 * 8:
+        h = hours % 24
+        if h == 0:
+            return f'{days}d'
+        return f'{days}d{h}h'
+    elif hours < 24 * 365 * 2:
+        return f'{days}d'
+    elif hours < 24 * 365 * 8:
+        dy = int(hours / 24) % 365
+        if dy == 0:
+            return f'{years}y'
+        return f'{years}y{dy}d'
+    return f'{years}y'
+
+
 def follow_logs(
     file: TextIO,
     *,
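
Note (illustrative sketch, not part of the diff): based on the logic above, the new human_duration helper reports at most two units and drops the smaller unit when it is zero:

    import time

    from sky.utils import log_utils

    now = int(time.time())
    print(log_utils.human_duration(now - 30, now))             # '30s' (under 2 minutes)
    print(log_utils.human_duration(now - 5 * 60, now))         # '5m' (seconds omitted when zero)
    print(log_utils.human_duration(now - 3 * 3600 - 60, now))  # '3h1m'
    print(log_utils.human_duration(now - 3 * 86400, now))      # '3d'
    print(log_utils.human_duration(0))                         # '0s' (invalid start)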
|
sky/utils/perf_utils.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Utility functions for performance monitoring."""
|
|
2
|
+
import os
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from sky import sky_logging
|
|
6
|
+
from sky.skylet import constants
|
|
7
|
+
|
|
8
|
+
logger = sky_logging.init_logger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_loop_lag_threshold() -> Optional[float]:
|
|
12
|
+
"""Get the loop lag threshold from the environment variable."""
|
|
13
|
+
lag_threshold = os.getenv(constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS, None)
|
|
14
|
+
if lag_threshold is not None:
|
|
15
|
+
try:
|
|
16
|
+
return float(lag_threshold) / 1000.0
|
|
17
|
+
except ValueError:
|
|
18
|
+
logger.warning(
|
|
19
|
+
f'Invalid value for {constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS}:'
|
|
20
|
+
f' {lag_threshold}')
|
|
21
|
+
return None
|
|
22
|
+
return None
|
|
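
Note (illustrative sketch, not part of the diff): assuming the constant names an environment variable holding a millisecond value, the helper converts it to seconds and falls back to None on bad input:

    import os

    from sky.skylet import constants
    from sky.utils import perf_utils

    os.environ[constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS] = '250'
    print(perf_utils.get_loop_lag_threshold())   # 0.25 (seconds)

    os.environ[constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS] = 'oops'
    print(perf_utils.get_loop_lag_threshold())   # None (warning logged)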
sky/utils/resource_checker.py
ADDED
@@ -0,0 +1,298 @@
+"""Resource checking utilities for finding active clusters and managed jobs."""
+
+import concurrent.futures
+from typing import Any, Callable, Dict, List, Tuple
+
+from sky import exceptions
+from sky import global_user_state
+from sky import sky_logging
+from sky.skylet import constants
+
+logger = sky_logging.init_logger(__name__)
+
+
+def check_no_active_resources_for_users(
+        user_operations: List[Tuple[str, str]]) -> None:
+    """Check if users have active clusters or managed jobs.
+
+    Args:
+        user_operations: List of tuples (user_id, operation) where
+            operation is 'update' or 'delete'.
+
+    Raises:
+        ValueError: If any user has active clusters or managed jobs.
+            The error message will include all users with issues.
+    """
+    if not user_operations:
+        return
+
+    def filter_by_user(user_id: str):
+        return lambda resource: resource.get('user_hash') == user_id
+
+    _check_active_resources(user_operations, filter_by_user, 'user')
+
+
+def check_no_active_resources_for_workspaces(
+        workspace_operations: List[Tuple[str, str]]) -> None:
+    """Check if workspaces have active clusters or managed jobs.
+
+    Args:
+        workspace_operations: List of tuples (workspace_name, operation) where
+            operation is 'update' or 'delete'.
+
+    Raises:
+        ValueError: If any workspace has active clusters or managed jobs.
+            The error message will include all workspaces with issues.
+    """
+    if not workspace_operations:
+        return
+
+    def filter_by_workspace(workspace_name: str):
+        return lambda resource: (resource.get(
+            'workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) == workspace_name
+        )
+
+    _check_active_resources(workspace_operations, filter_by_workspace,
+                            'workspace')
+
+
+def _check_active_resources(resource_operations: List[Tuple[str, str]],
+                            filter_factory: Callable[[str],
+                                                     Callable[[Dict[str, Any]],
+                                                              bool]],
+                            resource_type: str) -> None:
+    """Check if resource entities have active clusters or managed jobs.
+
+    Args:
+        resource_operations: List of tuples (resource_name, operation) where
+            operation is 'update' or 'delete'.
+        filter_factory: Function that takes a resource_name and returns a filter
+            function for clusters/jobs.
+        resource_type: Type of resource being checked ('user' or 'workspace').
+
+    Raises:
+        ValueError: If any resource has active clusters or managed jobs.
+    """
+
+    all_clusters, all_managed_jobs = _get_active_resources()
+
+    # Collect all error messages instead of raising immediately
+    error_messages = []
+
+    # Check each resource against the fetched data
+    for resource_name, operation in resource_operations:
+        resource_filter = filter_factory(resource_name)
+
+        # Filter clusters for this resource
+        resource_clusters = [
+            cluster for cluster in all_clusters if resource_filter(cluster)
+        ]
+
+        # Filter managed jobs for this resource
+        resource_active_jobs = [
+            job for job in all_managed_jobs if resource_filter(job)
+        ]
+
+        # Collect error messages for this resource
+        resource_errors = []
+
+        if resource_clusters:
+            active_cluster_names = [
+                cluster['name'] for cluster in resource_clusters
+            ]
+            cluster_list = ', '.join(active_cluster_names)
+            resource_errors.append(
+                f'{len(resource_clusters)} active cluster(s): {cluster_list}')
+
+        if resource_active_jobs:
+            job_names = [str(job['job_id']) for job in resource_active_jobs]
+            job_list = ', '.join(job_names)
+            resource_errors.append(
+                f'{len(resource_active_jobs)} active managed job(s): '
+                f'{job_list}')
+
+        # If this resource has issues, add to overall error messages
+        if resource_errors:
+            resource_error_summary = ' and '.join(resource_errors)
+            if resource_type == 'user':
+                # resource_name is user_id
+                user_info = global_user_state.get_user(resource_name)
+                if user_info and user_info.name:
+                    resource_name = user_info.name
+            error_messages.append(
+                f'Cannot {operation} {resource_type} {resource_name!r} '
+                f'because it has {resource_error_summary}.')
+
+    # If we collected any errors, raise them all together
+    if error_messages:
+        if len(error_messages) == 1:
+            # Single resource error
+            full_message = error_messages[
+                0] + ' Please terminate these resources first.'
+        else:
+            # Multiple resource errors
+            full_message = (f'Cannot proceed due to active resources in '
+                            f'{len(error_messages)} {resource_type}(s):\n' +
+                            '\n'.join(f'• {msg}' for msg in error_messages) +
+                            '\nPlease terminate these resources first.')
+        raise ValueError(full_message)
+
+
+def check_users_workspaces_active_resources(
+        user_ids: List[str],
+        workspace_names: List[str]) -> Tuple[str, List[str], Dict[str, str]]:
+    """Check if all the active clusters or managed jobs in workspaces
+    belong to the user_ids. If not, return the error message.
+
+    Args:
+        user_ids: List of user_id.
+        workspace_names: List of workspace_name.
+
+    Returns:
+        resource_error_summary: str
+        missed_users_names: List[str]
+        missed_user_dict: Dict[str, str]
+    """
+    all_clusters, all_managed_jobs = _get_active_resources_for_workspaces(
+        workspace_names)
+    resource_errors = []
+    missed_users = set()
+    active_cluster_names = []
+    active_job_names = []
+    # Check clusters
+    if all_clusters:
+        for cluster in all_clusters:
+            user_hash = cluster.get('user_hash')
+            if user_hash and user_hash not in user_ids:
+                missed_users.add(user_hash)
+                active_cluster_names.append(cluster['name'])
+        if active_cluster_names:
+            cluster_list = ', '.join(active_cluster_names)
+            resource_errors.append(
+                f'{len(active_cluster_names)} active cluster(s):'
+                f' {cluster_list}')
+
+    # Check managed jobs
+    if all_managed_jobs:
+        for job in all_managed_jobs:
+            user_hash = job.get('user_hash')
+            if user_hash and user_hash not in user_ids:
+                missed_users.add(user_hash)
+                active_job_names.append(str(job['job_id']))
+        if active_job_names:
+            job_list = ', '.join(active_job_names)
+            resource_errors.append(f'{len(active_job_names)} active'
+                                   f' managed job(s): {job_list}')
+
+    resource_error_summary = ''
+    if resource_errors:
+        resource_error_summary = ' and '.join(resource_errors)
+    missed_users_names = []
+    missed_user_dict = {}
+    if missed_users:
+        all_users = global_user_state.get_all_users()
+        for user in all_users:
+            if user.id in missed_users:
+                missed_users_names.append(user.name if user.name else user.id)
+                missed_user_dict[user.id] = user.name if user.name else user.id
+    return resource_error_summary, missed_users_names, missed_user_dict
+
+
+def _get_active_resources_for_workspaces(
+    workspace_names: List[str]
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Get active clusters or managed jobs for workspaces.
+
+    Args:
+        workspace_names: List of workspace_name.
+
+    Returns:
+        all_clusters: List[Dict[str, Any]]
+        all_managed_jobs: List[Dict[str, Any]]
+    """
+    if not workspace_names:
+        return [], []
+
+    def filter_by_workspaces(workspace_names: List[str]):
+        return lambda resource: (resource.get(
+            'workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
+                                 workspace_names)
+
+    return _get_active_resources_by_names(workspace_names, filter_by_workspaces)
+
+
+def _get_active_resources_by_names(
+    resource_names: List[str],
+    filter_factory: Callable[[List[str]], Callable[[Dict[str, Any]], bool]]
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Get active clusters or managed jobs.
+
+    Args:
+        resource_names: List of resource_name.
+        filter_factory: Function that takes a resource_name and returns a filter
+            function for clusters/jobs.
+
+    Returns:
+        all_clusters: List[Dict[str, Any]]
+        all_managed_jobs: List[Dict[str, Any]]
+    """
+
+    all_clusters, all_managed_jobs = _get_active_resources()
+
+    resource_clusters = []
+    resource_active_jobs = []
+
+    # Check each resource against the fetched data,
+    # return the active resources by names
+    resource_filter = filter_factory(resource_names)
+
+    # Filter clusters for this resource
+    if all_clusters:
+        resource_clusters = [
+            cluster for cluster in all_clusters if resource_filter(cluster)
+        ]
+
+    # Filter managed jobs for this resource
+    if all_managed_jobs:
+        resource_active_jobs = [
+            job for job in all_managed_jobs if resource_filter(job)
+        ]
+
+    return resource_clusters, resource_active_jobs
+
+
+def _get_active_resources(
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Get all active clusters and managed jobs.
+
+    Returns:
+        all_clusters: List[Dict[str, Any]]
+        all_managed_jobs: List[Dict[str, Any]]
+    """
+
+    def get_all_clusters() -> List[Dict[str, Any]]:
+        return global_user_state.get_clusters()
+
+    def get_all_managed_jobs() -> List[Dict[str, Any]]:
+        # pylint: disable=import-outside-toplevel
+        from sky.jobs.server import core as managed_jobs_core
+        try:
+            filtered_jobs, _, _, _ = managed_jobs_core.queue_v2(
+                refresh=False,
+                skip_finished=True,
+                all_users=True,
+                fields=['job_id', 'user_hash', 'workspace'])
+            return filtered_jobs
+        except exceptions.ClusterNotUpError:
+            logger.warning('All jobs should be finished.')
+            return []
+
+    # Fetch both clusters and jobs in parallel
+    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+        clusters_future = executor.submit(get_all_clusters)
+        jobs_future = executor.submit(get_all_managed_jobs)
+
+        all_clusters = clusters_future.result()
+        all_managed_jobs = jobs_future.result()
+
+    return all_clusters, all_managed_jobs