skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/utils/controller_utils.py
CHANGED
|
@@ -2,11 +2,10 @@
|
|
|
2
2
|
import copy
|
|
3
3
|
import dataclasses
|
|
4
4
|
import enum
|
|
5
|
-
import getpass
|
|
6
5
|
import os
|
|
7
6
|
import tempfile
|
|
8
7
|
import typing
|
|
9
|
-
from typing import Any, Dict, Iterable, List, Optional, Set
|
|
8
|
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Set
|
|
10
9
|
import uuid
|
|
11
10
|
|
|
12
11
|
import colorama
|
|
@@ -24,10 +23,14 @@ from sky.clouds import gcp
|
|
|
24
23
|
from sky.data import data_utils
|
|
25
24
|
from sky.data import storage as storage_lib
|
|
26
25
|
from sky.jobs import constants as managed_job_constants
|
|
26
|
+
from sky.jobs import state as managed_job_state
|
|
27
|
+
from sky.provision.kubernetes import constants as kubernetes_constants
|
|
27
28
|
from sky.serve import constants as serve_constants
|
|
29
|
+
from sky.serve import serve_state
|
|
28
30
|
from sky.setup_files import dependencies
|
|
29
31
|
from sky.skylet import constants
|
|
30
32
|
from sky.skylet import log_lib
|
|
33
|
+
from sky.utils import annotations
|
|
31
34
|
from sky.utils import common
|
|
32
35
|
from sky.utils import common_utils
|
|
33
36
|
from sky.utils import config_utils
|
|
@@ -35,10 +38,16 @@ from sky.utils import env_options
|
|
|
35
38
|
from sky.utils import registry
|
|
36
39
|
from sky.utils import rich_utils
|
|
37
40
|
from sky.utils import ux_utils
|
|
41
|
+
from sky.utils import yaml_utils
|
|
38
42
|
|
|
39
43
|
if typing.TYPE_CHECKING:
|
|
44
|
+
import psutil
|
|
45
|
+
|
|
40
46
|
from sky import task as task_lib
|
|
41
47
|
from sky.backends import cloud_vm_ray_backend
|
|
48
|
+
else:
|
|
49
|
+
from sky.adaptors import common as adaptors_common
|
|
50
|
+
psutil = adaptors_common.LazyImport('psutil')
|
|
42
51
|
|
|
43
52
|
logger = sky_logging.init_logger(__name__)
|
|
44
53
|
|
|
@@ -63,8 +72,9 @@ class _ControllerSpec:
|
|
|
63
72
|
"""Spec for skypilot controllers."""
|
|
64
73
|
controller_type: str
|
|
65
74
|
name: str
|
|
66
|
-
|
|
67
|
-
|
|
75
|
+
_cluster_name_func: Callable[[], str]
|
|
76
|
+
_cluster_name_from_server: Optional[str] # For client-side only
|
|
77
|
+
in_progress_hint: Callable[[bool], str]
|
|
68
78
|
decline_cancel_hint: str
|
|
69
79
|
_decline_down_when_failed_to_fetch_status_hint: str
|
|
70
80
|
decline_down_for_dirty_controller_hint: str
|
|
@@ -84,6 +94,24 @@ class _ControllerSpec:
|
|
|
84
94
|
return self._check_cluster_name_hint.format(
|
|
85
95
|
cluster_name=self.cluster_name)
|
|
86
96
|
|
|
97
|
+
@property
|
|
98
|
+
def cluster_name(self) -> str:
|
|
99
|
+
"""The cluster name of the controller.
|
|
100
|
+
|
|
101
|
+
On the server-side, the cluster name is the actual cluster name,
|
|
102
|
+
which is read from common.(JOB|SKY_SERVE)_CONTROLLER_NAME.
|
|
103
|
+
|
|
104
|
+
On the client-side, the cluster name may not be accurate,
|
|
105
|
+
as we may not know the exact name, because we are missing
|
|
106
|
+
the server-side common.SERVER_ID. We have to wait until
|
|
107
|
+
we get the actual cluster name from the server.
|
|
108
|
+
"""
|
|
109
|
+
return (self._cluster_name_from_server if self._cluster_name_from_server
|
|
110
|
+
is not None else self._cluster_name_func())
|
|
111
|
+
|
|
112
|
+
def set_cluster_name_from_server(self, cluster_name: str) -> None:
|
|
113
|
+
self._cluster_name_from_server = cluster_name
|
|
114
|
+
|
|
87
115
|
|
|
88
116
|
# TODO: refactor controller class to not be an enum.
|
|
89
117
|
class Controllers(enum.Enum):
|
|
@@ -93,10 +121,11 @@ class Controllers(enum.Enum):
|
|
|
93
121
|
JOBS_CONTROLLER = _ControllerSpec(
|
|
94
122
|
controller_type='jobs',
|
|
95
123
|
name='managed jobs controller',
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
124
|
+
_cluster_name_func=lambda: common.JOB_CONTROLLER_NAME,
|
|
125
|
+
_cluster_name_from_server=None,
|
|
126
|
+
in_progress_hint=lambda _:
|
|
127
|
+
('* {job_info}To see all managed jobs: '
|
|
128
|
+
f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
|
|
100
129
|
decline_cancel_hint=(
|
|
101
130
|
'Cancelling the jobs controller\'s jobs is not allowed.\nTo cancel '
|
|
102
131
|
f'managed jobs, use: {colorama.Style.BRIGHT}sky jobs cancel '
|
|
@@ -124,10 +153,14 @@ class Controllers(enum.Enum):
|
|
|
124
153
|
SKY_SERVE_CONTROLLER = _ControllerSpec(
|
|
125
154
|
controller_type='serve',
|
|
126
155
|
name='serve controller',
|
|
127
|
-
|
|
156
|
+
_cluster_name_func=lambda: common.SKY_SERVE_CONTROLLER_NAME,
|
|
157
|
+
_cluster_name_from_server=None,
|
|
128
158
|
in_progress_hint=(
|
|
129
|
-
|
|
130
|
-
f'
|
|
159
|
+
lambda pool:
|
|
160
|
+
(f'* To see detailed pool status: {colorama.Style.BRIGHT}'
|
|
161
|
+
f'sky jobs pool status -v{colorama.Style.RESET_ALL}') if pool else
|
|
162
|
+
(f'* To see detailed service status: {colorama.Style.BRIGHT}'
|
|
163
|
+
f'sky serve status -v{colorama.Style.RESET_ALL}')),
|
|
131
164
|
decline_cancel_hint=(
|
|
132
165
|
'Cancelling the sky serve controller\'s jobs is not allowed.'),
|
|
133
166
|
_decline_down_when_failed_to_fetch_status_hint=(
|
|
@@ -154,7 +187,9 @@ class Controllers(enum.Enum):
|
|
|
154
187
|
default_autostop_config=serve_constants.CONTROLLER_AUTOSTOP)
|
|
155
188
|
|
|
156
189
|
@classmethod
|
|
157
|
-
def from_name(cls,
|
|
190
|
+
def from_name(cls,
|
|
191
|
+
name: Optional[str],
|
|
192
|
+
expect_exact_match: bool = True) -> Optional['Controllers']:
|
|
158
193
|
"""Check if the cluster name is a controller name.
|
|
159
194
|
|
|
160
195
|
Returns:
|
|
@@ -175,7 +210,11 @@ class Controllers(enum.Enum):
|
|
|
175
210
|
elif name.startswith(common.JOB_CONTROLLER_PREFIX):
|
|
176
211
|
controller = cls.JOBS_CONTROLLER
|
|
177
212
|
prefix = common.JOB_CONTROLLER_PREFIX
|
|
178
|
-
|
|
213
|
+
|
|
214
|
+
if controller is not None and expect_exact_match:
|
|
215
|
+
assert name == controller.value.cluster_name, (
|
|
216
|
+
name, controller.value.cluster_name)
|
|
217
|
+
elif controller is not None and name != controller.value.cluster_name:
|
|
179
218
|
# The client-side cluster_name is not accurate. Assume that `name`
|
|
180
219
|
# is the actual cluster name, so need to set the controller's
|
|
181
220
|
# cluster name to the input name.
|
|
@@ -189,7 +228,7 @@ class Controllers(enum.Enum):
|
|
|
189
228
|
prefix)
|
|
190
229
|
|
|
191
230
|
# Update the cluster name.
|
|
192
|
-
controller.value.
|
|
231
|
+
controller.value.set_cluster_name_from_server(name)
|
|
193
232
|
return controller
|
|
194
233
|
|
|
195
234
|
@classmethod
|
|
@@ -206,27 +245,35 @@ class Controllers(enum.Enum):
|
|
|
206
245
|
return None
|
|
207
246
|
|
|
208
247
|
|
|
209
|
-
def
|
|
210
|
-
|
|
248
|
+
def get_controller_for_pool(pool: bool) -> Controllers:
|
|
249
|
+
"""Get the controller type."""
|
|
250
|
+
if pool:
|
|
251
|
+
return Controllers.JOBS_CONTROLLER
|
|
252
|
+
return Controllers.SKY_SERVE_CONTROLLER
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def high_availability_specified(cluster_name: Optional[str]) -> bool:
|
|
211
256
|
"""Check if the controller high availability is specified in user config.
|
|
212
257
|
"""
|
|
213
|
-
controller = Controllers.from_name(cluster_name)
|
|
258
|
+
controller = Controllers.from_name(cluster_name, expect_exact_match=False)
|
|
214
259
|
if controller is None:
|
|
215
260
|
return False
|
|
216
261
|
|
|
262
|
+
if controller.value.controller_type == 'jobs':
|
|
263
|
+
# pylint: disable-next=import-outside-toplevel
|
|
264
|
+
from sky.jobs import utils as managed_job_utils
|
|
265
|
+
if managed_job_utils.is_consolidation_mode():
|
|
266
|
+
return True
|
|
267
|
+
elif controller.value.controller_type == 'serve':
|
|
268
|
+
# pylint: disable-next=import-outside-toplevel
|
|
269
|
+
from sky.serve import serve_utils
|
|
270
|
+
if serve_utils.is_consolidation_mode():
|
|
271
|
+
return True
|
|
272
|
+
|
|
217
273
|
if skypilot_config.loaded():
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
if high_availability:
|
|
222
|
-
if controller.value.controller_type != 'serve':
|
|
223
|
-
if not skip_warning:
|
|
224
|
-
print(f'{colorama.Fore.RED}High availability controller is'
|
|
225
|
-
'only supported for SkyServe controller. It cannot'
|
|
226
|
-
f'be enabled for {controller.value.name}.'
|
|
227
|
-
f'Skipping this flag.{colorama.Style.RESET_ALL}')
|
|
228
|
-
else:
|
|
229
|
-
return True
|
|
274
|
+
return skypilot_config.get_nested((controller.value.controller_type,
|
|
275
|
+
'controller', 'high_availability'),
|
|
276
|
+
False)
|
|
230
277
|
return False
|
|
231
278
|
|
|
232
279
|
|
|
@@ -263,6 +310,13 @@ def _get_cloud_dependencies_installation_commands(
|
|
|
263
310
|
sky_check.get_cached_enabled_clouds_or_refresh(
|
|
264
311
|
sky_cloud.CloudCapability.STORAGE))
|
|
265
312
|
enabled_clouds = enabled_compute_clouds.union(enabled_storage_clouds)
|
|
313
|
+
enabled_k8s_and_ssh = [
|
|
314
|
+
repr(cloud)
|
|
315
|
+
for cloud in enabled_clouds
|
|
316
|
+
if isinstance(cloud, clouds.Kubernetes)
|
|
317
|
+
]
|
|
318
|
+
k8s_and_ssh_label = ' and '.join(sorted(enabled_k8s_and_ssh))
|
|
319
|
+
k8s_dependencies_installed = False
|
|
266
320
|
|
|
267
321
|
for cloud in enabled_clouds:
|
|
268
322
|
cloud_python_dependencies: List[str] = copy.deepcopy(
|
|
@@ -282,10 +336,33 @@ def _get_cloud_dependencies_installation_commands(
|
|
|
282
336
|
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
|
283
337
|
commands.append(f'echo -en "\\r{step_prefix}GCP SDK{empty_str}" &&'
|
|
284
338
|
f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}')
|
|
285
|
-
|
|
339
|
+
if clouds.cloud_in_iterable(clouds.Kubernetes(), enabled_clouds):
|
|
340
|
+
# Install gke-gcloud-auth-plugin used for exec-auth with GKE.
|
|
341
|
+
# We install the plugin here instead of the next elif branch
|
|
342
|
+
# because gcloud is required to install the plugin, so the order
|
|
343
|
+
# of command execution is critical.
|
|
344
|
+
|
|
345
|
+
# We install plugin here regardless of whether exec-auth is
|
|
346
|
+
# actually used as exec-auth may be used in the future.
|
|
347
|
+
# TODO (kyuds): how to implement conservative installation?
|
|
348
|
+
commands.append(
|
|
349
|
+
'(command -v gke-gcloud-auth-plugin &>/dev/null || '
|
|
350
|
+
'(gcloud components install gke-gcloud-auth-plugin --quiet &>/dev/null))') # pylint: disable=line-too-long
|
|
351
|
+
elif isinstance(cloud, clouds.Nebius):
|
|
352
|
+
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
|
353
|
+
commands.append(
|
|
354
|
+
f'echo -en "\\r{step_prefix}Nebius{empty_str}" && '
|
|
355
|
+
'curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh ' # pylint: disable=line-too-long
|
|
356
|
+
'| sudo NEBIUS_INSTALL_FOLDER=/usr/local/bin bash &> /dev/null && '
|
|
357
|
+
'nebius profile create --profile sky '
|
|
358
|
+
'--endpoint api.nebius.cloud '
|
|
359
|
+
'--service-account-file $HOME/.nebius/credentials.json '
|
|
360
|
+
'&> /dev/null || echo "Unable to create Nebius profile."')
|
|
361
|
+
elif (isinstance(cloud, clouds.Kubernetes) and
|
|
362
|
+
not k8s_dependencies_installed):
|
|
286
363
|
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
|
287
364
|
commands.append(
|
|
288
|
-
f'echo -en "\\r{step_prefix}
|
|
365
|
+
f'echo -en "\\r{step_prefix}{k8s_and_ssh_label}{empty_str}" && '
|
|
289
366
|
# Install k8s + skypilot dependencies
|
|
290
367
|
'sudo bash -c "if '
|
|
291
368
|
'! command -v curl &> /dev/null || '
|
|
@@ -305,7 +382,10 @@ def _get_cloud_dependencies_installation_commands(
|
|
|
305
382
|
'(curl -s -LO "https://dl.k8s.io/release/v1.31.6'
|
|
306
383
|
'/bin/linux/$ARCH/kubectl" && '
|
|
307
384
|
'sudo install -o root -g root -m 0755 '
|
|
308
|
-
'kubectl /usr/local/bin/kubectl))'
|
|
385
|
+
'kubectl /usr/local/bin/kubectl)) && '
|
|
386
|
+
f'echo -e \'#!/bin/bash\\nexport PATH="{kubernetes_constants.SKY_K8S_EXEC_AUTH_PATH}"\\nexec "$@"\' | sudo tee /usr/local/bin/{kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER} > /dev/null && ' # pylint: disable=line-too-long
|
|
387
|
+
f'sudo chmod +x /usr/local/bin/{kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER}') # pylint: disable=line-too-long
|
|
388
|
+
k8s_dependencies_installed = True
|
|
309
389
|
elif isinstance(cloud, clouds.Cudo):
|
|
310
390
|
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
|
311
391
|
commands.append(
|
|
@@ -358,7 +438,7 @@ def check_cluster_name_not_controller(
|
|
|
358
438
|
Returns:
|
|
359
439
|
None, if the cluster name is not a controller name.
|
|
360
440
|
"""
|
|
361
|
-
controller = Controllers.from_name(cluster_name)
|
|
441
|
+
controller = Controllers.from_name(cluster_name, expect_exact_match=False)
|
|
362
442
|
if controller is not None:
|
|
363
443
|
msg = controller.value.check_cluster_name_hint
|
|
364
444
|
if operation_str is not None:
|
|
@@ -368,10 +448,11 @@ def check_cluster_name_not_controller(
|
|
|
368
448
|
|
|
369
449
|
|
|
370
450
|
# Internal only:
|
|
371
|
-
def
|
|
451
|
+
def download_and_stream_job_log(
|
|
372
452
|
backend: 'cloud_vm_ray_backend.CloudVmRayBackend',
|
|
373
453
|
handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
|
|
374
|
-
local_dir: str
|
|
454
|
+
local_dir: str,
|
|
455
|
+
job_ids: Optional[List[str]] = None) -> Optional[str]:
|
|
375
456
|
"""Downloads and streams the latest job log.
|
|
376
457
|
|
|
377
458
|
This function is only used by jobs controller and sky serve controller.
|
|
@@ -389,7 +470,7 @@ def download_and_stream_latest_job_log(
|
|
|
389
470
|
# multi-node cluster is preempted, and we recover the managed job
|
|
390
471
|
# on the existing cluster, which leads to a larger job_id. Those
|
|
391
472
|
# job_ids all represent the same logical managed job.
|
|
392
|
-
job_ids=
|
|
473
|
+
job_ids=job_ids,
|
|
393
474
|
local_dir=local_dir)
|
|
394
475
|
except Exception as e: # pylint: disable=broad-except
|
|
395
476
|
# We want to avoid crashing the controller. sync_down_logs() is pretty
|
|
@@ -407,7 +488,7 @@ def download_and_stream_latest_job_log(
|
|
|
407
488
|
return None
|
|
408
489
|
|
|
409
490
|
log_dir = list(log_dirs.values())[0]
|
|
410
|
-
log_file = os.path.join(log_dir, 'run.log')
|
|
491
|
+
log_file = os.path.expanduser(os.path.join(log_dir, 'run.log'))
|
|
411
492
|
|
|
412
493
|
# Print the logs to the console.
|
|
413
494
|
# TODO(zhwu): refactor this into log_utils, along with the refactoring for
|
|
@@ -452,10 +533,13 @@ def shared_controller_vars_to_fill(
|
|
|
452
533
|
# before popping allowed_contexts. If it is not on Kubernetes,
|
|
453
534
|
# we may be able to use allowed_contexts.
|
|
454
535
|
local_user_config.pop('allowed_contexts', None)
|
|
536
|
+
# Remove api_server config so that the controller does not try to use
|
|
537
|
+
# a remote API server.
|
|
538
|
+
local_user_config.pop('api_server', None)
|
|
455
539
|
with tempfile.NamedTemporaryFile(
|
|
456
540
|
delete=False,
|
|
457
541
|
suffix=_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX) as temp_file:
|
|
458
|
-
|
|
542
|
+
yaml_utils.dump_yaml(temp_file.name, dict(**local_user_config))
|
|
459
543
|
local_user_config_path = temp_file.name
|
|
460
544
|
|
|
461
545
|
vars_to_fill: Dict[str, Any] = {
|
|
@@ -474,7 +558,7 @@ def shared_controller_vars_to_fill(
|
|
|
474
558
|
env_vars.update({
|
|
475
559
|
# Should not use $USER here, as that env var can be empty when
|
|
476
560
|
# running in a container.
|
|
477
|
-
constants.USER_ENV_VAR:
|
|
561
|
+
constants.USER_ENV_VAR: common_utils.get_current_user_name(),
|
|
478
562
|
constants.USER_ID_ENV_VAR: common_utils.get_user_hash(),
|
|
479
563
|
# Skip cloud identity check to avoid the overhead.
|
|
480
564
|
env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.env_key: '1',
|
|
@@ -517,6 +601,30 @@ def get_controller_resources(
|
|
|
517
601
|
if custom_controller_resources_config is not None:
|
|
518
602
|
controller_resources_config_copied.update(
|
|
519
603
|
custom_controller_resources_config)
|
|
604
|
+
# Compatibility with the old way of specifying the controller autostop
|
|
605
|
+
# config. TODO(cooperc): Remove this before 0.12.0.
|
|
606
|
+
custom_controller_autostop_config = skypilot_config.get_nested(
|
|
607
|
+
(controller.value.controller_type, 'controller', 'autostop'), None)
|
|
608
|
+
if custom_controller_autostop_config is not None:
|
|
609
|
+
logger.warning(
|
|
610
|
+
f'{colorama.Fore.YELLOW}Warning: Config value '
|
|
611
|
+
f'`{controller.value.controller_type}.controller.autostop` '
|
|
612
|
+
'is deprecated. Please use '
|
|
613
|
+
f'`{controller.value.controller_type}.controller.resources.'
|
|
614
|
+
f'autostop` instead.{colorama.Style.RESET_ALL}')
|
|
615
|
+
# Only set the autostop config if it is not already specified.
|
|
616
|
+
if controller_resources_config_copied.get('autostop') is None:
|
|
617
|
+
controller_resources_config_copied['autostop'] = (
|
|
618
|
+
custom_controller_autostop_config)
|
|
619
|
+
else:
|
|
620
|
+
logger.warning(f'{colorama.Fore.YELLOW}Ignoring the old '
|
|
621
|
+
'config, since it is already specified in '
|
|
622
|
+
f'resources.{colorama.Style.RESET_ALL}')
|
|
623
|
+
# Set the default autostop config for the controller, if not already
|
|
624
|
+
# specified.
|
|
625
|
+
if controller_resources_config_copied.get('autostop') is None:
|
|
626
|
+
controller_resources_config_copied['autostop'] = (
|
|
627
|
+
controller.value.default_autostop_config)
|
|
520
628
|
|
|
521
629
|
try:
|
|
522
630
|
controller_resources = resources.Resources.from_yaml_config(
|
|
@@ -542,12 +650,16 @@ def get_controller_resources(
|
|
|
542
650
|
controller_resources_to_use: resources.Resources = list(
|
|
543
651
|
controller_resources)[0]
|
|
544
652
|
|
|
545
|
-
|
|
653
|
+
controller_handle = global_user_state.get_handle_from_cluster_name(
|
|
546
654
|
controller.value.cluster_name)
|
|
547
|
-
if
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
655
|
+
if controller_handle is not None:
|
|
656
|
+
if controller_handle is not None:
|
|
657
|
+
# Use the existing resources, but override the autostop config with
|
|
658
|
+
# the one currently specified in the config.
|
|
659
|
+
controller_resources_to_use = (
|
|
660
|
+
controller_handle.launched_resources.copy(
|
|
661
|
+
autostop=controller_resources_config_copied.get('autostop'))
|
|
662
|
+
)
|
|
551
663
|
|
|
552
664
|
# If the controller and replicas are from the same cloud (and region/zone),
|
|
553
665
|
# it should provide better connectivity. We will let the controller choose
|
|
@@ -608,8 +720,9 @@ def get_controller_resources(
|
|
|
608
720
|
controller_zone = controller_resources_to_use.zone
|
|
609
721
|
|
|
610
722
|
# Filter clouds if controller_resources_to_use.cloud is specified.
|
|
611
|
-
filtered_clouds =
|
|
612
|
-
|
|
723
|
+
filtered_clouds: Set[str] = {controller_cloud
|
|
724
|
+
} if controller_cloud is not None else set(
|
|
725
|
+
requested_clouds_with_region_zone.keys())
|
|
613
726
|
|
|
614
727
|
# Filter regions and zones and construct the result.
|
|
615
728
|
result: Set[resources.Resources] = set()
|
|
@@ -618,15 +731,17 @@ def get_controller_resources(
|
|
|
618
731
|
{None: {None}})
|
|
619
732
|
|
|
620
733
|
# Filter regions if controller_resources_to_use.region is specified.
|
|
621
|
-
filtered_regions = ({
|
|
622
|
-
|
|
734
|
+
filtered_regions: Set[Optional[str]] = ({
|
|
735
|
+
controller_region
|
|
736
|
+
} if controller_region is not None else set(regions.keys()))
|
|
623
737
|
|
|
624
738
|
for region in filtered_regions:
|
|
625
739
|
zones = regions.get(region, {None})
|
|
626
740
|
|
|
627
741
|
# Filter zones if controller_resources_to_use.zone is specified.
|
|
628
|
-
filtered_zones = ({
|
|
629
|
-
|
|
742
|
+
filtered_zones: Set[Optional[str]] = ({
|
|
743
|
+
controller_zone
|
|
744
|
+
} if controller_zone is not None else set(zones))
|
|
630
745
|
|
|
631
746
|
# Create combinations of cloud, region, and zone.
|
|
632
747
|
for zone in filtered_zones:
|
|
@@ -641,38 +756,15 @@ def get_controller_resources(
|
|
|
641
756
|
return result
|
|
642
757
|
|
|
643
758
|
|
|
644
|
-
def
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
controller.value.default_autostop_config)
|
|
654
|
-
if skypilot_config.loaded():
|
|
655
|
-
custom_controller_autostop_config = skypilot_config.get_nested(
|
|
656
|
-
(controller.value.controller_type, 'controller', 'autostop'), None)
|
|
657
|
-
if custom_controller_autostop_config is False:
|
|
658
|
-
# Disabled with `autostop: false` in config.
|
|
659
|
-
# To indicate autostop is disabled, we return None for
|
|
660
|
-
# idle_minutes_to_autostop.
|
|
661
|
-
return None, False
|
|
662
|
-
elif custom_controller_autostop_config is True:
|
|
663
|
-
# Enabled with default values. There is no change in behavior, but
|
|
664
|
-
# this is included by for completeness, since `False` is valid.
|
|
665
|
-
pass
|
|
666
|
-
elif custom_controller_autostop_config is not None:
|
|
667
|
-
# We have specific config values.
|
|
668
|
-
# Override the controller autostop config with the ones specified in
|
|
669
|
-
# the config.
|
|
670
|
-
assert isinstance(custom_controller_autostop_config, dict)
|
|
671
|
-
controller_autostop_config_copied.update(
|
|
672
|
-
custom_controller_autostop_config)
|
|
673
|
-
|
|
674
|
-
return (controller_autostop_config_copied['idle_minutes'],
|
|
675
|
-
controller_autostop_config_copied['down'])
|
|
759
|
+
def get_controller_mem_size_gb() -> float:
|
|
760
|
+
try:
|
|
761
|
+
with open(os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE),
|
|
762
|
+
'r',
|
|
763
|
+
encoding='utf-8') as f:
|
|
764
|
+
return float(f.read())
|
|
765
|
+
except FileNotFoundError:
|
|
766
|
+
pass
|
|
767
|
+
return common_utils.get_mem_size_gb()
|
|
676
768
|
|
|
677
769
|
|
|
678
770
|
def _setup_proxy_command_on_controller(
|
|
@@ -703,7 +795,7 @@ def _setup_proxy_command_on_controller(
|
|
|
703
795
|
# NOTE: suppose that we have a controller in old VPC, then user
|
|
704
796
|
# changes 'vpc_name' in the config and does a 'job launch' /
|
|
705
797
|
# 'serve up'. In general, the old controller may not successfully
|
|
706
|
-
# launch the job in the new VPC. This happens if the two VPCs don
|
|
798
|
+
# launch the job in the new VPC. This happens if the two VPCs don't
|
|
707
799
|
# have peering set up. Like other places in the code, we assume
|
|
708
800
|
# properly setting up networking is the user's responsibility.
|
|
709
801
|
# TODO(zongheng): consider adding a basic check that checks
|
|
@@ -714,7 +806,11 @@ def _setup_proxy_command_on_controller(
|
|
|
714
806
|
config = config_utils.Config.from_dict(user_config)
|
|
715
807
|
proxy_command_key = (str(controller_launched_cloud).lower(),
|
|
716
808
|
'ssh_proxy_command')
|
|
717
|
-
ssh_proxy_command =
|
|
809
|
+
ssh_proxy_command = skypilot_config.get_effective_region_config(
|
|
810
|
+
cloud=str(controller_launched_cloud).lower(),
|
|
811
|
+
region=None,
|
|
812
|
+
keys=('ssh_proxy_command',),
|
|
813
|
+
default_value=None)
|
|
718
814
|
if isinstance(ssh_proxy_command, str):
|
|
719
815
|
config.set_nested(proxy_command_key, None)
|
|
720
816
|
elif isinstance(ssh_proxy_command, dict):
|
|
@@ -744,9 +840,9 @@ def replace_skypilot_config_path_in_file_mounts(
|
|
|
744
840
|
continue
|
|
745
841
|
if local_path.endswith(_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX):
|
|
746
842
|
with tempfile.NamedTemporaryFile('w', delete=False) as f:
|
|
747
|
-
user_config =
|
|
843
|
+
user_config = yaml_utils.read_yaml(local_path)
|
|
748
844
|
config = _setup_proxy_command_on_controller(cloud, user_config)
|
|
749
|
-
|
|
845
|
+
yaml_utils.dump_yaml(f.name, dict(**config))
|
|
750
846
|
file_mounts[remote_path] = f.name
|
|
751
847
|
replaced = True
|
|
752
848
|
if replaced:
|
|
@@ -789,7 +885,7 @@ def translate_local_file_mounts_to_two_hop(
|
|
|
789
885
|
file_mount_id = 0
|
|
790
886
|
|
|
791
887
|
file_mounts_to_translate = task.file_mounts or {}
|
|
792
|
-
if task.workdir is not None:
|
|
888
|
+
if task.workdir is not None and isinstance(task.workdir, str):
|
|
793
889
|
file_mounts_to_translate[constants.SKY_REMOTE_WORKDIR] = task.workdir
|
|
794
890
|
task.workdir = None
|
|
795
891
|
|
|
@@ -857,7 +953,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
|
857
953
|
copy_mounts = {}
|
|
858
954
|
|
|
859
955
|
has_local_source_paths_file_mounts = bool(copy_mounts)
|
|
860
|
-
has_local_source_paths_workdir = task.workdir is not None
|
|
956
|
+
has_local_source_paths_workdir = (task.workdir is not None and
|
|
957
|
+
isinstance(task.workdir, str))
|
|
861
958
|
|
|
862
959
|
msg = None
|
|
863
960
|
if has_local_source_paths_workdir and has_local_source_paths_file_mounts:
|
|
@@ -905,7 +1002,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
|
905
1002
|
|
|
906
1003
|
# Step 1: Translate the workdir to SkyPilot storage.
|
|
907
1004
|
new_storage_mounts = {}
|
|
908
|
-
if task.workdir is not None:
|
|
1005
|
+
if task.workdir is not None and isinstance(task.workdir, str):
|
|
909
1006
|
workdir = task.workdir
|
|
910
1007
|
task.workdir = None
|
|
911
1008
|
if (constants.SKY_REMOTE_WORKDIR in original_file_mounts or
|
|
@@ -1126,3 +1223,81 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
|
1126
1223
|
task.update_storage_mounts(updated_mount_storages)
|
|
1127
1224
|
if msg:
|
|
1128
1225
|
logger.info(ux_utils.finishing_message('Uploaded local files/folders.'))
|
|
1226
|
+
|
|
1227
|
+
|
|
1228
|
+
# ======================= Resources Management Functions =======================
|
|
1229
|
+
|
|
1230
|
+
# Based on testing, assume a running job process uses 350MB memory. We use the
|
|
1231
|
+
# same estimation for service controller process.
|
|
1232
|
+
JOB_MEMORY_MB = 350
|
|
1233
|
+
# Monitoring process for service is 1GB. This is based on an old estimation but
|
|
1234
|
+
# we keep it here for now.
|
|
1235
|
+
# TODO(tian): Remeasure this.
|
|
1236
|
+
SERVE_MONITORING_MEMORY_MB = 1024
|
|
1237
|
+
# The ratio of service controller process to job process. We will treat each
|
|
1238
|
+
# service as SERVE_PROC_RATIO job processes.
|
|
1239
|
+
SERVE_PROC_RATIO = SERVE_MONITORING_MEMORY_MB / JOB_MEMORY_MB
|
|
1240
|
+
# Past 2000 simultaneous jobs, we become unstable.
|
|
1241
|
+
# See https://github.com/skypilot-org/skypilot/issues/4649.
|
|
1242
|
+
MAX_JOB_LIMIT = 2000
|
|
1243
|
+
# Number of ongoing launches allowed per CPU, for managed jobs.
|
|
1244
|
+
JOB_LAUNCHES_PER_CPU = 4
|
|
1245
|
+
# Number of ongoing launches allowed per CPU, for services. This is
|
|
1246
|
+
# also based on an old estimation, but SkyServe indeed spawns a new process
|
|
1247
|
+
# for each launch operation, so it should be slightly more resource-demanding
|
|
1248
|
+
# than managed jobs.
|
|
1249
|
+
SERVE_LAUNCHES_PER_CPU = 2
|
|
1250
|
+
# The ratio of service launch to job launch. This is inverted as the parallelism
|
|
1251
|
+
# is determined by 1 / LAUNCHES_PER_CPU.
|
|
1252
|
+
SERVE_LAUNCH_RATIO = JOB_LAUNCHES_PER_CPU / SERVE_LAUNCHES_PER_CPU
|
|
1253
|
+
|
|
1254
|
+
# The _RESOURCES_LOCK should be held whenever we are checking the parallelism
|
|
1255
|
+
# control or updating the schedule_state of any job or service. Any code that
|
|
1256
|
+
# takes this lock must conclude by calling maybe_schedule_next_jobs.
|
|
1257
|
+
_RESOURCES_LOCK = '~/.sky/locks/controller_resources.lock'
|
|
1258
|
+
|
|
1259
|
+
|
|
1260
|
+
@annotations.lru_cache(scope='global', maxsize=1)
|
|
1261
|
+
def get_resources_lock_path() -> str:
|
|
1262
|
+
path = os.path.expanduser(_RESOURCES_LOCK)
|
|
1263
|
+
os.makedirs(os.path.dirname(path), exist_ok=True)
|
|
1264
|
+
return path
|
|
1265
|
+
|
|
1266
|
+
|
|
1267
|
+
@annotations.lru_cache(scope='request')
|
|
1268
|
+
def _get_job_parallelism() -> int:
|
|
1269
|
+
job_memory = JOB_MEMORY_MB * 1024 * 1024
|
|
1270
|
+
job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
|
|
1271
|
+
return max(job_limit, 1)
|
|
1272
|
+
|
|
1273
|
+
|
|
1274
|
+
@annotations.lru_cache(scope='request')
|
|
1275
|
+
def _get_launch_parallelism() -> int:
|
|
1276
|
+
cpus = os.cpu_count()
|
|
1277
|
+
return cpus * JOB_LAUNCHES_PER_CPU if cpus is not None else 1
|
|
1278
|
+
|
|
1279
|
+
|
|
1280
|
+
def can_provision() -> bool:
|
|
1281
|
+
# We always prioritize terminating over provisioning, to save the cost on
|
|
1282
|
+
# idle resources.
|
|
1283
|
+
if serve_state.total_number_scheduled_to_terminate_replicas() > 0:
|
|
1284
|
+
return False
|
|
1285
|
+
return can_terminate()
|
|
1286
|
+
|
|
1287
|
+
|
|
1288
|
+
def can_start_new_process() -> bool:
|
|
1289
|
+
num_procs = (serve_state.get_num_services() * SERVE_PROC_RATIO +
|
|
1290
|
+
managed_job_state.get_num_alive_jobs())
|
|
1291
|
+
return num_procs < _get_job_parallelism()
|
|
1292
|
+
|
|
1293
|
+
|
|
1294
|
+
# We limit the number of terminating replicas to the number of CPUs. This is
|
|
1295
|
+
# just a temporary solution to avoid overwhelming the controller. After one job
|
|
1296
|
+
# controller PR, we should use API server to handle resources management.
|
|
1297
|
+
def can_terminate() -> bool:
|
|
1298
|
+
num_terminating = (
|
|
1299
|
+
serve_state.total_number_provisioning_replicas() * SERVE_LAUNCH_RATIO +
|
|
1300
|
+
# Each terminate process will take roughly the same CPUs as job launch.
|
|
1301
|
+
serve_state.total_number_terminating_replicas() +
|
|
1302
|
+
managed_job_state.get_num_launching_jobs())
|
|
1303
|
+
return num_terminating < _get_launch_parallelism()
|