skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/execution.py
CHANGED
|
@@ -3,8 +3,9 @@
|
|
|
3
3
|
See `Stage` for a Task's life cycle.
|
|
4
4
|
"""
|
|
5
5
|
import enum
|
|
6
|
+
import logging
|
|
6
7
|
import typing
|
|
7
|
-
from typing import List, Optional, Tuple, Union
|
|
8
|
+
from typing import Callable, List, Optional, Tuple, Union
|
|
8
9
|
|
|
9
10
|
import colorama
|
|
10
11
|
|
|
@@ -15,6 +16,8 @@ from sky import global_user_state
|
|
|
15
16
|
from sky import optimizer
|
|
16
17
|
from sky import sky_logging
|
|
17
18
|
from sky.backends import backend_utils
|
|
19
|
+
from sky.server.requests import request_names
|
|
20
|
+
from sky.skylet import autostop_lib
|
|
18
21
|
from sky.usage import usage_lib
|
|
19
22
|
from sky.utils import admin_policy_utils
|
|
20
23
|
from sky.utils import common
|
|
@@ -23,11 +26,13 @@ from sky.utils import dag_utils
|
|
|
23
26
|
from sky.utils import resources_utils
|
|
24
27
|
from sky.utils import rich_utils
|
|
25
28
|
from sky.utils import status_lib
|
|
29
|
+
from sky.utils import tempstore
|
|
26
30
|
from sky.utils import timeline
|
|
27
31
|
from sky.utils import ux_utils
|
|
28
32
|
|
|
29
33
|
if typing.TYPE_CHECKING:
|
|
30
34
|
import sky
|
|
35
|
+
from sky import resources as resources_lib
|
|
31
36
|
|
|
32
37
|
logger = sky_logging.init_logger(__name__)
|
|
33
38
|
|
|
@@ -108,16 +113,18 @@ def _execute(
|
|
|
108
113
|
stages: Optional[List[Stage]] = None,
|
|
109
114
|
cluster_name: Optional[str] = None,
|
|
110
115
|
detach_setup: bool = False,
|
|
111
|
-
detach_run: bool = False,
|
|
112
116
|
idle_minutes_to_autostop: Optional[int] = None,
|
|
113
117
|
no_setup: bool = False,
|
|
114
118
|
clone_disk_from: Optional[str] = None,
|
|
115
119
|
skip_unnecessary_provisioning: bool = False,
|
|
120
|
+
*, #keyword only separator
|
|
116
121
|
# Internal only:
|
|
117
122
|
# pylint: disable=invalid-name
|
|
123
|
+
_request_name: request_names.AdminPolicyRequestName,
|
|
118
124
|
_quiet_optimizer: bool = False,
|
|
119
125
|
_is_launched_by_jobs_controller: bool = False,
|
|
120
126
|
_is_launched_by_sky_serve_controller: bool = False,
|
|
127
|
+
job_logger: logging.Logger = logger,
|
|
121
128
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
122
129
|
"""Execute an entrypoint.
|
|
123
130
|
|
|
@@ -152,8 +159,6 @@ def _execute(
|
|
|
152
159
|
job itself. You can safely ctrl-c to detach from logging, and it will
|
|
153
160
|
not interrupt the setup process. To see the logs again after detaching,
|
|
154
161
|
use `sky logs`. To cancel setup, cancel the job via `sky cancel`.
|
|
155
|
-
detach_run: If True, as soon as a job is submitted, return from this
|
|
156
|
-
function and do not stream execution logs.
|
|
157
162
|
idle_minutes_to_autostop: int; if provided, the cluster will be set to
|
|
158
163
|
autostop after this many minutes of idleness.
|
|
159
164
|
no_setup: bool; whether to skip setup commands or not when (re-)launching.
|
|
@@ -170,26 +175,89 @@ def _execute(
|
|
|
170
175
|
handle: Optional[backends.ResourceHandle]; the handle to the cluster. None
|
|
171
176
|
if dryrun.
|
|
172
177
|
"""
|
|
173
|
-
|
|
174
178
|
dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
|
|
175
179
|
for task in dag.tasks:
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
180
|
+
for resource in task.resources:
|
|
181
|
+
# For backward compatibility, we need to override the autostop
|
|
182
|
+
# config at server-side for legacy clients. This should be set
|
|
183
|
+
# before admin policy to make the admin policy get the final
|
|
184
|
+
# value of autostop config.
|
|
185
|
+
# TODO(aylei): remove this after we bump the API version.
|
|
186
|
+
resource.override_autostop_config(
|
|
187
|
+
down=down, idle_minutes=idle_minutes_to_autostop)
|
|
188
|
+
if resource.autostop_config is not None:
|
|
189
|
+
down = resource.autostop_config.down
|
|
190
|
+
idle_minutes_to_autostop = resource.autostop_config.idle_minutes
|
|
191
|
+
with admin_policy_utils.apply_and_use_config_in_current_request(
|
|
192
|
+
dag,
|
|
193
|
+
request_name=_request_name,
|
|
194
|
+
request_options=admin_policy.RequestOptions(
|
|
195
|
+
cluster_name=cluster_name,
|
|
196
|
+
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
|
197
|
+
down=down,
|
|
198
|
+
dryrun=dryrun,
|
|
199
|
+
)) as dag:
|
|
200
|
+
dag.resolve_and_validate_volumes()
|
|
201
|
+
if (not _is_launched_by_jobs_controller and
|
|
202
|
+
not _is_launched_by_sky_serve_controller):
|
|
203
|
+
# Only process pre-mount operations on API server.
|
|
204
|
+
dag.pre_mount_volumes()
|
|
205
|
+
for task in dag.tasks:
|
|
206
|
+
if task.storage_mounts is not None:
|
|
207
|
+
for storage in task.storage_mounts.values():
|
|
208
|
+
# Ensure the storage is constructed.
|
|
209
|
+
storage.construct()
|
|
210
|
+
return _execute_dag(
|
|
211
|
+
dag,
|
|
186
212
|
dryrun=dryrun,
|
|
187
|
-
|
|
213
|
+
stream_logs=stream_logs,
|
|
214
|
+
handle=handle,
|
|
215
|
+
backend=backend,
|
|
216
|
+
retry_until_up=retry_until_up,
|
|
217
|
+
optimize_target=optimize_target,
|
|
218
|
+
stages=stages,
|
|
219
|
+
cluster_name=cluster_name,
|
|
220
|
+
detach_setup=detach_setup,
|
|
221
|
+
no_setup=no_setup,
|
|
222
|
+
clone_disk_from=clone_disk_from,
|
|
223
|
+
skip_unnecessary_provisioning=skip_unnecessary_provisioning,
|
|
224
|
+
_quiet_optimizer=_quiet_optimizer,
|
|
225
|
+
_is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
|
|
226
|
+
_is_launched_by_sky_serve_controller=
|
|
227
|
+
_is_launched_by_sky_serve_controller,
|
|
228
|
+
job_logger=job_logger)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _execute_dag(
|
|
232
|
+
dag: 'sky.Dag',
|
|
233
|
+
dryrun: bool,
|
|
234
|
+
stream_logs: bool,
|
|
235
|
+
handle: Optional[backends.ResourceHandle],
|
|
236
|
+
backend: Optional[backends.Backend],
|
|
237
|
+
retry_until_up: bool,
|
|
238
|
+
optimize_target: common.OptimizeTarget,
|
|
239
|
+
stages: Optional[List[Stage]],
|
|
240
|
+
cluster_name: Optional[str],
|
|
241
|
+
detach_setup: bool,
|
|
242
|
+
no_setup: bool,
|
|
243
|
+
clone_disk_from: Optional[str],
|
|
244
|
+
skip_unnecessary_provisioning: bool,
|
|
245
|
+
# pylint: disable=invalid-name
|
|
246
|
+
_quiet_optimizer: bool,
|
|
247
|
+
_is_launched_by_jobs_controller: bool,
|
|
248
|
+
_is_launched_by_sky_serve_controller: bool,
|
|
249
|
+
job_logger: logging.Logger = logger,
|
|
250
|
+
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
251
|
+
"""Execute a DAG.
|
|
252
|
+
|
|
253
|
+
This is an internal helper function for _execute() and is expected to be
|
|
254
|
+
called only by _execute().
|
|
255
|
+
"""
|
|
188
256
|
assert len(dag) == 1, f'We support 1 task for now. {dag}'
|
|
189
257
|
task = dag.tasks[0]
|
|
190
258
|
|
|
191
259
|
if any(r.job_recovery is not None for r in task.resources):
|
|
192
|
-
|
|
260
|
+
job_logger.warning(
|
|
193
261
|
f'{colorama.Style.DIM}The task has `job_recovery` specified, '
|
|
194
262
|
'but is launched as an unmanaged job. It will be ignored.'
|
|
195
263
|
'To enable job recovery, use managed jobs: sky jobs launch.'
|
|
@@ -197,8 +265,10 @@ def _execute(
|
|
|
197
265
|
|
|
198
266
|
cluster_exists = False
|
|
199
267
|
if cluster_name is not None:
|
|
200
|
-
|
|
201
|
-
|
|
268
|
+
# We use launched_at to check if the cluster exists, because this
|
|
269
|
+
# db query is faster than get_cluster_from_name.
|
|
270
|
+
cluster_exists = global_user_state.cluster_with_name_exists(
|
|
271
|
+
cluster_name)
|
|
202
272
|
# TODO(woosuk): If the cluster exists, print a warning that
|
|
203
273
|
# `cpus` and `memory` are not used as a job scheduling constraint,
|
|
204
274
|
# unlike `gpus`.
|
|
@@ -214,8 +284,7 @@ def _execute(
|
|
|
214
284
|
if controller is not None:
|
|
215
285
|
requested_features.add(
|
|
216
286
|
clouds.CloudImplementationFeatures.HOST_CONTROLLERS)
|
|
217
|
-
if controller_utils.high_availability_specified(cluster_name
|
|
218
|
-
skip_warning=False):
|
|
287
|
+
if controller_utils.high_availability_specified(cluster_name):
|
|
219
288
|
requested_features.add(clouds.CloudImplementationFeatures.
|
|
220
289
|
HIGH_AVAILABILITY_CONTROLLERS)
|
|
221
290
|
# If we provision a cluster that supports high availability
|
|
@@ -226,11 +295,43 @@ def _execute(
|
|
|
226
295
|
requested_features |= task.get_required_cloud_features()
|
|
227
296
|
|
|
228
297
|
backend = backend if backend is not None else backends.CloudVmRayBackend()
|
|
298
|
+
# Figure out autostop config.
|
|
299
|
+
# Note: Ideally this can happen after provisioning, so we can check the
|
|
300
|
+
# autostop config from the launched resources. Before provisioning,
|
|
301
|
+
# we aren't sure which resources will be launched, and different
|
|
302
|
+
# resources may have different autostop configs.
|
|
229
303
|
if isinstance(backend, backends.CloudVmRayBackend):
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
304
|
+
# No autostop config specified on command line, use the
|
|
305
|
+
# config from resources.
|
|
306
|
+
# TODO(cooperc): This should be done after provisioning, in order to
|
|
307
|
+
# support different autostop configs for different resources.
|
|
308
|
+
# Blockers:
|
|
309
|
+
# - Need autostop config to set requested_features before
|
|
310
|
+
# provisioning.
|
|
311
|
+
# - Need to send info message about idle_minutes_to_autostop==0 here
|
|
312
|
+
# - Need to check if autostop is supported by the backend.
|
|
313
|
+
resources = list(task.resources)
|
|
314
|
+
for resource in resources:
|
|
315
|
+
if resource.autostop_config != resources[0].autostop_config:
|
|
316
|
+
raise ValueError(
|
|
317
|
+
'All resources must have the same autostop config.')
|
|
318
|
+
resource_autostop_config = resources[0].autostop_config
|
|
319
|
+
|
|
320
|
+
idle_minutes_to_autostop: Optional[int] = None
|
|
321
|
+
down = False
|
|
322
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None
|
|
323
|
+
if resource_autostop_config is not None:
|
|
324
|
+
if resource_autostop_config.enabled:
|
|
325
|
+
idle_minutes_to_autostop = (
|
|
326
|
+
resource_autostop_config.idle_minutes)
|
|
327
|
+
down = resource_autostop_config.down
|
|
328
|
+
wait_for = resource_autostop_config.wait_for
|
|
329
|
+
else:
|
|
330
|
+
# Autostop is explicitly disabled, so cancel it if it's
|
|
331
|
+
# already set.
|
|
332
|
+
assert not resource_autostop_config.enabled
|
|
333
|
+
idle_minutes_to_autostop = -1
|
|
334
|
+
down = False
|
|
234
335
|
if idle_minutes_to_autostop is not None:
|
|
235
336
|
if idle_minutes_to_autostop == 0:
|
|
236
337
|
# idle_minutes_to_autostop=0 can cause the following problem:
|
|
@@ -239,10 +340,10 @@ def _execute(
|
|
|
239
340
|
# itself have no task running and start the auto{stop,down}
|
|
240
341
|
# process, before the task is submitted in the EXEC stage.
|
|
241
342
|
verb = 'torn down' if down else 'stopped'
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
343
|
+
job_logger.info(f'{colorama.Style.DIM}The cluster will '
|
|
344
|
+
f'be {verb} after 1 minutes of idleness '
|
|
345
|
+
'(after all jobs finish).'
|
|
346
|
+
f'{colorama.Style.RESET_ALL}')
|
|
246
347
|
idle_minutes_to_autostop = 1
|
|
247
348
|
if Stage.DOWN in stages:
|
|
248
349
|
stages.remove(Stage.DOWN)
|
|
@@ -257,27 +358,21 @@ def _execute(
|
|
|
257
358
|
# (cloud/resource) to check STOP_SPOT_INSTANCE here. This is checked in
|
|
258
359
|
# the backend.
|
|
259
360
|
|
|
260
|
-
elif idle_minutes_to_autostop is not None:
|
|
261
|
-
# TODO(zhwu): Autostop is not supported for non-CloudVmRayBackend.
|
|
262
|
-
with ux_utils.print_exception_no_traceback():
|
|
263
|
-
raise ValueError(
|
|
264
|
-
f'Backend {backend.NAME} does not support autostop, please try'
|
|
265
|
-
f' {backends.CloudVmRayBackend.NAME}')
|
|
266
|
-
|
|
267
361
|
if Stage.CLONE_DISK in stages:
|
|
268
362
|
task = _maybe_clone_disk_from_cluster(clone_disk_from, cluster_name,
|
|
269
363
|
task)
|
|
270
364
|
|
|
365
|
+
is_managed = (_is_launched_by_jobs_controller or
|
|
366
|
+
_is_launched_by_sky_serve_controller)
|
|
367
|
+
|
|
271
368
|
if not cluster_exists:
|
|
272
369
|
# If spot is launched on serve or jobs controller, we don't need to
|
|
273
370
|
# print out the hint.
|
|
274
|
-
if (Stage.PROVISION in stages and task.use_spot and
|
|
275
|
-
not _is_launched_by_jobs_controller and
|
|
276
|
-
not _is_launched_by_sky_serve_controller):
|
|
371
|
+
if (Stage.PROVISION in stages and task.use_spot and not is_managed):
|
|
277
372
|
yellow = colorama.Fore.YELLOW
|
|
278
373
|
bold = colorama.Style.BRIGHT
|
|
279
374
|
reset = colorama.Style.RESET_ALL
|
|
280
|
-
|
|
375
|
+
job_logger.info(
|
|
281
376
|
f'{yellow}Launching a spot job that does not '
|
|
282
377
|
f'automatically recover from preemptions. To '
|
|
283
378
|
'get automatic recovery, use managed job instead: '
|
|
@@ -296,7 +391,7 @@ def _execute(
|
|
|
296
391
|
controller = controller_utils.Controllers.from_name(
|
|
297
392
|
cluster_name)
|
|
298
393
|
if controller is not None:
|
|
299
|
-
|
|
394
|
+
job_logger.info(
|
|
300
395
|
f'Choosing resources for {controller.value.name}...'
|
|
301
396
|
)
|
|
302
397
|
dag = optimizer.Optimizer.optimize(dag,
|
|
@@ -305,6 +400,26 @@ def _execute(
|
|
|
305
400
|
task = dag.tasks[0] # Keep: dag may have been deep-copied.
|
|
306
401
|
assert task.best_resources is not None, task
|
|
307
402
|
|
|
403
|
+
# Note on race vs. lock: OPTIMIZE typically runs outside the per-cluster
|
|
404
|
+
# lock. After the backend acquires the lock and refreshes state, the
|
|
405
|
+
# original "do we need to optimize?" decision may be stale (e.g., the
|
|
406
|
+
# cluster just got terminated). To compensate without moving the optimizer
|
|
407
|
+
# into the backend, we inject a small planner the backend can call under
|
|
408
|
+
# the lock only when no reusable snapshot and no caller plan exist.
|
|
409
|
+
planner: Optional[Callable[['sky.Task'], 'resources_lib.Resources']] = None
|
|
410
|
+
if isinstance(backend,
|
|
411
|
+
backends.CloudVmRayBackend) and Stage.OPTIMIZE in stages:
|
|
412
|
+
|
|
413
|
+
def _planner(_t: 'sky.Task'):
|
|
414
|
+
new_dag = optimizer.Optimizer.optimize(dag,
|
|
415
|
+
minimize=optimize_target,
|
|
416
|
+
quiet=_quiet_optimizer)
|
|
417
|
+
new_task = new_dag.tasks[0]
|
|
418
|
+
assert new_task.best_resources is not None, new_task
|
|
419
|
+
return new_task.best_resources.assert_launchable()
|
|
420
|
+
|
|
421
|
+
planner = _planner
|
|
422
|
+
|
|
308
423
|
backend.register_info(
|
|
309
424
|
dag=dag,
|
|
310
425
|
optimize_target=optimize_target,
|
|
@@ -312,7 +427,9 @@ def _execute(
|
|
|
312
427
|
# That's because we want to do commands in task.setup and task.run again
|
|
313
428
|
# after K8S pod recovers from a crash.
|
|
314
429
|
# See `kubernetes-ray.yml.j2` for more details.
|
|
315
|
-
dump_final_script=is_controller_high_availability_supported
|
|
430
|
+
dump_final_script=is_controller_high_availability_supported,
|
|
431
|
+
is_managed=is_managed,
|
|
432
|
+
planner=planner)
|
|
316
433
|
|
|
317
434
|
if task.storage_mounts is not None:
|
|
318
435
|
# Optimizer should eventually choose where to store bucket
|
|
@@ -337,7 +454,7 @@ def _execute(
|
|
|
337
454
|
if handle is None:
|
|
338
455
|
assert dryrun, ('If not dryrun, handle must be set or '
|
|
339
456
|
'Stage.PROVISION must be included in stages.')
|
|
340
|
-
|
|
457
|
+
job_logger.info('Dryrun finished.')
|
|
341
458
|
return None, None
|
|
342
459
|
|
|
343
460
|
do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
|
|
@@ -346,39 +463,50 @@ def _execute(
|
|
|
346
463
|
(task.file_mounts is not None or
|
|
347
464
|
task.storage_mounts is not None))
|
|
348
465
|
if do_workdir or do_file_mounts:
|
|
349
|
-
|
|
466
|
+
job_logger.info(ux_utils.starting_message('Syncing files.'))
|
|
350
467
|
|
|
351
468
|
if do_workdir:
|
|
352
|
-
|
|
469
|
+
if cluster_name is not None:
|
|
470
|
+
global_user_state.add_cluster_event(
|
|
471
|
+
cluster_name, status_lib.ClusterStatus.INIT,
|
|
472
|
+
'Syncing files to cluster',
|
|
473
|
+
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
474
|
+
backend.sync_workdir(handle, task.workdir, task.envs_and_secrets)
|
|
353
475
|
|
|
354
476
|
if do_file_mounts:
|
|
477
|
+
if cluster_name is not None:
|
|
478
|
+
global_user_state.add_cluster_event(
|
|
479
|
+
cluster_name, status_lib.ClusterStatus.UP,
|
|
480
|
+
'Syncing file mounts',
|
|
481
|
+
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
355
482
|
backend.sync_file_mounts(handle, task.file_mounts,
|
|
356
483
|
task.storage_mounts)
|
|
357
484
|
|
|
358
485
|
if no_setup:
|
|
359
|
-
|
|
486
|
+
job_logger.info('Setup commands skipped.')
|
|
360
487
|
elif Stage.SETUP in stages and not dryrun:
|
|
361
488
|
if skip_unnecessary_provisioning and provisioning_skipped:
|
|
362
|
-
|
|
363
|
-
|
|
489
|
+
job_logger.debug('Unnecessary provisioning was skipped, so '
|
|
490
|
+
'skipping setup as well.')
|
|
364
491
|
else:
|
|
492
|
+
if cluster_name is not None:
|
|
493
|
+
global_user_state.add_cluster_event(
|
|
494
|
+
cluster_name, status_lib.ClusterStatus.UP,
|
|
495
|
+
'Running setup commands to install dependencies',
|
|
496
|
+
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
365
497
|
backend.setup(handle, task, detach_setup=detach_setup)
|
|
366
498
|
|
|
367
499
|
if Stage.PRE_EXEC in stages and not dryrun:
|
|
368
500
|
if idle_minutes_to_autostop is not None:
|
|
369
501
|
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
370
502
|
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
371
|
-
backend.set_autostop(handle,
|
|
372
|
-
|
|
373
|
-
down=down)
|
|
503
|
+
backend.set_autostop(handle, idle_minutes_to_autostop, wait_for,
|
|
504
|
+
down)
|
|
374
505
|
|
|
375
506
|
if Stage.EXEC in stages:
|
|
376
507
|
try:
|
|
377
508
|
global_user_state.update_last_use(handle.get_cluster_name())
|
|
378
|
-
job_id = backend.execute(handle,
|
|
379
|
-
task,
|
|
380
|
-
detach_run,
|
|
381
|
-
dryrun=dryrun)
|
|
509
|
+
job_id = backend.execute(handle, task, dryrun=dryrun)
|
|
382
510
|
finally:
|
|
383
511
|
# Enables post_execute() to be run after KeyboardInterrupt.
|
|
384
512
|
backend.post_execute(handle, down)
|
|
@@ -395,6 +523,9 @@ def _execute(
|
|
|
395
523
|
|
|
396
524
|
@timeline.event
|
|
397
525
|
@usage_lib.entrypoint
|
|
526
|
+
# A launch routine will share tempfiles between steps, so we init a tempdir
|
|
527
|
+
# for the launch routine and gc the entire dir after launch.
|
|
528
|
+
@tempstore.with_tempdir
|
|
398
529
|
def launch(
|
|
399
530
|
task: Union['sky.Task', 'sky.Dag'],
|
|
400
531
|
cluster_name: Optional[str] = None,
|
|
@@ -408,12 +539,16 @@ def launch(
|
|
|
408
539
|
no_setup: bool = False,
|
|
409
540
|
clone_disk_from: Optional[str] = None,
|
|
410
541
|
fast: bool = False,
|
|
542
|
+
*, #keyword only separator
|
|
411
543
|
# Internal only:
|
|
412
544
|
# pylint: disable=invalid-name
|
|
413
545
|
_quiet_optimizer: bool = False,
|
|
414
546
|
_is_launched_by_jobs_controller: bool = False,
|
|
415
547
|
_is_launched_by_sky_serve_controller: bool = False,
|
|
416
548
|
_disable_controller_check: bool = False,
|
|
549
|
+
_request_name: request_names.AdminPolicyRequestName = request_names.
|
|
550
|
+
AdminPolicyRequestName.CLUSTER_LAUNCH,
|
|
551
|
+
job_logger: logging.Logger = logger,
|
|
417
552
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
418
553
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
419
554
|
"""Launches a cluster or task.
|
|
@@ -432,7 +567,7 @@ def launch(
|
|
|
432
567
|
import sky
|
|
433
568
|
task = sky.Task(run='echo hello SkyPilot')
|
|
434
569
|
task.set_resources(
|
|
435
|
-
sky.Resources(
|
|
570
|
+
sky.Resources(infra='aws', accelerators='V100:4'))
|
|
436
571
|
sky.launch(task, cluster_name='my-cluster')
|
|
437
572
|
|
|
438
573
|
|
|
@@ -448,13 +583,16 @@ def launch(
|
|
|
448
583
|
running/pending jobs are found in the job queue. Setting this
|
|
449
584
|
flag is equivalent to running
|
|
450
585
|
``sky.launch(...)`` and then
|
|
451
|
-
``sky.autostop(idle_minutes=<minutes>)``. If
|
|
452
|
-
will
|
|
586
|
+
``sky.autostop(idle_minutes=<minutes>)``. If set, the autostop
|
|
587
|
+
config specified in the task's resources will be overridden by
|
|
588
|
+
this parameter.
|
|
453
589
|
down: Tear down the cluster after all jobs finish (successfully or
|
|
454
590
|
abnormally). If --idle-minutes-to-autostop is also set, the
|
|
455
591
|
cluster will be torn down after the specified idle time.
|
|
456
592
|
Note that if errors occur during provisioning/data syncing/setting
|
|
457
|
-
up, the cluster will not be torn down for debugging purposes.
|
|
593
|
+
up, the cluster will not be torn down for debugging purposes. If
|
|
594
|
+
set, the autostop config specified in the task's resources will be
|
|
595
|
+
overridden by this parameter.
|
|
458
596
|
dryrun: if True, do not actually launch the cluster.
|
|
459
597
|
stream_logs: if True, show the logs in the terminal.
|
|
460
598
|
backend: backend to use. If None, use the default backend
|
|
@@ -556,7 +694,6 @@ def launch(
|
|
|
556
694
|
# see the setup logs when inspecting the launch process to know
|
|
557
695
|
# exactly what the job is waiting for.
|
|
558
696
|
detach_setup = controller_utils.Controllers.from_name(cluster_name) is None
|
|
559
|
-
|
|
560
697
|
return _execute(
|
|
561
698
|
entrypoint=entrypoint,
|
|
562
699
|
dryrun=dryrun,
|
|
@@ -569,7 +706,6 @@ def launch(
|
|
|
569
706
|
stages=stages,
|
|
570
707
|
cluster_name=cluster_name,
|
|
571
708
|
detach_setup=detach_setup,
|
|
572
|
-
detach_run=True,
|
|
573
709
|
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
|
574
710
|
no_setup=no_setup,
|
|
575
711
|
clone_disk_from=clone_disk_from,
|
|
@@ -578,7 +714,12 @@ def launch(
|
|
|
578
714
|
_is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
|
|
579
715
|
_is_launched_by_sky_serve_controller=
|
|
580
716
|
_is_launched_by_sky_serve_controller,
|
|
581
|
-
|
|
717
|
+
_request_name=_request_name,
|
|
718
|
+
job_logger=job_logger)
|
|
719
|
+
|
|
720
|
+
|
|
721
|
+
# needed for backward compatibility. Remove by v0.10.7 or v0.11.0
|
|
722
|
+
cluster_launch = launch
|
|
582
723
|
|
|
583
724
|
|
|
584
725
|
@usage_lib.entrypoint
|
|
@@ -589,6 +730,7 @@ def exec( # pylint: disable=redefined-builtin
|
|
|
589
730
|
down: bool = False,
|
|
590
731
|
stream_logs: bool = True,
|
|
591
732
|
backend: Optional[backends.Backend] = None,
|
|
733
|
+
job_logger: logging.Logger = logger,
|
|
592
734
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
593
735
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
594
736
|
"""Executes a task on an existing cluster.
|
|
@@ -663,5 +805,6 @@ def exec( # pylint: disable=redefined-builtin
|
|
|
663
805
|
Stage.EXEC,
|
|
664
806
|
],
|
|
665
807
|
cluster_name=cluster_name,
|
|
666
|
-
|
|
808
|
+
job_logger=job_logger,
|
|
809
|
+
_request_name=request_names.AdminPolicyRequestName.CLUSTER_EXEC,
|
|
667
810
|
)
|