skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/execution.py
CHANGED
|
@@ -3,8 +3,9 @@
|
|
|
3
3
|
See `Stage` for a Task's life cycle.
|
|
4
4
|
"""
|
|
5
5
|
import enum
|
|
6
|
+
import logging
|
|
6
7
|
import typing
|
|
7
|
-
from typing import List, Optional, Tuple, Union
|
|
8
|
+
from typing import Callable, List, Optional, Tuple, Union
|
|
8
9
|
|
|
9
10
|
import colorama
|
|
10
11
|
|
|
@@ -14,7 +15,10 @@ from sky import clouds
|
|
|
14
15
|
from sky import global_user_state
|
|
15
16
|
from sky import optimizer
|
|
16
17
|
from sky import sky_logging
|
|
18
|
+
from sky import task as task_lib
|
|
17
19
|
from sky.backends import backend_utils
|
|
20
|
+
from sky.server.requests import request_names
|
|
21
|
+
from sky.skylet import autostop_lib
|
|
18
22
|
from sky.usage import usage_lib
|
|
19
23
|
from sky.utils import admin_policy_utils
|
|
20
24
|
from sky.utils import common
|
|
@@ -23,11 +27,13 @@ from sky.utils import dag_utils
|
|
|
23
27
|
from sky.utils import resources_utils
|
|
24
28
|
from sky.utils import rich_utils
|
|
25
29
|
from sky.utils import status_lib
|
|
30
|
+
from sky.utils import tempstore
|
|
26
31
|
from sky.utils import timeline
|
|
27
32
|
from sky.utils import ux_utils
|
|
28
33
|
|
|
29
34
|
if typing.TYPE_CHECKING:
|
|
30
35
|
import sky
|
|
36
|
+
from sky import resources as resources_lib
|
|
31
37
|
|
|
32
38
|
logger = sky_logging.init_logger(__name__)
|
|
33
39
|
|
|
@@ -108,16 +114,18 @@ def _execute(
|
|
|
108
114
|
stages: Optional[List[Stage]] = None,
|
|
109
115
|
cluster_name: Optional[str] = None,
|
|
110
116
|
detach_setup: bool = False,
|
|
111
|
-
detach_run: bool = False,
|
|
112
117
|
idle_minutes_to_autostop: Optional[int] = None,
|
|
113
118
|
no_setup: bool = False,
|
|
114
119
|
clone_disk_from: Optional[str] = None,
|
|
115
120
|
skip_unnecessary_provisioning: bool = False,
|
|
121
|
+
*, #keyword only separator
|
|
116
122
|
# Internal only:
|
|
117
123
|
# pylint: disable=invalid-name
|
|
124
|
+
_request_name: request_names.AdminPolicyRequestName,
|
|
118
125
|
_quiet_optimizer: bool = False,
|
|
119
126
|
_is_launched_by_jobs_controller: bool = False,
|
|
120
127
|
_is_launched_by_sky_serve_controller: bool = False,
|
|
128
|
+
job_logger: logging.Logger = logger,
|
|
121
129
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
122
130
|
"""Execute an entrypoint.
|
|
123
131
|
|
|
@@ -152,8 +160,6 @@ def _execute(
|
|
|
152
160
|
job itself. You can safely ctrl-c to detach from logging, and it will
|
|
153
161
|
not interrupt the setup process. To see the logs again after detaching,
|
|
154
162
|
use `sky logs`. To cancel setup, cancel the job via `sky cancel`.
|
|
155
|
-
detach_run: If True, as soon as a job is submitted, return from this
|
|
156
|
-
function and do not stream execution logs.
|
|
157
163
|
idle_minutes_to_autostop: int; if provided, the cluster will be set to
|
|
158
164
|
autostop after this many minutes of idleness.
|
|
159
165
|
no_setup: bool; whether to skip setup commands or not when (re-)launching.
|
|
@@ -170,26 +176,96 @@ def _execute(
|
|
|
170
176
|
handle: Optional[backends.ResourceHandle]; the handle to the cluster. None
|
|
171
177
|
if dryrun.
|
|
172
178
|
"""
|
|
173
|
-
|
|
179
|
+
if _request_name == request_names.AdminPolicyRequestName.CLUSTER_LAUNCH:
|
|
180
|
+
if _is_launched_by_jobs_controller:
|
|
181
|
+
_request_name = (
|
|
182
|
+
request_names.AdminPolicyRequestName.JOBS_LAUNCH_CLUSTER)
|
|
183
|
+
elif _is_launched_by_sky_serve_controller:
|
|
184
|
+
_request_name = (
|
|
185
|
+
request_names.AdminPolicyRequestName.SERVE_LAUNCH_REPLICA)
|
|
174
186
|
dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
|
|
175
187
|
for task in dag.tasks:
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
188
|
+
for resource in task.resources:
|
|
189
|
+
# For backward compatibility, we need to override the autostop
|
|
190
|
+
# config at server-side for legacy clients. This should be set
|
|
191
|
+
# before admin policy to make the admin policy get the final
|
|
192
|
+
# value of autostop config.
|
|
193
|
+
# TODO(aylei): remove this after we bump the API version.
|
|
194
|
+
resource.override_autostop_config(
|
|
195
|
+
down=down, idle_minutes=idle_minutes_to_autostop)
|
|
196
|
+
if resource.autostop_config is not None:
|
|
197
|
+
down = resource.autostop_config.down
|
|
198
|
+
idle_minutes_to_autostop = resource.autostop_config.idle_minutes
|
|
199
|
+
with admin_policy_utils.apply_and_use_config_in_current_request(
|
|
200
|
+
dag,
|
|
201
|
+
request_name=_request_name,
|
|
202
|
+
request_options=admin_policy.RequestOptions(
|
|
203
|
+
cluster_name=cluster_name,
|
|
204
|
+
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
|
205
|
+
down=down,
|
|
206
|
+
dryrun=dryrun,
|
|
207
|
+
)) as dag:
|
|
208
|
+
dag.resolve_and_validate_volumes()
|
|
209
|
+
if (not _is_launched_by_jobs_controller and
|
|
210
|
+
not _is_launched_by_sky_serve_controller):
|
|
211
|
+
# Only process pre-mount operations on API server.
|
|
212
|
+
dag.pre_mount_volumes()
|
|
213
|
+
for task in dag.tasks:
|
|
214
|
+
if task.storage_mounts is not None:
|
|
215
|
+
for storage in task.storage_mounts.values():
|
|
216
|
+
# Ensure the storage is constructed.
|
|
217
|
+
storage.construct()
|
|
218
|
+
return _execute_dag(
|
|
219
|
+
dag,
|
|
186
220
|
dryrun=dryrun,
|
|
187
|
-
|
|
221
|
+
stream_logs=stream_logs,
|
|
222
|
+
handle=handle,
|
|
223
|
+
backend=backend,
|
|
224
|
+
retry_until_up=retry_until_up,
|
|
225
|
+
optimize_target=optimize_target,
|
|
226
|
+
stages=stages,
|
|
227
|
+
cluster_name=cluster_name,
|
|
228
|
+
detach_setup=detach_setup,
|
|
229
|
+
no_setup=no_setup,
|
|
230
|
+
clone_disk_from=clone_disk_from,
|
|
231
|
+
skip_unnecessary_provisioning=skip_unnecessary_provisioning,
|
|
232
|
+
_quiet_optimizer=_quiet_optimizer,
|
|
233
|
+
_is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
|
|
234
|
+
_is_launched_by_sky_serve_controller=
|
|
235
|
+
_is_launched_by_sky_serve_controller,
|
|
236
|
+
job_logger=job_logger)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _execute_dag(
|
|
240
|
+
dag: 'sky.Dag',
|
|
241
|
+
dryrun: bool,
|
|
242
|
+
stream_logs: bool,
|
|
243
|
+
handle: Optional[backends.ResourceHandle],
|
|
244
|
+
backend: Optional[backends.Backend],
|
|
245
|
+
retry_until_up: bool,
|
|
246
|
+
optimize_target: common.OptimizeTarget,
|
|
247
|
+
stages: Optional[List[Stage]],
|
|
248
|
+
cluster_name: Optional[str],
|
|
249
|
+
detach_setup: bool,
|
|
250
|
+
no_setup: bool,
|
|
251
|
+
clone_disk_from: Optional[str],
|
|
252
|
+
skip_unnecessary_provisioning: bool,
|
|
253
|
+
# pylint: disable=invalid-name
|
|
254
|
+
_quiet_optimizer: bool,
|
|
255
|
+
_is_launched_by_jobs_controller: bool,
|
|
256
|
+
_is_launched_by_sky_serve_controller: bool,
|
|
257
|
+
job_logger: logging.Logger = logger,
|
|
258
|
+
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
259
|
+
"""Execute a DAG.
|
|
260
|
+
|
|
261
|
+
This is an internal helper function for _execute() and is expected to be
|
|
262
|
+
called only by _execute().
|
|
263
|
+
"""
|
|
188
264
|
assert len(dag) == 1, f'We support 1 task for now. {dag}'
|
|
189
265
|
task = dag.tasks[0]
|
|
190
266
|
|
|
191
267
|
if any(r.job_recovery is not None for r in task.resources):
|
|
192
|
-
|
|
268
|
+
job_logger.warning(
|
|
193
269
|
f'{colorama.Style.DIM}The task has `job_recovery` specified, '
|
|
194
270
|
'but is launched as an unmanaged job. It will be ignored.'
|
|
195
271
|
'To enable job recovery, use managed jobs: sky jobs launch.'
|
|
@@ -197,8 +273,10 @@ def _execute(
|
|
|
197
273
|
|
|
198
274
|
cluster_exists = False
|
|
199
275
|
if cluster_name is not None:
|
|
200
|
-
|
|
201
|
-
|
|
276
|
+
# We use launched_at to check if the cluster exists, because this
|
|
277
|
+
# db query is faster than get_cluster_from_name.
|
|
278
|
+
cluster_exists = global_user_state.cluster_with_name_exists(
|
|
279
|
+
cluster_name)
|
|
202
280
|
# TODO(woosuk): If the cluster exists, print a warning that
|
|
203
281
|
# `cpus` and `memory` are not used as a job scheduling constraint,
|
|
204
282
|
# unlike `gpus`.
|
|
@@ -214,8 +292,7 @@ def _execute(
|
|
|
214
292
|
if controller is not None:
|
|
215
293
|
requested_features.add(
|
|
216
294
|
clouds.CloudImplementationFeatures.HOST_CONTROLLERS)
|
|
217
|
-
if controller_utils.high_availability_specified(cluster_name
|
|
218
|
-
skip_warning=False):
|
|
295
|
+
if controller_utils.high_availability_specified(cluster_name):
|
|
219
296
|
requested_features.add(clouds.CloudImplementationFeatures.
|
|
220
297
|
HIGH_AVAILABILITY_CONTROLLERS)
|
|
221
298
|
# If we provision a cluster that supports high availability
|
|
@@ -226,11 +303,43 @@ def _execute(
|
|
|
226
303
|
requested_features |= task.get_required_cloud_features()
|
|
227
304
|
|
|
228
305
|
backend = backend if backend is not None else backends.CloudVmRayBackend()
|
|
306
|
+
# Figure out autostop config.
|
|
307
|
+
# Note: Ideally this can happen after provisioning, so we can check the
|
|
308
|
+
# autostop config from the launched resources. Before provisioning,
|
|
309
|
+
# we aren't sure which resources will be launched, and different
|
|
310
|
+
# resources may have different autostop configs.
|
|
229
311
|
if isinstance(backend, backends.CloudVmRayBackend):
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
312
|
+
# No autostop config specified on command line, use the
|
|
313
|
+
# config from resources.
|
|
314
|
+
# TODO(cooperc): This should be done after provisioning, in order to
|
|
315
|
+
# support different autostop configs for different resources.
|
|
316
|
+
# Blockers:
|
|
317
|
+
# - Need autostop config to set requested_features before
|
|
318
|
+
# provisioning.
|
|
319
|
+
# - Need to send info message about idle_minutes_to_autostop==0 here
|
|
320
|
+
# - Need to check if autostop is supported by the backend.
|
|
321
|
+
resources = list(task.resources)
|
|
322
|
+
for resource in resources:
|
|
323
|
+
if resource.autostop_config != resources[0].autostop_config:
|
|
324
|
+
raise ValueError(
|
|
325
|
+
'All resources must have the same autostop config.')
|
|
326
|
+
resource_autostop_config = resources[0].autostop_config
|
|
327
|
+
|
|
328
|
+
idle_minutes_to_autostop: Optional[int] = None
|
|
329
|
+
down = False
|
|
330
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None
|
|
331
|
+
if resource_autostop_config is not None:
|
|
332
|
+
if resource_autostop_config.enabled:
|
|
333
|
+
idle_minutes_to_autostop = (
|
|
334
|
+
resource_autostop_config.idle_minutes)
|
|
335
|
+
down = resource_autostop_config.down
|
|
336
|
+
wait_for = resource_autostop_config.wait_for
|
|
337
|
+
else:
|
|
338
|
+
# Autostop is explicitly disabled, so cancel it if it's
|
|
339
|
+
# already set.
|
|
340
|
+
assert not resource_autostop_config.enabled
|
|
341
|
+
idle_minutes_to_autostop = -1
|
|
342
|
+
down = False
|
|
234
343
|
if idle_minutes_to_autostop is not None:
|
|
235
344
|
if idle_minutes_to_autostop == 0:
|
|
236
345
|
# idle_minutes_to_autostop=0 can cause the following problem:
|
|
@@ -239,10 +348,10 @@ def _execute(
|
|
|
239
348
|
# itself have no task running and start the auto{stop,down}
|
|
240
349
|
# process, before the task is submitted in the EXEC stage.
|
|
241
350
|
verb = 'torn down' if down else 'stopped'
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
351
|
+
job_logger.info(f'{colorama.Style.DIM}The cluster will '
|
|
352
|
+
f'be {verb} after 1 minutes of idleness '
|
|
353
|
+
'(after all jobs finish).'
|
|
354
|
+
f'{colorama.Style.RESET_ALL}')
|
|
246
355
|
idle_minutes_to_autostop = 1
|
|
247
356
|
if Stage.DOWN in stages:
|
|
248
357
|
stages.remove(Stage.DOWN)
|
|
@@ -257,27 +366,21 @@ def _execute(
|
|
|
257
366
|
# (cloud/resource) to check STOP_SPOT_INSTANCE here. This is checked in
|
|
258
367
|
# the backend.
|
|
259
368
|
|
|
260
|
-
elif idle_minutes_to_autostop is not None:
|
|
261
|
-
# TODO(zhwu): Autostop is not supported for non-CloudVmRayBackend.
|
|
262
|
-
with ux_utils.print_exception_no_traceback():
|
|
263
|
-
raise ValueError(
|
|
264
|
-
f'Backend {backend.NAME} does not support autostop, please try'
|
|
265
|
-
f' {backends.CloudVmRayBackend.NAME}')
|
|
266
|
-
|
|
267
369
|
if Stage.CLONE_DISK in stages:
|
|
268
370
|
task = _maybe_clone_disk_from_cluster(clone_disk_from, cluster_name,
|
|
269
371
|
task)
|
|
270
372
|
|
|
373
|
+
is_managed = (_is_launched_by_jobs_controller or
|
|
374
|
+
_is_launched_by_sky_serve_controller)
|
|
375
|
+
|
|
271
376
|
if not cluster_exists:
|
|
272
377
|
# If spot is launched on serve or jobs controller, we don't need to
|
|
273
378
|
# print out the hint.
|
|
274
|
-
if (Stage.PROVISION in stages and task.use_spot and
|
|
275
|
-
not _is_launched_by_jobs_controller and
|
|
276
|
-
not _is_launched_by_sky_serve_controller):
|
|
379
|
+
if (Stage.PROVISION in stages and task.use_spot and not is_managed):
|
|
277
380
|
yellow = colorama.Fore.YELLOW
|
|
278
381
|
bold = colorama.Style.BRIGHT
|
|
279
382
|
reset = colorama.Style.RESET_ALL
|
|
280
|
-
|
|
383
|
+
job_logger.info(
|
|
281
384
|
f'{yellow}Launching a spot job that does not '
|
|
282
385
|
f'automatically recover from preemptions. To '
|
|
283
386
|
'get automatic recovery, use managed job instead: '
|
|
@@ -296,7 +399,7 @@ def _execute(
|
|
|
296
399
|
controller = controller_utils.Controllers.from_name(
|
|
297
400
|
cluster_name)
|
|
298
401
|
if controller is not None:
|
|
299
|
-
|
|
402
|
+
job_logger.info(
|
|
300
403
|
f'Choosing resources for {controller.value.name}...'
|
|
301
404
|
)
|
|
302
405
|
dag = optimizer.Optimizer.optimize(dag,
|
|
@@ -305,6 +408,26 @@ def _execute(
|
|
|
305
408
|
task = dag.tasks[0] # Keep: dag may have been deep-copied.
|
|
306
409
|
assert task.best_resources is not None, task
|
|
307
410
|
|
|
411
|
+
# Note on race vs. lock: OPTIMIZE typically runs outside the per-cluster
|
|
412
|
+
# lock. After the backend acquires the lock and refreshes state, the
|
|
413
|
+
# original "do we need to optimize?" decision may be stale (e.g., the
|
|
414
|
+
# cluster just got terminated). To compensate without moving the optimizer
|
|
415
|
+
# into the backend, we inject a small planner the backend can call under
|
|
416
|
+
# the lock only when no reusable snapshot and no caller plan exist.
|
|
417
|
+
planner: Optional[Callable[['sky.Task'], 'resources_lib.Resources']] = None
|
|
418
|
+
if isinstance(backend,
|
|
419
|
+
backends.CloudVmRayBackend) and Stage.OPTIMIZE in stages:
|
|
420
|
+
|
|
421
|
+
def _planner(_t: 'sky.Task'):
|
|
422
|
+
new_dag = optimizer.Optimizer.optimize(dag,
|
|
423
|
+
minimize=optimize_target,
|
|
424
|
+
quiet=_quiet_optimizer)
|
|
425
|
+
new_task = new_dag.tasks[0]
|
|
426
|
+
assert new_task.best_resources is not None, new_task
|
|
427
|
+
return new_task.best_resources.assert_launchable()
|
|
428
|
+
|
|
429
|
+
planner = _planner
|
|
430
|
+
|
|
308
431
|
backend.register_info(
|
|
309
432
|
dag=dag,
|
|
310
433
|
optimize_target=optimize_target,
|
|
@@ -312,7 +435,9 @@ def _execute(
|
|
|
312
435
|
# That's because we want to do commands in task.setup and task.run again
|
|
313
436
|
# after K8S pod recovers from a crash.
|
|
314
437
|
# See `kubernetes-ray.yml.j2` for more details.
|
|
315
|
-
dump_final_script=is_controller_high_availability_supported
|
|
438
|
+
dump_final_script=is_controller_high_availability_supported,
|
|
439
|
+
is_managed=is_managed,
|
|
440
|
+
planner=planner)
|
|
316
441
|
|
|
317
442
|
if task.storage_mounts is not None:
|
|
318
443
|
# Optimizer should eventually choose where to store bucket
|
|
@@ -337,7 +462,7 @@ def _execute(
|
|
|
337
462
|
if handle is None:
|
|
338
463
|
assert dryrun, ('If not dryrun, handle must be set or '
|
|
339
464
|
'Stage.PROVISION must be included in stages.')
|
|
340
|
-
|
|
465
|
+
job_logger.info('Dryrun finished.')
|
|
341
466
|
return None, None
|
|
342
467
|
|
|
343
468
|
do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
|
|
@@ -346,39 +471,52 @@ def _execute(
|
|
|
346
471
|
(task.file_mounts is not None or
|
|
347
472
|
task.storage_mounts is not None))
|
|
348
473
|
if do_workdir or do_file_mounts:
|
|
349
|
-
|
|
474
|
+
job_logger.info(ux_utils.starting_message('Syncing files.'))
|
|
350
475
|
|
|
351
476
|
if do_workdir:
|
|
352
|
-
|
|
477
|
+
if cluster_name is not None:
|
|
478
|
+
global_user_state.add_cluster_event(
|
|
479
|
+
cluster_name, status_lib.ClusterStatus.INIT,
|
|
480
|
+
'Syncing files to cluster',
|
|
481
|
+
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
482
|
+
envs_and_secrets = task_lib.get_plaintext_envs_and_secrets(
|
|
483
|
+
task.envs_and_secrets)
|
|
484
|
+
backend.sync_workdir(handle, task.workdir, envs_and_secrets)
|
|
353
485
|
|
|
354
486
|
if do_file_mounts:
|
|
487
|
+
if cluster_name is not None:
|
|
488
|
+
global_user_state.add_cluster_event(
|
|
489
|
+
cluster_name, status_lib.ClusterStatus.UP,
|
|
490
|
+
'Syncing file mounts',
|
|
491
|
+
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
355
492
|
backend.sync_file_mounts(handle, task.file_mounts,
|
|
356
493
|
task.storage_mounts)
|
|
357
494
|
|
|
358
495
|
if no_setup:
|
|
359
|
-
|
|
496
|
+
job_logger.info('Setup commands skipped.')
|
|
360
497
|
elif Stage.SETUP in stages and not dryrun:
|
|
361
498
|
if skip_unnecessary_provisioning and provisioning_skipped:
|
|
362
|
-
|
|
363
|
-
|
|
499
|
+
job_logger.debug('Unnecessary provisioning was skipped, so '
|
|
500
|
+
'skipping setup as well.')
|
|
364
501
|
else:
|
|
502
|
+
if cluster_name is not None:
|
|
503
|
+
global_user_state.add_cluster_event(
|
|
504
|
+
cluster_name, status_lib.ClusterStatus.UP,
|
|
505
|
+
'Running setup commands to install dependencies',
|
|
506
|
+
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
365
507
|
backend.setup(handle, task, detach_setup=detach_setup)
|
|
366
508
|
|
|
367
509
|
if Stage.PRE_EXEC in stages and not dryrun:
|
|
368
510
|
if idle_minutes_to_autostop is not None:
|
|
369
511
|
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
370
512
|
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
371
|
-
backend.set_autostop(handle,
|
|
372
|
-
|
|
373
|
-
down=down)
|
|
513
|
+
backend.set_autostop(handle, idle_minutes_to_autostop, wait_for,
|
|
514
|
+
down)
|
|
374
515
|
|
|
375
516
|
if Stage.EXEC in stages:
|
|
376
517
|
try:
|
|
377
518
|
global_user_state.update_last_use(handle.get_cluster_name())
|
|
378
|
-
job_id = backend.execute(handle,
|
|
379
|
-
task,
|
|
380
|
-
detach_run,
|
|
381
|
-
dryrun=dryrun)
|
|
519
|
+
job_id = backend.execute(handle, task, dryrun=dryrun)
|
|
382
520
|
finally:
|
|
383
521
|
# Enables post_execute() to be run after KeyboardInterrupt.
|
|
384
522
|
backend.post_execute(handle, down)
|
|
@@ -395,6 +533,9 @@ def _execute(
|
|
|
395
533
|
|
|
396
534
|
@timeline.event
|
|
397
535
|
@usage_lib.entrypoint
|
|
536
|
+
# A launch routine will share tempfiles between steps, so we init a tempdir
|
|
537
|
+
# for the launch routine and gc the entire dir after launch.
|
|
538
|
+
@tempstore.with_tempdir
|
|
398
539
|
def launch(
|
|
399
540
|
task: Union['sky.Task', 'sky.Dag'],
|
|
400
541
|
cluster_name: Optional[str] = None,
|
|
@@ -408,12 +549,16 @@ def launch(
|
|
|
408
549
|
no_setup: bool = False,
|
|
409
550
|
clone_disk_from: Optional[str] = None,
|
|
410
551
|
fast: bool = False,
|
|
552
|
+
*, #keyword only separator
|
|
411
553
|
# Internal only:
|
|
412
554
|
# pylint: disable=invalid-name
|
|
413
555
|
_quiet_optimizer: bool = False,
|
|
414
556
|
_is_launched_by_jobs_controller: bool = False,
|
|
415
557
|
_is_launched_by_sky_serve_controller: bool = False,
|
|
416
558
|
_disable_controller_check: bool = False,
|
|
559
|
+
_request_name: request_names.AdminPolicyRequestName = request_names.
|
|
560
|
+
AdminPolicyRequestName.CLUSTER_LAUNCH,
|
|
561
|
+
job_logger: logging.Logger = logger,
|
|
417
562
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
418
563
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
419
564
|
"""Launches a cluster or task.
|
|
@@ -432,7 +577,7 @@ def launch(
|
|
|
432
577
|
import sky
|
|
433
578
|
task = sky.Task(run='echo hello SkyPilot')
|
|
434
579
|
task.set_resources(
|
|
435
|
-
sky.Resources(
|
|
580
|
+
sky.Resources(infra='aws', accelerators='V100:4'))
|
|
436
581
|
sky.launch(task, cluster_name='my-cluster')
|
|
437
582
|
|
|
438
583
|
|
|
@@ -448,13 +593,16 @@ def launch(
|
|
|
448
593
|
running/pending jobs are found in the job queue. Setting this
|
|
449
594
|
flag is equivalent to running
|
|
450
595
|
``sky.launch(...)`` and then
|
|
451
|
-
``sky.autostop(idle_minutes=<minutes>)``. If
|
|
452
|
-
will
|
|
596
|
+
``sky.autostop(idle_minutes=<minutes>)``. If set, the autostop
|
|
597
|
+
config specified in the task' resources will be overridden by
|
|
598
|
+
this parameter.
|
|
453
599
|
down: Tear down the cluster after all jobs finish (successfully or
|
|
454
600
|
abnormally). If --idle-minutes-to-autostop is also set, the
|
|
455
601
|
cluster will be torn down after the specified idle time.
|
|
456
602
|
Note that if errors occur during provisioning/data syncing/setting
|
|
457
|
-
up, the cluster will not be torn down for debugging purposes.
|
|
603
|
+
up, the cluster will not be torn down for debugging purposes. If
|
|
604
|
+
set, the autostop config specified in the task' resources will be
|
|
605
|
+
overridden by this parameter.
|
|
458
606
|
dryrun: if True, do not actually launch the cluster.
|
|
459
607
|
stream_logs: if True, show the logs in the terminal.
|
|
460
608
|
backend: backend to use. If None, use the default backend
|
|
@@ -556,7 +704,6 @@ def launch(
|
|
|
556
704
|
# see the setup logs when inspecting the launch process to know
|
|
557
705
|
# excatly what the job is waiting for.
|
|
558
706
|
detach_setup = controller_utils.Controllers.from_name(cluster_name) is None
|
|
559
|
-
|
|
560
707
|
return _execute(
|
|
561
708
|
entrypoint=entrypoint,
|
|
562
709
|
dryrun=dryrun,
|
|
@@ -569,7 +716,6 @@ def launch(
|
|
|
569
716
|
stages=stages,
|
|
570
717
|
cluster_name=cluster_name,
|
|
571
718
|
detach_setup=detach_setup,
|
|
572
|
-
detach_run=True,
|
|
573
719
|
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
|
574
720
|
no_setup=no_setup,
|
|
575
721
|
clone_disk_from=clone_disk_from,
|
|
@@ -578,7 +724,12 @@ def launch(
|
|
|
578
724
|
_is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
|
|
579
725
|
_is_launched_by_sky_serve_controller=
|
|
580
726
|
_is_launched_by_sky_serve_controller,
|
|
581
|
-
|
|
727
|
+
_request_name=_request_name,
|
|
728
|
+
job_logger=job_logger)
|
|
729
|
+
|
|
730
|
+
|
|
731
|
+
# needed for backward compatibility. Remove by v0.12.0
|
|
732
|
+
cluster_launch = launch
|
|
582
733
|
|
|
583
734
|
|
|
584
735
|
@usage_lib.entrypoint
|
|
@@ -589,6 +740,7 @@ def exec( # pylint: disable=redefined-builtin
|
|
|
589
740
|
down: bool = False,
|
|
590
741
|
stream_logs: bool = True,
|
|
591
742
|
backend: Optional[backends.Backend] = None,
|
|
743
|
+
job_logger: logging.Logger = logger,
|
|
592
744
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
593
745
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
594
746
|
"""Executes a task on an existing cluster.
|
|
@@ -663,5 +815,6 @@ def exec( # pylint: disable=redefined-builtin
|
|
|
663
815
|
Stage.EXEC,
|
|
664
816
|
],
|
|
665
817
|
cluster_name=cluster_name,
|
|
666
|
-
|
|
818
|
+
job_logger=job_logger,
|
|
819
|
+
_request_name=request_names.AdminPolicyRequestName.CLUSTER_EXEC,
|
|
667
820
|
)
|