skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/utils/command_runner.py
CHANGED
|
@@ -3,20 +3,30 @@ import enum
|
|
|
3
3
|
import hashlib
|
|
4
4
|
import os
|
|
5
5
|
import pathlib
|
|
6
|
+
import re
|
|
6
7
|
import shlex
|
|
8
|
+
import sys
|
|
7
9
|
import time
|
|
8
|
-
from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union
|
|
10
|
+
from typing import (Any, Callable, Dict, Iterable, List, Optional, Tuple, Type,
|
|
11
|
+
Union)
|
|
9
12
|
|
|
13
|
+
from sky import exceptions
|
|
10
14
|
from sky import sky_logging
|
|
11
15
|
from sky.skylet import constants
|
|
12
16
|
from sky.skylet import log_lib
|
|
17
|
+
from sky.utils import auth_utils
|
|
13
18
|
from sky.utils import common_utils
|
|
19
|
+
from sky.utils import context_utils
|
|
14
20
|
from sky.utils import control_master_utils
|
|
21
|
+
from sky.utils import git as git_utils
|
|
15
22
|
from sky.utils import subprocess_utils
|
|
16
23
|
from sky.utils import timeline
|
|
17
24
|
|
|
18
25
|
logger = sky_logging.init_logger(__name__)
|
|
19
26
|
|
|
27
|
+
# Pattern to extract home directory from command output
|
|
28
|
+
_HOME_DIR_PATTERN = re.compile(r'SKYPILOT_HOME_DIR: ([^\s\n]+)')
|
|
29
|
+
|
|
20
30
|
# Rsync options
|
|
21
31
|
# TODO(zhwu): This will print a per-file progress bar (with -P),
|
|
22
32
|
# shooting a lot of messages to the output. --info=progress2 is used
|
|
@@ -36,6 +46,8 @@ RSYNC_FILTER_GITIGNORE = f'--filter=\'dir-merge,- {constants.GIT_IGNORE_FILE}\''
|
|
|
36
46
|
# The git exclude file to support.
|
|
37
47
|
GIT_EXCLUDE = '.git/info/exclude'
|
|
38
48
|
RSYNC_EXCLUDE_OPTION = '--exclude-from={}'
|
|
49
|
+
# Owner and group metadata is not needed for downloads.
|
|
50
|
+
RSYNC_NO_OWNER_NO_GROUP_OPTION = '--no-owner --no-group'
|
|
39
51
|
|
|
40
52
|
_HASH_MAX_LENGTH = 10
|
|
41
53
|
_DEFAULT_CONNECT_TIMEOUT = 30
|
|
@@ -175,6 +187,28 @@ class CommandRunner:
|
|
|
175
187
|
def node_id(self) -> str:
|
|
176
188
|
return '-'.join(str(x) for x in self.node)
|
|
177
189
|
|
|
190
|
+
def _get_remote_home_dir(self) -> str:
|
|
191
|
+
# Use pattern matching to extract home directory.
|
|
192
|
+
# Some container images print MOTD when login shells start, which can
|
|
193
|
+
# contaminate command output. We use a unique pattern to extract the
|
|
194
|
+
# actual home directory reliably.
|
|
195
|
+
rc, output, stderr = self.run('echo "SKYPILOT_HOME_DIR: $(echo ~)"',
|
|
196
|
+
require_outputs=True,
|
|
197
|
+
separate_stderr=True,
|
|
198
|
+
stream_logs=False)
|
|
199
|
+
if rc != 0:
|
|
200
|
+
raise ValueError('Failed to get remote home directory: '
|
|
201
|
+
f'{output + stderr}')
|
|
202
|
+
|
|
203
|
+
# Extract home directory using pattern matching
|
|
204
|
+
home_dir_match = _HOME_DIR_PATTERN.search(output)
|
|
205
|
+
if home_dir_match:
|
|
206
|
+
remote_home_dir = home_dir_match.group(1)
|
|
207
|
+
else:
|
|
208
|
+
raise ValueError('Failed to find remote home directory identifier: '
|
|
209
|
+
f'{output + stderr}')
|
|
210
|
+
return remote_home_dir
|
|
211
|
+
|
|
178
212
|
def _get_command_to_run(
|
|
179
213
|
self,
|
|
180
214
|
cmd: Union[str, List[str]],
|
|
@@ -182,6 +216,7 @@ class CommandRunner:
|
|
|
182
216
|
separate_stderr: bool,
|
|
183
217
|
skip_num_lines: int,
|
|
184
218
|
source_bashrc: bool = False,
|
|
219
|
+
use_login: bool = True,
|
|
185
220
|
) -> str:
|
|
186
221
|
"""Returns the command to run."""
|
|
187
222
|
if isinstance(cmd, list):
|
|
@@ -192,7 +227,7 @@ class CommandRunner:
|
|
|
192
227
|
'/bin/bash',
|
|
193
228
|
'--login',
|
|
194
229
|
'-c',
|
|
195
|
-
]
|
|
230
|
+
] if use_login else ['/bin/bash', '-c']
|
|
196
231
|
if source_bashrc:
|
|
197
232
|
command += [
|
|
198
233
|
# Need this `-i` option to make sure `source ~/.bashrc` work.
|
|
@@ -226,13 +261,34 @@ class CommandRunner:
|
|
|
226
261
|
command_str = ' '.join(command)
|
|
227
262
|
return command_str
|
|
228
263
|
|
|
264
|
+
def _get_remote_home_dir_with_retry(
|
|
265
|
+
self,
|
|
266
|
+
max_retry: int,
|
|
267
|
+
get_remote_home_dir: Callable[[], str],
|
|
268
|
+
) -> str:
|
|
269
|
+
"""Returns the remote home directory with retry."""
|
|
270
|
+
backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=5)
|
|
271
|
+
retries_left = max_retry
|
|
272
|
+
assert retries_left > 0, f'max_retry {max_retry} must be positive.'
|
|
273
|
+
while retries_left >= 0:
|
|
274
|
+
try:
|
|
275
|
+
return get_remote_home_dir()
|
|
276
|
+
except Exception: # pylint: disable=broad-except
|
|
277
|
+
if retries_left == 0:
|
|
278
|
+
raise
|
|
279
|
+
sleep_time = backoff.current_backoff()
|
|
280
|
+
logger.warning(f'Failed to get remote home dir '
|
|
281
|
+
f'- retrying in {sleep_time} seconds.')
|
|
282
|
+
retries_left -= 1
|
|
283
|
+
time.sleep(sleep_time)
|
|
284
|
+
|
|
229
285
|
def _rsync(
|
|
230
286
|
self,
|
|
231
287
|
source: str,
|
|
232
288
|
target: str,
|
|
233
|
-
node_destination: str,
|
|
289
|
+
node_destination: Optional[str],
|
|
234
290
|
up: bool,
|
|
235
|
-
rsh_option: str,
|
|
291
|
+
rsh_option: Optional[str],
|
|
236
292
|
# Advanced options.
|
|
237
293
|
log_path: str = os.devnull,
|
|
238
294
|
stream_logs: bool = True,
|
|
@@ -245,23 +301,8 @@ class CommandRunner:
|
|
|
245
301
|
if prefix_command is not None:
|
|
246
302
|
rsync_command.append(prefix_command)
|
|
247
303
|
rsync_command += ['rsync', RSYNC_DISPLAY_OPTION]
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
backoff = common_utils.Backoff(initial_backoff=1,
|
|
251
|
-
max_backoff_factor=5)
|
|
252
|
-
retries_left = max_retry
|
|
253
|
-
assert retries_left > 0, f'max_retry {max_retry} must be positive.'
|
|
254
|
-
while retries_left >= 0:
|
|
255
|
-
try:
|
|
256
|
-
return get_remote_home_dir()
|
|
257
|
-
except Exception: # pylint: disable=broad-except
|
|
258
|
-
if retries_left == 0:
|
|
259
|
-
raise
|
|
260
|
-
sleep_time = backoff.current_backoff()
|
|
261
|
-
logger.warning(f'Failed to get remote home dir '
|
|
262
|
-
f'- retrying in {sleep_time} seconds.')
|
|
263
|
-
retries_left -= 1
|
|
264
|
-
time.sleep(sleep_time)
|
|
304
|
+
if not up:
|
|
305
|
+
rsync_command.append(RSYNC_NO_OWNER_NO_GROUP_OPTION)
|
|
265
306
|
|
|
266
307
|
# --filter
|
|
267
308
|
# The source is a local path, so we need to resolve it.
|
|
@@ -282,28 +323,47 @@ class CommandRunner:
|
|
|
282
323
|
RSYNC_EXCLUDE_OPTION.format(
|
|
283
324
|
shlex.quote(str(resolved_source / GIT_EXCLUDE))))
|
|
284
325
|
|
|
285
|
-
|
|
326
|
+
if rsh_option is not None:
|
|
327
|
+
rsync_command.append(f'-e {shlex.quote(rsh_option)}')
|
|
328
|
+
maybe_dest_prefix = ('' if node_destination is None else
|
|
329
|
+
f'{node_destination}:')
|
|
286
330
|
|
|
287
331
|
if up:
|
|
288
332
|
resolved_target = target
|
|
289
|
-
if
|
|
290
|
-
|
|
291
|
-
resolved_target =
|
|
333
|
+
if node_destination is None:
|
|
334
|
+
# Is a local rsync. Directly resolve the target.
|
|
335
|
+
resolved_target = str(
|
|
336
|
+
pathlib.Path(target).expanduser().resolve())
|
|
337
|
+
else:
|
|
338
|
+
if target.startswith('~'):
|
|
339
|
+
remote_home_dir = self._get_remote_home_dir_with_retry(
|
|
340
|
+
max_retry=max_retry,
|
|
341
|
+
get_remote_home_dir=get_remote_home_dir)
|
|
342
|
+
resolved_target = target.replace('~', remote_home_dir)
|
|
292
343
|
full_source_str = str(resolved_source)
|
|
293
344
|
if resolved_source.is_dir():
|
|
294
345
|
full_source_str = os.path.join(full_source_str, '')
|
|
295
346
|
rsync_command.extend([
|
|
296
347
|
f'{full_source_str!r}',
|
|
297
|
-
f'{
|
|
348
|
+
f'{maybe_dest_prefix}{resolved_target!r}',
|
|
298
349
|
])
|
|
299
350
|
else:
|
|
300
351
|
resolved_source = source
|
|
301
|
-
if
|
|
302
|
-
|
|
303
|
-
|
|
352
|
+
if node_destination is None:
|
|
353
|
+
resolved_target = str(
|
|
354
|
+
pathlib.Path(target).expanduser().resolve())
|
|
355
|
+
resolved_source = str(
|
|
356
|
+
pathlib.Path(source).expanduser().resolve())
|
|
357
|
+
else:
|
|
358
|
+
resolved_target = os.path.expanduser(target)
|
|
359
|
+
if source.startswith('~'):
|
|
360
|
+
remote_home_dir = self._get_remote_home_dir_with_retry(
|
|
361
|
+
max_retry=max_retry,
|
|
362
|
+
get_remote_home_dir=get_remote_home_dir)
|
|
363
|
+
resolved_source = source.replace('~', remote_home_dir)
|
|
304
364
|
rsync_command.extend([
|
|
305
|
-
f'{
|
|
306
|
-
f'{
|
|
365
|
+
f'{maybe_dest_prefix}{resolved_source!r}',
|
|
366
|
+
f'{resolved_target!r}',
|
|
307
367
|
])
|
|
308
368
|
command = ' '.join(rsync_command)
|
|
309
369
|
logger.debug(f'Running rsync command: {command}')
|
|
@@ -367,7 +427,6 @@ class CommandRunner:
|
|
|
367
427
|
SkyPilot but we still want to get rid of some warning messages,
|
|
368
428
|
such as SSH warnings.
|
|
369
429
|
|
|
370
|
-
|
|
371
430
|
Returns:
|
|
372
431
|
returncode
|
|
373
432
|
or
|
|
@@ -422,18 +481,120 @@ class CommandRunner:
|
|
|
422
481
|
"""Close the cached connection to the remote machine."""
|
|
423
482
|
pass
|
|
424
483
|
|
|
425
|
-
def port_forward_command(
|
|
426
|
-
|
|
427
|
-
|
|
484
|
+
def port_forward_command(
|
|
485
|
+
self,
|
|
486
|
+
port_forward: List[Tuple[int, int]],
|
|
487
|
+
connect_timeout: int = 1,
|
|
488
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
428
489
|
"""Command for forwarding ports from localhost to the remote machine.
|
|
429
490
|
|
|
430
491
|
Args:
|
|
431
492
|
port_forward: A list of ports to forward from the localhost to the
|
|
432
493
|
remote host.
|
|
433
494
|
connect_timeout: The timeout for the connection.
|
|
495
|
+
ssh_mode: The mode to use for ssh.
|
|
496
|
+
See SshMode for more details.
|
|
434
497
|
"""
|
|
435
498
|
raise NotImplementedError
|
|
436
499
|
|
|
500
|
+
@timeline.event
|
|
501
|
+
def git_clone(
|
|
502
|
+
self,
|
|
503
|
+
target_dir: str,
|
|
504
|
+
*,
|
|
505
|
+
# Advanced options.
|
|
506
|
+
log_path: str = os.devnull,
|
|
507
|
+
stream_logs: bool = True,
|
|
508
|
+
connect_timeout: Optional[int] = None,
|
|
509
|
+
max_retry: int = 1,
|
|
510
|
+
envs_and_secrets: Optional[Dict[str, str]] = None,
|
|
511
|
+
) -> None:
|
|
512
|
+
"""Clones a Git repository on the remote machine using git_clone.sh.
|
|
513
|
+
|
|
514
|
+
Note: Git environment variables (GIT_URL, GIT_BRANCH, GIT_TOKEN, etc.)
|
|
515
|
+
must be set before calling this function.
|
|
516
|
+
|
|
517
|
+
Args:
|
|
518
|
+
target_dir: Target directory where the repository will be cloned.
|
|
519
|
+
log_path: Redirect stdout/stderr to the log_path.
|
|
520
|
+
stream_logs: Stream logs to the stdout/stderr.
|
|
521
|
+
connect_timeout: timeout in seconds for the connection.
|
|
522
|
+
max_retry: The maximum number of retries for the git clone command.
|
|
523
|
+
This value must be positive.
|
|
524
|
+
envs_and_secrets: Environment variables and secrets to be set
|
|
525
|
+
before running the script.
|
|
526
|
+
Raises:
|
|
527
|
+
exceptions.CommandError: git clone command failed.
|
|
528
|
+
"""
|
|
529
|
+
# Find the git_clone.sh script path
|
|
530
|
+
git_clone_script_path = os.path.join(
|
|
531
|
+
os.path.dirname(os.path.abspath(__file__)), 'git_clone.sh')
|
|
532
|
+
|
|
533
|
+
if not os.path.exists(git_clone_script_path):
|
|
534
|
+
error_msg = f'git_clone.sh {git_clone_script_path} not found'
|
|
535
|
+
logger.error(error_msg)
|
|
536
|
+
raise exceptions.CommandError(1, '', error_msg, None)
|
|
537
|
+
|
|
538
|
+
# Remote script path (use a unique name to avoid conflicts)
|
|
539
|
+
script_hash = hashlib.md5(
|
|
540
|
+
f'{self.node_id}_{target_dir}'.encode()).hexdigest()[:8]
|
|
541
|
+
remote_script_path = f'/tmp/sky_git_clone_{script_hash}.sh'
|
|
542
|
+
|
|
543
|
+
# Step 1: Transfer the script to remote machine using rsync
|
|
544
|
+
logger.debug(
|
|
545
|
+
f'Transferring git_clone.sh to {self.node_id}:{remote_script_path}')
|
|
546
|
+
self.rsync(
|
|
547
|
+
source=git_clone_script_path,
|
|
548
|
+
target=remote_script_path,
|
|
549
|
+
up=True,
|
|
550
|
+
log_path=log_path,
|
|
551
|
+
stream_logs=False # Don't spam logs for script transfer
|
|
552
|
+
)
|
|
553
|
+
|
|
554
|
+
# Step 2: Execute the script on remote machine
|
|
555
|
+
if target_dir.startswith('~'):
|
|
556
|
+
remote_home_dir = self._get_remote_home_dir_with_retry(
|
|
557
|
+
max_retry=max_retry,
|
|
558
|
+
get_remote_home_dir=self._get_remote_home_dir)
|
|
559
|
+
target_dir = target_dir.replace('~', remote_home_dir)
|
|
560
|
+
quoted_target_dir = shlex.quote(target_dir)
|
|
561
|
+
quoted_script_path = shlex.quote(remote_script_path)
|
|
562
|
+
cmd = ''
|
|
563
|
+
log_cmd = ''
|
|
564
|
+
if envs_and_secrets:
|
|
565
|
+
for key, value in envs_and_secrets.items():
|
|
566
|
+
value = shlex.quote(value)
|
|
567
|
+
cmd += f'export {key}={value} && '
|
|
568
|
+
if (key == git_utils.GIT_TOKEN_ENV_VAR or
|
|
569
|
+
key == git_utils.GIT_SSH_KEY_ENV_VAR):
|
|
570
|
+
log_cmd += f'export {key}=******** && '
|
|
571
|
+
else:
|
|
572
|
+
log_cmd += f'export {key}={value} && '
|
|
573
|
+
exec_cmd = (f'bash {quoted_script_path} {quoted_target_dir} '
|
|
574
|
+
f'&& rm -f {quoted_script_path}')
|
|
575
|
+
cmd += exec_cmd
|
|
576
|
+
log_cmd += exec_cmd
|
|
577
|
+
|
|
578
|
+
logger.debug(f'Running git clone script on {self.node_id}: {log_cmd}')
|
|
579
|
+
|
|
580
|
+
backoff = common_utils.Backoff(initial_backoff=5, max_backoff_factor=5)
|
|
581
|
+
assert max_retry > 0, f'max_retry {max_retry} must be positive.'
|
|
582
|
+
while max_retry >= 0:
|
|
583
|
+
returncode = self.run(cmd,
|
|
584
|
+
log_path=log_path,
|
|
585
|
+
stream_logs=stream_logs,
|
|
586
|
+
connect_timeout=connect_timeout,
|
|
587
|
+
require_outputs=False)
|
|
588
|
+
if returncode == 0:
|
|
589
|
+
break
|
|
590
|
+
max_retry -= 1
|
|
591
|
+
time.sleep(backoff.current_backoff())
|
|
592
|
+
|
|
593
|
+
if returncode != 0:
|
|
594
|
+
error_msg = f'Git clone failed on {self.node_id}: {target_dir}'
|
|
595
|
+
logger.error(error_msg)
|
|
596
|
+
raise exceptions.CommandError(returncode, log_cmd, error_msg, None)
|
|
597
|
+
|
|
437
598
|
|
|
438
599
|
class SSHCommandRunner(CommandRunner):
|
|
439
600
|
"""Runner for SSH commands."""
|
|
@@ -447,6 +608,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
447
608
|
ssh_proxy_command: Optional[str] = None,
|
|
448
609
|
docker_user: Optional[str] = None,
|
|
449
610
|
disable_control_master: Optional[bool] = False,
|
|
611
|
+
port_forward_execute_remote_command: Optional[bool] = False,
|
|
450
612
|
):
|
|
451
613
|
"""Initialize SSHCommandRunner.
|
|
452
614
|
|
|
@@ -473,6 +635,10 @@ class SSHCommandRunner(CommandRunner):
|
|
|
473
635
|
disable_control_master: bool; specifies whether or not the ssh
|
|
474
636
|
command will utilize ControlMaster. We currently disable
|
|
475
637
|
it for k8s instance.
|
|
638
|
+
port_forward_execute_remote_command: bool; if True, do not
|
|
639
|
+
add -N to the port forwarding command. This is useful if you
|
|
640
|
+
want to run a command on the remote machine to make sure the
|
|
641
|
+
SSH tunnel is established.
|
|
476
642
|
"""
|
|
477
643
|
super().__init__(node)
|
|
478
644
|
ip, port = node
|
|
@@ -484,39 +650,63 @@ class SSHCommandRunner(CommandRunner):
|
|
|
484
650
|
self.disable_control_master = (
|
|
485
651
|
disable_control_master or
|
|
486
652
|
control_master_utils.should_disable_control_master())
|
|
653
|
+
# ensure the ssh key files are created from the database
|
|
654
|
+
auth_utils.create_ssh_key_files_from_db(ssh_private_key)
|
|
487
655
|
if docker_user is not None:
|
|
488
656
|
assert port is None or port == 22, (
|
|
489
657
|
f'port must be None or 22 for docker_user, got {port}.')
|
|
490
|
-
#
|
|
491
|
-
|
|
492
|
-
|
|
658
|
+
# When connecting via docker, the outer SSH hop points to the
|
|
659
|
+
# container's sshd (localhost). Preserve the user proxy for the
|
|
660
|
+
# inner hop that reaches the host VM, and clear the outer proxy to
|
|
661
|
+
# avoid forwarding localhost through the jump host.
|
|
662
|
+
inner_proxy_command = ssh_proxy_command
|
|
663
|
+
inner_proxy_port = port or 22
|
|
664
|
+
self._ssh_proxy_command = None
|
|
493
665
|
self.ip = 'localhost'
|
|
494
666
|
self.ssh_user = docker_user
|
|
495
667
|
self.port = constants.DEFAULT_DOCKER_PORT
|
|
668
|
+
if inner_proxy_command is not None:
|
|
669
|
+
# Replace %h/%p placeholders with actual host values, since the
|
|
670
|
+
# final destination from the perspective of the user proxy is
|
|
671
|
+
# the host VM (ip, inner_proxy_port).
|
|
672
|
+
inner_proxy_command = inner_proxy_command.replace('%h', ip)
|
|
673
|
+
inner_proxy_command = inner_proxy_command.replace(
|
|
674
|
+
'%p', str(inner_proxy_port))
|
|
496
675
|
self._docker_ssh_proxy_command = lambda ssh: ' '.join(
|
|
497
|
-
ssh + ssh_options_list(ssh_private_key,
|
|
498
|
-
|
|
676
|
+
ssh + ssh_options_list(ssh_private_key,
|
|
677
|
+
None,
|
|
678
|
+
ssh_proxy_command=inner_proxy_command,
|
|
679
|
+
port=inner_proxy_port,
|
|
680
|
+
disable_control_master=self.
|
|
681
|
+
disable_control_master) +
|
|
682
|
+
['-W', '%h:%p', f'{ssh_user}@{ip}'])
|
|
499
683
|
else:
|
|
500
684
|
self.ip = ip
|
|
501
685
|
self.ssh_user = ssh_user
|
|
502
686
|
self.port = port
|
|
503
687
|
self._docker_ssh_proxy_command = None
|
|
688
|
+
self.port_forward_execute_remote_command = (
|
|
689
|
+
port_forward_execute_remote_command)
|
|
504
690
|
|
|
505
|
-
def port_forward_command(
|
|
506
|
-
|
|
507
|
-
|
|
691
|
+
def port_forward_command(
|
|
692
|
+
self,
|
|
693
|
+
port_forward: List[Tuple[int, int]],
|
|
694
|
+
connect_timeout: int = 1,
|
|
695
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
508
696
|
"""Command for forwarding ports from localhost to the remote machine.
|
|
509
697
|
|
|
510
698
|
Args:
|
|
511
699
|
port_forward: A list of ports to forward from the local port to the
|
|
512
700
|
remote port.
|
|
513
701
|
connect_timeout: The timeout for the ssh connection.
|
|
702
|
+
ssh_mode: The mode to use for ssh.
|
|
703
|
+
See SshMode for more details.
|
|
514
704
|
|
|
515
705
|
Returns:
|
|
516
706
|
The command for forwarding ports from localhost to the remote
|
|
517
707
|
machine.
|
|
518
708
|
"""
|
|
519
|
-
return self.ssh_base_command(ssh_mode=
|
|
709
|
+
return self.ssh_base_command(ssh_mode=ssh_mode,
|
|
520
710
|
port_forward=port_forward,
|
|
521
711
|
connect_timeout=connect_timeout)
|
|
522
712
|
|
|
@@ -533,9 +723,13 @@ class SSHCommandRunner(CommandRunner):
|
|
|
533
723
|
ssh += ['-tt']
|
|
534
724
|
if port_forward is not None:
|
|
535
725
|
for local, remote in port_forward:
|
|
536
|
-
logger.
|
|
726
|
+
logger.debug(
|
|
537
727
|
f'Forwarding local port {local} to remote port {remote}.')
|
|
538
|
-
|
|
728
|
+
if self.port_forward_execute_remote_command:
|
|
729
|
+
ssh += ['-L']
|
|
730
|
+
else:
|
|
731
|
+
ssh += ['-NL']
|
|
732
|
+
ssh += [f'{local}:localhost:{remote}']
|
|
539
733
|
if self._docker_ssh_proxy_command is not None:
|
|
540
734
|
docker_ssh_proxy_command = self._docker_ssh_proxy_command(ssh)
|
|
541
735
|
else:
|
|
@@ -560,7 +754,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
560
754
|
if self.ssh_control_name is not None:
|
|
561
755
|
control_path = _ssh_control_path(self.ssh_control_name)
|
|
562
756
|
if control_path is not None:
|
|
563
|
-
# Suppress the `Exit request sent.` output for this
|
|
757
|
+
# Suppress the `Exit request sent.` output for this command
|
|
564
758
|
# which would interrupt the CLI spinner.
|
|
565
759
|
cmd = (f'ssh -O exit -S {control_path}/%C '
|
|
566
760
|
f'{self.ssh_user}@{self.ip} > /dev/null 2>&1')
|
|
@@ -574,6 +768,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
574
768
|
shell=True)
|
|
575
769
|
|
|
576
770
|
@timeline.event
|
|
771
|
+
@context_utils.cancellation_guard
|
|
577
772
|
def run(
|
|
578
773
|
self,
|
|
579
774
|
cmd: Union[str, List[str]],
|
|
@@ -748,9 +943,11 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
748
943
|
else:
|
|
749
944
|
return f'pod/{self.pod_name}'
|
|
750
945
|
|
|
751
|
-
def port_forward_command(
|
|
752
|
-
|
|
753
|
-
|
|
946
|
+
def port_forward_command(
|
|
947
|
+
self,
|
|
948
|
+
port_forward: List[Tuple[int, int]],
|
|
949
|
+
connect_timeout: int = 1,
|
|
950
|
+
ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
|
|
754
951
|
"""Command for forwarding ports from localhost to the remote machine.
|
|
755
952
|
|
|
756
953
|
Args:
|
|
@@ -758,14 +955,25 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
758
955
|
remote port. Currently, only one port is supported, i.e. the
|
|
759
956
|
list should have only one element.
|
|
760
957
|
connect_timeout: The timeout for the ssh connection.
|
|
958
|
+
ssh_mode: The mode to use for ssh.
|
|
959
|
+
See SshMode for more details.
|
|
761
960
|
"""
|
|
961
|
+
del ssh_mode # unused
|
|
762
962
|
assert port_forward and len(port_forward) == 1, (
|
|
763
963
|
'Only one port is supported for Kubernetes port-forward.')
|
|
764
964
|
kubectl_args = [
|
|
765
965
|
'--pod-running-timeout', f'{connect_timeout}s', '-n', self.namespace
|
|
766
966
|
]
|
|
967
|
+
# The same logic to either set `--context` to the k8s context where
|
|
968
|
+
# the sky cluster is hosted, or `--kubeconfig` to /dev/null for
|
|
969
|
+
# in-cluster k8s is used below in the `run()` method.
|
|
767
970
|
if self.context:
|
|
768
971
|
kubectl_args += ['--context', self.context]
|
|
972
|
+
# If context is None, it means the cluster is hosted on in-cluster k8s.
|
|
973
|
+
# In this case, we need to set KUBECONFIG to /dev/null to avoid looking
|
|
974
|
+
# for the cluster in whatever active context is set in the kubeconfig.
|
|
975
|
+
else:
|
|
976
|
+
kubectl_args += ['--kubeconfig', '/dev/null']
|
|
769
977
|
local_port, remote_port = port_forward[0]
|
|
770
978
|
local_port_str = f'{local_port}' if local_port is not None else ''
|
|
771
979
|
|
|
@@ -779,6 +987,7 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
779
987
|
return kubectl_cmd
|
|
780
988
|
|
|
781
989
|
@timeline.event
|
|
990
|
+
@context_utils.cancellation_guard
|
|
782
991
|
def run(
|
|
783
992
|
self,
|
|
784
993
|
cmd: Union[str, List[str]],
|
|
@@ -820,7 +1029,6 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
820
1029
|
SkyPilot but we still want to get rid of some warning messages,
|
|
821
1030
|
such as SSH warnings.
|
|
822
1031
|
|
|
823
|
-
|
|
824
1032
|
Returns:
|
|
825
1033
|
returncode
|
|
826
1034
|
or
|
|
@@ -922,23 +1130,10 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
922
1130
|
exceptions.CommandError: rsync command failed.
|
|
923
1131
|
"""
|
|
924
1132
|
|
|
925
|
-
def get_remote_home_dir() -> str:
|
|
926
|
-
# Use `echo ~` to get the remote home directory, instead of pwd or
|
|
927
|
-
# echo $HOME, because pwd can be `/` when the remote user is root
|
|
928
|
-
# and $HOME is not always set.
|
|
929
|
-
rc, remote_home_dir, stderr = self.run('echo ~',
|
|
930
|
-
require_outputs=True,
|
|
931
|
-
separate_stderr=True,
|
|
932
|
-
stream_logs=False)
|
|
933
|
-
if rc != 0:
|
|
934
|
-
raise ValueError('Failed to get remote home directory: '
|
|
935
|
-
f'{remote_home_dir + stderr}')
|
|
936
|
-
remote_home_dir = remote_home_dir.strip()
|
|
937
|
-
return remote_home_dir
|
|
938
|
-
|
|
939
1133
|
# Build command.
|
|
940
|
-
helper_path =
|
|
941
|
-
|
|
1134
|
+
helper_path = shlex.quote(
|
|
1135
|
+
os.path.join(os.path.abspath(os.path.dirname(__file__)),
|
|
1136
|
+
'kubernetes', 'rsync_helper.sh'))
|
|
942
1137
|
namespace_context = f'{self.namespace}+{self.context}'
|
|
943
1138
|
# Avoid rsync interpreting :, /, and + in namespace_context as the
|
|
944
1139
|
# default delimiter for options and arguments.
|
|
@@ -960,4 +1155,95 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
960
1155
|
# rsync with `kubectl` as the rsh command will cause ~/xx parsed as
|
|
961
1156
|
# /~/xx, so we need to replace ~ with the remote home directory. We
|
|
962
1157
|
# only need to do this when ~ is at the beginning of the path.
|
|
963
|
-
get_remote_home_dir=
|
|
1158
|
+
get_remote_home_dir=self._get_remote_home_dir)
|
|
1159
|
+
|
|
1160
|
+
|
|
1161
|
+
class LocalProcessCommandRunner(CommandRunner):
|
|
1162
|
+
"""Runner for local process commands."""
|
|
1163
|
+
|
|
1164
|
+
def __init__(self):
|
|
1165
|
+
super().__init__('local')
|
|
1166
|
+
|
|
1167
|
+
@timeline.event
|
|
1168
|
+
@context_utils.cancellation_guard
|
|
1169
|
+
def run(
|
|
1170
|
+
self,
|
|
1171
|
+
cmd: Union[str, List[str]],
|
|
1172
|
+
*,
|
|
1173
|
+
require_outputs: bool = False,
|
|
1174
|
+
port_forward: Optional[List[Tuple[int, int]]] = None,
|
|
1175
|
+
# Advanced options.
|
|
1176
|
+
log_path: str = os.devnull,
|
|
1177
|
+
# If False, do not redirect stdout/stderr to optimize performance.
|
|
1178
|
+
process_stream: bool = True,
|
|
1179
|
+
stream_logs: bool = True,
|
|
1180
|
+
ssh_mode: SshMode = SshMode.NON_INTERACTIVE,
|
|
1181
|
+
separate_stderr: bool = False,
|
|
1182
|
+
connect_timeout: Optional[int] = None,
|
|
1183
|
+
source_bashrc: bool = False,
|
|
1184
|
+
skip_num_lines: int = 0,
|
|
1185
|
+
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
|
1186
|
+
"""Use subprocess to run the command."""
|
|
1187
|
+
del port_forward, ssh_mode, connect_timeout # Unused.
|
|
1188
|
+
|
|
1189
|
+
command_str = self._get_command_to_run(cmd,
|
|
1190
|
+
process_stream,
|
|
1191
|
+
separate_stderr,
|
|
1192
|
+
skip_num_lines=skip_num_lines,
|
|
1193
|
+
source_bashrc=source_bashrc,
|
|
1194
|
+
use_login=False)
|
|
1195
|
+
|
|
1196
|
+
log_dir = os.path.expanduser(os.path.dirname(log_path))
|
|
1197
|
+
os.makedirs(log_dir, exist_ok=True)
|
|
1198
|
+
|
|
1199
|
+
executable = None
|
|
1200
|
+
command = [command_str]
|
|
1201
|
+
if not process_stream:
|
|
1202
|
+
if stream_logs:
|
|
1203
|
+
command += [
|
|
1204
|
+
f'| tee {log_path}',
|
|
1205
|
+
# This also requires the executor to be '/bin/bash' instead
|
|
1206
|
+
# of the default '/bin/sh'.
|
|
1207
|
+
'; exit ${PIPESTATUS[0]}'
|
|
1208
|
+
]
|
|
1209
|
+
else:
|
|
1210
|
+
command += [f'> {log_path}']
|
|
1211
|
+
executable = '/bin/bash'
|
|
1212
|
+
command_str = ' '.join(command)
|
|
1213
|
+
# For local process, the API server might not have this python path
|
|
1214
|
+
# setup. But this command runner should only be triggered from the API
|
|
1215
|
+
# server (in controller consolidation mode), so we can safely replace
|
|
1216
|
+
# the python path with the executable of the API server.
|
|
1217
|
+
command_str = command_str.replace(constants.SKY_PYTHON_CMD,
|
|
1218
|
+
sys.executable)
|
|
1219
|
+
logger.debug(f'Running command locally: {command_str}')
|
|
1220
|
+
return log_lib.run_with_log(command_str,
|
|
1221
|
+
log_path,
|
|
1222
|
+
require_outputs=require_outputs,
|
|
1223
|
+
stream_logs=stream_logs,
|
|
1224
|
+
process_stream=process_stream,
|
|
1225
|
+
shell=True,
|
|
1226
|
+
executable=executable,
|
|
1227
|
+
**kwargs)
|
|
1228
|
+
|
|
1229
|
+
@timeline.event
|
|
1230
|
+
def rsync(
|
|
1231
|
+
self,
|
|
1232
|
+
source: str,
|
|
1233
|
+
target: str,
|
|
1234
|
+
*,
|
|
1235
|
+
up: bool,
|
|
1236
|
+
# Advanced options.
|
|
1237
|
+
log_path: str = os.devnull,
|
|
1238
|
+
stream_logs: bool = True,
|
|
1239
|
+
max_retry: int = 1,
|
|
1240
|
+
) -> None:
|
|
1241
|
+
"""Use rsync to sync the source to the target."""
|
|
1242
|
+
self._rsync(source,
|
|
1243
|
+
target,
|
|
1244
|
+
node_destination=None,
|
|
1245
|
+
up=up,
|
|
1246
|
+
rsh_option=None,
|
|
1247
|
+
log_path=log_path,
|
|
1248
|
+
stream_logs=stream_logs,
|
|
1249
|
+
max_retry=max_retry)
|