skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/provision/docker_utils.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import dataclasses
|
|
4
4
|
import shlex
|
|
5
5
|
import time
|
|
6
|
-
from typing import Any, Dict, List
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
7
|
|
|
8
8
|
from sky import sky_logging
|
|
9
9
|
from sky.skylet import constants
|
|
@@ -15,23 +15,52 @@ logger = sky_logging.init_logger(__name__)
|
|
|
15
15
|
# Configure environment variables. A docker image can have environment variables
|
|
16
16
|
# set in the Dockerfile with `ENV`. We need to export these variables to the
|
|
17
17
|
# shell environment, so that our ssh session can access them.
|
|
18
|
+
# Filter out RAY_RUNTIME_ENV_HOOK to prevent Ray version conflicts.
|
|
19
|
+
# Docker images with Ray 2.48.0+ set this for UV package manager support,
|
|
20
|
+
# but it causes FAILED_DRIVER errors with SkyPilot's Ray 2.9.3.
|
|
21
|
+
# See: https://github.com/skypilot-org/skypilot/pull/7181
|
|
18
22
|
SETUP_ENV_VARS_CMD = (
|
|
19
23
|
'prefix_cmd() '
|
|
20
24
|
'{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
|
|
21
|
-
'export -p > ~/container_env_var.sh && '
|
|
25
|
+
'export -p | grep -v RAY_RUNTIME_ENV_HOOK > ~/container_env_var.sh && '
|
|
22
26
|
'$(prefix_cmd) '
|
|
23
27
|
'mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;')
|
|
24
28
|
|
|
25
29
|
# Docker daemon may not be ready when the machine is first started. The error
|
|
26
30
|
# message starts with the following string. We should wait for a while and retry
|
|
27
31
|
# the command.
|
|
28
|
-
DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
|
|
29
|
-
'the Docker daemon socket')
|
|
32
|
+
DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to ')
|
|
30
33
|
|
|
31
34
|
DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')
|
|
35
|
+
DOCKER_SOCKET_NOT_READY_STR_2 = (
|
|
36
|
+
'check if the path is correct and if the daemon is running')
|
|
32
37
|
|
|
33
38
|
_DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30
|
|
34
39
|
|
|
40
|
+
# Install AWS CLI v2 (not v1 from pip) as it's required for ECR authentication.
|
|
41
|
+
# AWS CLI v2 is installed as a standalone binary, not a Python package. See:
|
|
42
|
+
# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
|
|
43
|
+
INSTALL_AWS_CLI_CMD = (
|
|
44
|
+
'which aws || ((command -v unzip >/dev/null 2>&1 || '
|
|
45
|
+
'(sudo apt-get update && sudo apt-get install -y unzip)) && '
|
|
46
|
+
'curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" '
|
|
47
|
+
'-o "/tmp/awscliv2.zip" && '
|
|
48
|
+
'unzip -q /tmp/awscliv2.zip -d /tmp && sudo /tmp/aws/install '
|
|
49
|
+
'&& rm -rf /tmp/awscliv2.zip /tmp/aws)')
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _extract_region_from_ecr_server(server: str) -> str:
|
|
53
|
+
"""Extract AWS region from ECR server URL.
|
|
54
|
+
|
|
55
|
+
ECR server format: <account-id>.dkr.ecr.<region>.amazonaws.com
|
|
56
|
+
Returns the region part from the URL.
|
|
57
|
+
"""
|
|
58
|
+
# Split: ['<account-id>', 'dkr', 'ecr', '<region>', 'amazonaws', 'com']
|
|
59
|
+
parts = server.split('.')
|
|
60
|
+
if len(parts) >= 6 and parts[1] == 'dkr' and parts[2] == 'ecr':
|
|
61
|
+
return parts[3]
|
|
62
|
+
raise ValueError(f'Invalid ECR server format: {server}')
|
|
63
|
+
|
|
35
64
|
|
|
36
65
|
@dataclasses.dataclass
|
|
37
66
|
class DockerLoginConfig:
|
|
@@ -83,6 +112,21 @@ def check_docker_image(cname, docker_cmd):
|
|
|
83
112
|
return _check_helper(cname, '.Config.Image', docker_cmd)
|
|
84
113
|
|
|
85
114
|
|
|
115
|
+
def maybe_remove_container_cmds(container_name, docker_cmd):
|
|
116
|
+
"""Remove the container if it exists. If not, it will be a no-op.
|
|
117
|
+
"""
|
|
118
|
+
docker_rm = [
|
|
119
|
+
docker_cmd,
|
|
120
|
+
'rm',
|
|
121
|
+
'-f',
|
|
122
|
+
container_name,
|
|
123
|
+
'2>/dev/null',
|
|
124
|
+
'||',
|
|
125
|
+
'true',
|
|
126
|
+
]
|
|
127
|
+
return ' '.join(docker_rm)
|
|
128
|
+
|
|
129
|
+
|
|
86
130
|
def docker_start_cmds(
|
|
87
131
|
image,
|
|
88
132
|
container_name,
|
|
@@ -142,19 +186,23 @@ class DockerInitializer:
|
|
|
142
186
|
self.docker_config = docker_config
|
|
143
187
|
self.container_name = docker_config['container_name']
|
|
144
188
|
self.runner = runner
|
|
145
|
-
self.home_dir = None
|
|
189
|
+
self.home_dir: Optional[str] = None
|
|
146
190
|
self.initialized = False
|
|
147
191
|
# podman is not fully tested yet.
|
|
148
192
|
use_podman = docker_config.get('use_podman', False)
|
|
149
193
|
self.docker_cmd = 'podman' if use_podman else 'docker'
|
|
150
194
|
self.log_path = log_path
|
|
151
195
|
|
|
152
|
-
def _run(
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
196
|
+
def _run(
|
|
197
|
+
self,
|
|
198
|
+
cmd,
|
|
199
|
+
run_env='host',
|
|
200
|
+
wait_for_docker_daemon: bool = False,
|
|
201
|
+
separate_stderr: bool = False,
|
|
202
|
+
log_err_when_fail: bool = True,
|
|
203
|
+
flock_name: Optional[str] = None,
|
|
204
|
+
flock_args: Optional[str] = None,
|
|
205
|
+
) -> str:
|
|
158
206
|
|
|
159
207
|
if run_env == 'docker':
|
|
160
208
|
cmd = self._docker_expand_user(cmd, any_char=True)
|
|
@@ -163,8 +211,13 @@ class DockerInitializer:
|
|
|
163
211
|
# an error: `the input device is not a TTY`, and it works without
|
|
164
212
|
# `-it` flag.
|
|
165
213
|
# TODO(zhwu): ray use the `-it` flag, we need to check why.
|
|
166
|
-
cmd = (f'{self.docker_cmd} exec {self.container_name}
|
|
167
|
-
f' {shlex.quote(cmd)} ')
|
|
214
|
+
cmd = (f'{self.docker_cmd} exec -u 0 {self.container_name}'
|
|
215
|
+
f' /bin/bash -c {shlex.quote(cmd)} ')
|
|
216
|
+
|
|
217
|
+
if flock_name is not None:
|
|
218
|
+
flock_args = flock_args or ''
|
|
219
|
+
cmd = (f'flock {flock_args} /tmp/{flock_name} '
|
|
220
|
+
f'-c {shlex.quote(cmd)}')
|
|
168
221
|
|
|
169
222
|
logger.debug(f'+ {cmd}')
|
|
170
223
|
start = time.time()
|
|
@@ -176,7 +229,8 @@ class DockerInitializer:
|
|
|
176
229
|
separate_stderr=separate_stderr,
|
|
177
230
|
log_path=self.log_path)
|
|
178
231
|
if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr or
|
|
179
|
-
DOCKER_SOCKET_NOT_READY_STR in stdout + stderr
|
|
232
|
+
DOCKER_SOCKET_NOT_READY_STR in stdout + stderr or
|
|
233
|
+
DOCKER_SOCKET_NOT_READY_STR_2 in stdout + stderr):
|
|
180
234
|
if wait_for_docker_daemon:
|
|
181
235
|
if time.time(
|
|
182
236
|
) - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
|
|
@@ -216,20 +270,56 @@ class DockerInitializer:
|
|
|
216
270
|
if self._check_container_exited():
|
|
217
271
|
self.initialized = True
|
|
218
272
|
self._run(f'{self.docker_cmd} start {self.container_name}')
|
|
219
|
-
self._run('sudo service ssh start',
|
|
273
|
+
self._run('sudo service ssh start',
|
|
274
|
+
run_env='docker',
|
|
275
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
276
|
+
flock_args='-s -w 1')
|
|
220
277
|
return self._run('whoami', run_env='docker')
|
|
221
278
|
|
|
222
279
|
# SkyPilot: Docker login if user specified a private docker registry.
|
|
223
280
|
if 'docker_login_config' in self.docker_config:
|
|
224
|
-
# TODO(tian): Maybe support a command to get the login password?
|
|
225
281
|
docker_login_config = DockerLoginConfig(
|
|
226
282
|
**self.docker_config['docker_login_config'])
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
283
|
+
|
|
284
|
+
if docker_login_config.password:
|
|
285
|
+
# Password is allowed to be empty, in that case, we will not run
|
|
286
|
+
# the login command, and assume that the image pulling is
|
|
287
|
+
# authenticated by the IAM permission on the VM.
|
|
288
|
+
self._run(
|
|
289
|
+
f'{self.docker_cmd} login --username '
|
|
290
|
+
f'{shlex.quote(docker_login_config.username)} '
|
|
291
|
+
f'--password {shlex.quote(docker_login_config.password)} '
|
|
292
|
+
f'{shlex.quote(docker_login_config.server)}',
|
|
293
|
+
wait_for_docker_daemon=True)
|
|
294
|
+
elif (docker_login_config.server.endswith('.amazonaws.com') and
|
|
295
|
+
'.dkr.ecr.' in docker_login_config.server):
|
|
296
|
+
# AWS ECR: Use aws ecr get-login-password for authentication
|
|
297
|
+
# ECR format: <account-id>.dkr.ecr.<region>.amazonaws.com
|
|
298
|
+
# This command uses the IAM credentials from the EC2 instance
|
|
299
|
+
# Ref: https://docs.aws.amazon.com/AmazonECR/latest/userguide/registry_auth.html # pylint: disable=line-too-long
|
|
300
|
+
region = _extract_region_from_ecr_server(
|
|
301
|
+
docker_login_config.server)
|
|
302
|
+
|
|
303
|
+
# AWS CLI is not pre-installed on AWS instances, unlike gcloud
|
|
304
|
+
# on GCP instances, so we need to install it first
|
|
305
|
+
self._run(INSTALL_AWS_CLI_CMD, wait_for_docker_daemon=False)
|
|
306
|
+
|
|
307
|
+
self._run(
|
|
308
|
+
f'aws ecr get-login-password --region {region} | '
|
|
309
|
+
f'{self.docker_cmd} login --username AWS '
|
|
310
|
+
f'--password-stdin '
|
|
311
|
+
f'{shlex.quote(docker_login_config.server)}',
|
|
312
|
+
wait_for_docker_daemon=True)
|
|
313
|
+
elif docker_login_config.server.endswith('-docker.pkg.dev'):
|
|
314
|
+
# Docker image server is on GCR, we need to do additional setup
|
|
315
|
+
# to pull the image.
|
|
316
|
+
# When no username or password is provided, we assume that
|
|
317
|
+
# we are on GCP VM (i.e. gcloud auth configure-docker is
|
|
318
|
+
# enough), or the image server is public.
|
|
319
|
+
# For the former case, gcloud should be available, and latter
|
|
320
|
+
# should be fine to fail the following command.
|
|
321
|
+
self._run('gcloud auth configure-docker '
|
|
322
|
+
f'{docker_login_config.server} --quiet || true')
|
|
233
323
|
# We automatically add the server prefix to the image name if
|
|
234
324
|
# the user did not add it.
|
|
235
325
|
specific_image = docker_login_config.format_image(specific_image)
|
|
@@ -271,6 +361,10 @@ class DockerInitializer:
|
|
|
271
361
|
'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
|
|
272
362
|
'sudo systemctl restart docker; } || true')
|
|
273
363
|
user_docker_run_options = self.docker_config.get('run_options', [])
|
|
364
|
+
remove_container_cmd = maybe_remove_container_cmds(
|
|
365
|
+
self.container_name,
|
|
366
|
+
self.docker_cmd,
|
|
367
|
+
)
|
|
274
368
|
start_command = docker_start_cmds(
|
|
275
369
|
specific_image,
|
|
276
370
|
self.container_name,
|
|
@@ -278,7 +372,9 @@ class DockerInitializer:
|
|
|
278
372
|
self._auto_configure_shm(user_docker_run_options)),
|
|
279
373
|
self.docker_cmd,
|
|
280
374
|
)
|
|
281
|
-
self._run(start_command
|
|
375
|
+
self._run(f'{remove_container_cmd} && {start_command}',
|
|
376
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
377
|
+
flock_args='-x -w 10')
|
|
282
378
|
|
|
283
379
|
# SkyPilot: Setup Commands.
|
|
284
380
|
# TODO(zhwu): the following setups should be aligned with the kubernetes
|
|
@@ -296,14 +392,18 @@ class DockerInitializer:
|
|
|
296
392
|
'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;',
|
|
297
393
|
run_env='docker')
|
|
298
394
|
# Install dependencies.
|
|
299
|
-
|
|
300
|
-
'
|
|
395
|
+
cmd = (
|
|
396
|
+
'bash -lc \''
|
|
397
|
+
'exec 200>/var/tmp/sky_apt.lock; '
|
|
398
|
+
'flock -x -w 120 200 || exit 1; '
|
|
399
|
+
'export DEBIAN_FRONTEND=noninteractive; '
|
|
400
|
+
'apt-get -yq update && '
|
|
301
401
|
# Our mount script will install gcsfuse without fuse package.
|
|
302
402
|
# We need to install fuse package first to enable storage mount.
|
|
303
403
|
# The dpkg option is to suppress the prompt for fuse installation.
|
|
304
|
-
'
|
|
305
|
-
'rsync curl wget patch openssh-server python3-pip fuse
|
|
306
|
-
|
|
404
|
+
'apt-get -o DPkg::Options::=--force-confnew install -y '
|
|
405
|
+
'rsync curl wget patch openssh-server python3-pip fuse\'')
|
|
406
|
+
self._run(cmd, run_env='docker')
|
|
307
407
|
|
|
308
408
|
# Copy local authorized_keys to docker container.
|
|
309
409
|
# Stop and disable jupyter service. This is to avoid port conflict on
|
|
@@ -329,13 +429,16 @@ class DockerInitializer:
|
|
|
329
429
|
# `mesg: ttyname failed: inappropriate ioctl for device`.
|
|
330
430
|
# see https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
|
|
331
431
|
port = constants.DEFAULT_DOCKER_PORT
|
|
432
|
+
# In case the port is already configured in the sshd_config file
|
|
433
|
+
# in some images, we delete it first and then append the new one.
|
|
332
434
|
# pylint: disable=anomalous-backslash-in-string
|
|
333
435
|
self._run(
|
|
334
|
-
|
|
436
|
+
'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
|
|
437
|
+
f'echo "Port {port}" | sudo tee -a /etc/ssh/sshd_config > /dev/null;'
|
|
335
438
|
'mkdir -p ~/.ssh;'
|
|
336
439
|
'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
|
|
337
440
|
'sudo service ssh start;'
|
|
338
|
-
'sudo sed -i "s/mesg n/tty -s
|
|
441
|
+
'sudo sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;'
|
|
339
442
|
f'{SETUP_ENV_VARS_CMD}',
|
|
340
443
|
run_env='docker')
|
|
341
444
|
|
|
@@ -376,9 +479,13 @@ class DockerInitializer:
|
|
|
376
479
|
user_pos = string.find('~')
|
|
377
480
|
if user_pos > -1:
|
|
378
481
|
if self.home_dir is None:
|
|
379
|
-
cmd = (f'{self.docker_cmd} exec {self.container_name}
|
|
380
|
-
'printenv HOME')
|
|
381
|
-
self.home_dir = self._run(
|
|
482
|
+
cmd = (f'{self.docker_cmd} exec {self.container_name}'
|
|
483
|
+
' printenv HOME')
|
|
484
|
+
self.home_dir = self._run(
|
|
485
|
+
cmd,
|
|
486
|
+
separate_stderr=True,
|
|
487
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
488
|
+
flock_args='-s -w 1')
|
|
382
489
|
# Check for unexpected newline in home directory, which can be
|
|
383
490
|
# a common issue when the output is mixed with stderr.
|
|
384
491
|
assert '\n' not in self.home_dir, (
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
"""FluidStack instance provisioning."""
|
|
2
2
|
import os
|
|
3
3
|
import time
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
|
-
from sky import authentication as auth
|
|
7
6
|
from sky import exceptions
|
|
8
7
|
from sky import sky_logging
|
|
9
8
|
from sky.provision import common
|
|
10
9
|
from sky.provision.fluidstack import fluidstack_utils as utils
|
|
10
|
+
from sky.utils import auth_utils
|
|
11
11
|
from sky.utils import command_runner
|
|
12
12
|
from sky.utils import common_utils
|
|
13
13
|
from sky.utils import status_lib
|
|
@@ -26,7 +26,8 @@ logger = sky_logging.init_logger(__name__)
|
|
|
26
26
|
|
|
27
27
|
def get_internal_ip(node_info: Dict[str, Any]) -> None:
|
|
28
28
|
node_info['internal_ip'] = node_info['ip_address']
|
|
29
|
-
|
|
29
|
+
|
|
30
|
+
private_key_path, _ = auth_utils.get_or_generate_keys()
|
|
30
31
|
runner = command_runner.SSHCommandRunner(
|
|
31
32
|
(node_info['ip_address'], 22),
|
|
32
33
|
ssh_user='ubuntu',
|
|
@@ -77,10 +78,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
77
78
|
return head_instance_id
|
|
78
79
|
|
|
79
80
|
|
|
80
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
81
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
81
82
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
82
83
|
"""Runs instances for the given cluster."""
|
|
83
|
-
|
|
84
|
+
del cluster_name # unused
|
|
84
85
|
pending_status = ['pending', 'provisioning']
|
|
85
86
|
while True:
|
|
86
87
|
instances = _filter_instances(cluster_name_on_cloud, pending_status)
|
|
@@ -286,11 +287,14 @@ def get_cluster_info(
|
|
|
286
287
|
|
|
287
288
|
|
|
288
289
|
def query_instances(
|
|
290
|
+
cluster_name: str,
|
|
289
291
|
cluster_name_on_cloud: str,
|
|
290
292
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
291
293
|
non_terminated_only: bool = True,
|
|
292
|
-
|
|
294
|
+
retry_if_missing: bool = False,
|
|
295
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
293
296
|
"""See sky/provision/__init__.py"""
|
|
297
|
+
del cluster_name, retry_if_missing # unused
|
|
294
298
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
295
299
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
296
300
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
@@ -301,7 +305,8 @@ def query_instances(
|
|
|
301
305
|
'failed': status_lib.ClusterStatus.INIT,
|
|
302
306
|
'terminated': None,
|
|
303
307
|
}
|
|
304
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
308
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
309
|
+
Optional[str]]] = {}
|
|
305
310
|
for inst_id, inst in instances.items():
|
|
306
311
|
if inst['status'] not in status_map:
|
|
307
312
|
with ux_utils.print_exception_no_traceback():
|
|
@@ -310,7 +315,7 @@ def query_instances(
|
|
|
310
315
|
status = status_map.get(inst['status'], None)
|
|
311
316
|
if non_terminated_only and status is None:
|
|
312
317
|
continue
|
|
313
|
-
statuses[inst_id] = status
|
|
318
|
+
statuses[inst_id] = (status, None)
|
|
314
319
|
return statuses
|
|
315
320
|
|
|
316
321
|
|
sky/provision/gcp/__init__.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""GCP provisioner for SkyPilot."""
|
|
2
2
|
|
|
3
3
|
from sky.provision.gcp.config import bootstrap_instances
|
|
4
|
+
from sky.provision.gcp.instance import cleanup_custom_multi_network
|
|
4
5
|
from sky.provision.gcp.instance import cleanup_ports
|
|
5
6
|
from sky.provision.gcp.instance import get_cluster_info
|
|
6
7
|
from sky.provision.gcp.instance import open_ports
|