skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/backends/docker_utils.py
CHANGED
@@ -168,7 +168,7 @@ def build_dockerimage(task: task_mod.Task,
                           build_dir=temp_dir)
 
     dst = os.path.join(temp_dir, SKY_DOCKER_WORKDIR)
-    if task.workdir is not None:
+    if task.workdir is not None and isinstance(task.workdir, str):
         # Copy workdir contents to tempdir
         shutil.copytree(os.path.expanduser(task.workdir), dst)
     else:
sky/backends/local_docker_backend.py
CHANGED

@@ -178,7 +178,8 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
         return handle, False
 
     def _sync_workdir(self, handle: LocalDockerResourceHandle,
-                      workdir: Path) -> None:
+                      workdir: Union[Path, Dict[str, Any]],
+                      envs_and_secrets: Dict[str, str]) -> None:
         """Workdir is sync'd by adding to the docker image.
 
         This happens in the execute step.
@@ -188,6 +189,15 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
             ' a NoOp. If you are running sky exec, your workdir has not'
             ' been updated.')
 
+    def _download_file(self, handle: LocalDockerResourceHandle,
+                       local_file_path: str, remote_file_path: str) -> None:
+        """Syncs file from remote to local."""
+        # Copy from docker container to local
+        container = self.containers[handle]
+        copy_cmd = (
+            f'docker cp {container.name}:{remote_file_path} {local_file_path}')
+        subprocess.run(copy_cmd, shell=True, check=True)
+
     def _sync_file_mounts(
         self,
         handle: LocalDockerResourceHandle,
@@ -273,13 +283,8 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
     def _execute(self,
                  handle: LocalDockerResourceHandle,
                  task: 'task_lib.Task',
-                 detach_run: bool,
                  dryrun: bool = False) -> None:
         """ Launches the container."""
-        if detach_run:
-            raise NotImplementedError('detach_run=True is not supported in '
-                                      'LocalDockerBackend.')
-
         if task.num_nodes > 1:
             raise NotImplementedError(
                 'Tasks with num_nodes > 1 is currently not supported in '
sky/backends/task_codegen.py
ADDED

@@ -0,0 +1,633 @@
+"""Code generator for task execution."""
+
+import copy
+import inspect
+import json
+import math
+import textwrap
+from typing import Dict, List, Optional, Tuple
+
+import colorama
+
+from sky import sky_logging
+from sky.skylet import constants
+from sky.skylet import log_lib
+from sky.utils import accelerator_registry
+from sky.utils import ux_utils
+
+# Unset RAY_RAYLET_PID to prevent the Ray cluster in the SkyPilot runtime
+# from interfering with the Ray cluster in the user's task (if any).
+UNSET_RAY_ENV_VARS = ['RAY_RAYLET_PID']
+
+logger = sky_logging.init_logger(__name__)
+
+
+class TaskCodeGen:
+    """Base code generator for task execution on Ray and Slurm."""
+
+    def __init__(self) -> None:
+        # Code generated so far, to be joined via '\n'.
+        self._code: List[str] = []
+        # Guard method calling order.
+        self._has_prologue: bool = False
+        self._has_epilogue: bool = False
+        self._has_setup: bool = False
+        # Job ID is used to identify the job (also this generated code).
+        self.job_id: Optional[int] = None
+
+    def _add_common_imports(self) -> None:
+        """Add common imports for both Ray and Slurm execution."""
+        self._code.append(
+            textwrap.dedent("""\
+                import functools
+                import getpass
+                import hashlib
+                import io
+                import os
+                import pathlib
+                import selectors
+                import shlex
+                import subprocess
+                import sys
+                import tempfile
+                import textwrap
+                import time
+                from typing import Dict, List, Optional, Tuple, Union
+                """))
+
+    def _add_skylet_imports(self) -> None:
+        """Add SkyPilot skylet imports."""
+        self._code.append(
+            textwrap.dedent("""\
+                from sky.skylet import autostop_lib
+                from sky.skylet import constants
+                from sky.skylet import job_lib
+                from sky.utils import log_utils
+                from sky.utils import subprocess_utils
+                """))
+
+    def _add_logging_functions(self) -> None:
+        """Add log streaming functions from log_lib."""
+        self._code += [
+            # FIXME: This is a hack to make sure that the functions can be found
+            # by ray.remote. This should be removed once we have a better way to
+            # specify dependencies for ray.
+            inspect.getsource(log_lib._ProcessingArgs),  # pylint: disable=protected-access
+            inspect.getsource(log_lib._get_context),  # pylint: disable=protected-access
+            inspect.getsource(log_lib._handle_io_stream),  # pylint: disable=protected-access
+            inspect.getsource(log_lib.process_subprocess_stream),
+            inspect.getsource(log_lib.run_with_log),
+            inspect.getsource(log_lib.make_task_bash_script),
+            inspect.getsource(log_lib.add_ray_env_vars),
+            inspect.getsource(log_lib.run_bash_command_with_log),
+            inspect.getsource(log_lib.run_bash_command_with_log_and_return_pid),
+        ]
+
+    def _add_waiting_for_resources_msg(self, num_nodes: int) -> None:
+        self._code.append(
+            textwrap.dedent(f"""\
+                plural = 's' if {num_nodes} > 1 else ''
+                node_str = f'{num_nodes} node{{plural}}'
+                message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
+                           'Waiting for task resources on '
+                           f'{{node_str}}.{colorama.Style.RESET_ALL}')
+                print(message, flush=True)"""))
+
+    def _get_job_started_msg(self) -> str:
+        """Returns the 'Job started' streaming message with ANSI formatting."""
+        return (
+            f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
+            f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
+            f'be killed){colorama.Style.RESET_ALL}')
+
+    def _add_job_started_msg(self) -> None:
+        streaming_message = self._get_job_started_msg()
+        self._code.append(f'print({streaming_message!r}, flush=True)')
+
+    def _get_accelerator_details(
+        self,
+        resources_dict: Dict[str, float],
+    ) -> Tuple[Optional[str], float]:
+        resources_copy = resources_dict.copy()
+        resources_copy.pop('CPU', None)
+
+        if not resources_copy:
+            return None, 0.0
+
+        assert len(resources_copy) == 1, (
+            'There can only be one type of accelerator per instance. '
+            f'Found: {resources_copy}.')
+
+        acc_name, acc_count = list(resources_copy.items())[0]
+        return acc_name, float(acc_count)
+
+    def _add_constants(self) -> None:
+        self._code.append(
+            textwrap.dedent(f"""\
+                SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
+
+                CANCELLED_RETURN_CODE = 137
+                """))
+
+    def _get_rclone_flush_script(self) -> str:
+        """Generate rclone flush script for cached storage mounts.
+
+        This script blocks job completion until all storage mounted with
+        CACHED_MOUNT mode is uploaded to remote.
+
+        Returns:
+            Bash script as string
+        """
+        return textwrap.dedent(f"""\
+
+            # Only waits if cached mount is enabled (RCLONE_MOUNT_CACHED_LOG_DIR is not empty)
+            # findmnt alone is not enough, as some clouds (e.g. AWS on ARM64) uses
+            # rclone for normal mounts as well.
+            if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
+                [ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
+                [ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
+                flushed=0
+                # extra second on top of --vfs-cache-poll-interval to
+                # avoid race condition between rclone log line creation and this check.
+                sleep 1
+                while [ $flushed -eq 0 ]; do
+                    # sleep for the same interval as --vfs-cache-poll-interval
+                    sleep {constants.RCLONE_CACHE_REFRESH_INTERVAL}
+                    flushed=1
+                    for file in {constants.RCLONE_MOUNT_CACHED_LOG_DIR}/*; do
+                        exitcode=0
+                        tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
+                        if [ $exitcode -ne 0 ]; then
+                            echo "skypilot: cached mount is still uploading to remote"
+                            flushed=0
+                            break
+                        fi
+                    done
+                done
+                echo "skypilot: cached mount uploaded complete"
+            fi""")
+
+    def add_prologue(self, job_id: int) -> None:
+        """Initialize code generator and add prologue code.
+
+        Args:
+            job_id: SkyPilot internal job ID
+        """
+        raise NotImplementedError
+
+    def add_setup(
+        self,
+        num_nodes: int,
+        resources_dict: Dict[str, float],
+        stable_cluster_internal_ips: List[str],
+        env_vars: Dict[str, str],
+        setup_cmd: Optional[str] = None,
+        setup_log_path: Optional[str] = None,
+    ) -> None:
+        """Generates code to set up the task on each node.
+
+        stable_cluster_internal_ips is used to ensure that the
+        SKYPILOT_NODE_RANK environment variable is assigned in a
+        deterministic order whenever a new task is added.
+        """
+        raise NotImplementedError
+
+    def add_task(
+        self,
+        num_nodes: int,
+        bash_script: Optional[str],
+        task_name: Optional[str],
+        resources_dict: Dict[str, float],
+        log_dir: str,
+        env_vars: Optional[Dict[str, str]] = None,
+    ) -> None:
+        """Generates code to run the bash command on all num_nodes nodes."""
+        raise NotImplementedError
+
+    def add_epilogue(self) -> None:
+        """Generate code that checks return codes and updates job status."""
+        assert self._has_prologue, 'Call add_prologue() before add_epilogue().'
+        assert not self._has_epilogue, 'add_epilogue() called twice?'
+        self._has_epilogue = True
+
+        self._code += [
+            textwrap.dedent(f"""\
+                if sum(returncodes) != 0:
+                    job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
+                    # Schedule the next pending job immediately to make the job
+                    # scheduling more efficient.
+                    job_lib.scheduler.schedule_step()
+                    # This waits for all streaming logs to finish.
+                    time.sleep(0.5)
+                    reason = ''
+                    # 139 is the return code of SIGSEGV, i.e. Segmentation Fault.
+                    if any(r == 139 for r in returncodes):
+                        reason = '(likely due to Segmentation Fault)'
+                    if any(r == 137 for r in returncodes):
+                        # Find the first non-137 return code
+                        non_137 = next(r for r in returncodes if r != 137)
+                        reason = f'(A Worker failed with return code {{non_137}}, SkyPilot cleaned up the processes on other nodes with return code 137)'
+                    print('ERROR: {colorama.Fore.RED}Job {self.job_id} failed with '
+                          'return code list:{colorama.Style.RESET_ALL}',
+                          returncodes,
+                          reason,
+                          flush=True)
+                    # Need this to set the job status in ray job to be FAILED.
+                    sys.exit(1)
+                else:
+                    job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SUCCEEDED)
+                    # Schedule the next pending job immediately to make the job
+                    # scheduling more efficient.
+                    job_lib.scheduler.schedule_step()
+                    # This waits for all streaming logs to finish.
+                    time.sleep(0.5)
+                """)
+        ]
+
+    def build(self) -> str:
+        """Returns the entire generated program."""
+        assert self._has_epilogue, 'Call add_epilogue() before build().'
+        return '\n'.join(self._code)
+
+
+class RayCodeGen(TaskCodeGen):
+    """Code generator of a Ray program that executes a sky.Task.
+
+    Usage:
+
+      >> codegen = RayCodegen()
+      >> codegen.add_prologue()
+
+      >> codegen.add_task(...)
+      >> codegen.add_task(...)
+
+      >> codegen.add_epilogue()
+      >> code = codegen.build()
+    """
+
+    def add_prologue(self, job_id: int) -> None:
+        assert not self._has_prologue, 'add_prologue() called twice?'
+        self._has_prologue = True
+        self.job_id = job_id
+        # Should use 'auto' or 'ray://<internal_head_ip>:10001' rather than
+        # 'ray://localhost:10001', or 'ray://127.0.0.1:10001', for public cloud.
+        # Otherwise, ray will fail to get the placement group because of a bug
+        # in ray job.
+        ray_address = 'auto'
+
+        # Add common imports
+        self._add_common_imports()
+
+        # Add Ray-specific setup
+        self._code.append(
+            textwrap.dedent("""\
+                # Set the environment variables to avoid deduplicating logs and
+                # scheduler events. This should be set in driver code, since we are
+                # not using `ray job submit` anymore, and the environment variables
+                # from the ray cluster is not inherited.
+                os.environ['RAY_DEDUP_LOGS'] = '0'
+                os.environ['RAY_SCHEDULER_EVENTS'] = '0'
+
+                import ray
+                import ray.util as ray_util
+                """))
+
+        self._add_skylet_imports()
+
+        self._add_constants()
+
+        # Add Ray configuration
+        self._code.append(
+            textwrap.dedent(f"""\
+                kwargs = dict()
+                # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
+                # the directory exists for backward compatibility for the VM
+                # launched before #1790.
+                if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}):
+                    kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r}
+                ray.init(
+                    address={ray_address!r},
+                    namespace='__sky__{job_id}__',
+                    log_to_driver=True,
+                    **kwargs
+                )
+                def get_or_fail(futures, pg) -> List[int]:
+                    \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
+                    if not futures:
+                        return [], []
+                    returncodes = [1] * len(futures)
+                    pids = [None] * len(futures)
+                    failed = False
+                    # Wait for 1 task to be ready.
+                    ready = []
+                    # Keep invoking ray.wait if ready is empty. This is because
+                    # ray.wait with timeout=None will only wait for 10**6 seconds,
+                    # which will cause tasks running for more than 12 days to return
+                    # before becoming ready.
+                    # (Such tasks are common in serving jobs.)
+                    # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
+
+                    def handle_ready_tasks(tasks: List[ray.ObjectRef]) -> None:
+                        nonlocal returncodes, pids, failed
+                        for task in tasks:
+                            idx = futures.index(task)
+                            res = ray.get(task)
+                            returncodes[idx] = res['return_code']
+                            pids[idx] = res['pid']
+                            if res['return_code'] != 0:
+                                failed = True
+
+                    while not ready:
+                        ready, unready = ray.wait(futures)
+                        handle_ready_tasks(ready)
+                    while unready:
+                        if failed:
+                            for task in unready:
+                                # ray.cancel without force fails to kill tasks.
+                                # We use force=True to kill unready tasks.
+                                ray.cancel(task, force=True)
+                                # Use SIGKILL=128+9 to indicate the task is forcely
+                                # killed.
+                                idx = futures.index(task)
+                                returncodes[idx] = CANCELLED_RETURN_CODE
+                            break
+                        ready, unready = ray.wait(unready)
+                        handle_ready_tasks(ready)
+                    # Remove the placement group after all tasks are done, so that
+                    # the next job can be scheduled on the released resources
+                    # immediately.
+                    ray_util.remove_placement_group(pg)
+                    sys.stdout.flush()
+                    return returncodes, pids
+
+                futures = []
+                """))
+
+        self._add_logging_functions()
+
+        self._code += [
+            'run_bash_command_with_log = run_bash_command_with_log',
+            'run_bash_command_with_log_and_return_pid = \
+                ray.remote(run_bash_command_with_log_and_return_pid)',
+            'autostop_lib.set_last_active_time_to_now()',
+            f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
+        ]
+
+    def add_setup(
+        self,
+        num_nodes: int,
+        resources_dict: Dict[str, float],
+        stable_cluster_internal_ips: List[str],
+        env_vars: Dict[str, str],
+        setup_cmd: Optional[str] = None,
+        setup_log_path: Optional[str] = None,
+    ) -> None:
+        assert self._has_prologue, ('Call add_prologue() before '
+                                    'add_setup().')
+        self._has_setup = True
+
+        bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
+        # Set CPU to avoid ray hanging the resources allocation
+        # for remote functions, since the task will request 1 CPU
+        # by default.
+        task_cpu_demand = resources_dict.pop('CPU')
+
+        if resources_dict:
+            assert len(resources_dict) == 1, (
+                'There can only be one type of accelerator per instance. '
+                f'Found: {resources_dict}.')
+            acc_name, acc_count = list(resources_dict.items())[0]
+            gpu_dict = {'GPU': acc_count}
+            # gpu_dict should be empty when the accelerator is not GPU.
+            # TODO(zongheng,zhanghao): an alternative is to start the remote
+            # cluster with custom resource 'GPU': <n> even if the accelerator(s)
+            # are not GPU. We opt for the current solution for now.
+            if accelerator_registry.is_schedulable_non_gpu_accelerator(
+                    acc_name):
+                gpu_dict = {}
+            for bundle in bundles:
+                bundle.update({
+                    # Set the GPU to avoid ray hanging the resources allocation
+                    **gpu_dict,
+                })
+
+        self._code.append(
+            f'pg = ray_util.placement_group({json.dumps(bundles)}, '
+            f'\'STRICT_SPREAD\')')
+        self._add_waiting_for_resources_msg(num_nodes)
+        self._code.append(
+            textwrap.dedent("""\
+                # FIXME: This will print the error message from autoscaler if
+                # it is waiting for other task to finish. We should hide the
+                # error message.
+                ray.get(pg.ready())"""))
+        self._add_job_started_msg()
+
+        job_id = self.job_id
+        if setup_cmd is not None:
+            setup_envs = env_vars.copy()
+            setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
+            self._code += [
+                textwrap.dedent(f"""\
+                    setup_cmd = {setup_cmd!r}
+                    _SETUP_CPUS = 0.0001
+                    # The setup command will be run as a ray task with num_cpus=_SETUP_CPUS as the
+                    # requirement; this means Ray will set CUDA_VISIBLE_DEVICES to an empty string.
+                    # We unset it so that user setup command may properly use this env var.
+                    setup_cmd = 'unset CUDA_VISIBLE_DEVICES; ' + setup_cmd
+                    job_lib.set_status({job_id!r}, job_lib.JobStatus.SETTING_UP)
+
+                    # The schedule_step should be called after the job status is set to non-PENDING,
+                    # otherwise, the scheduler will think the current job is not submitted yet, and
+                    # skip the scheduling step.
+                    job_lib.scheduler.schedule_step()
+
+                    # If some nodes are down and then new nodes are added after launching again,
+                    # the result of `ray.nodes()` will include all the nodes, so we need to get
+                    # the alive nodes.
+                    alive_nodes = [n for n in ray.nodes() if 'Alive' in n and n['Alive']]
+                    total_num_nodes = len(alive_nodes)
+                    setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
+                    setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
+                    setup_workers = [run_bash_command_with_log_and_return_pid \\
+                        .options(
+                            name='setup',
+                            num_cpus=_SETUP_CPUS,
+                            scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
+                                placement_group=setup_pg,
+                                placement_group_bundle_index=i)
+                        ) \\
+                        .remote(
+                            setup_cmd,
+                            os.path.expanduser({setup_log_path!r}),
+                            env_vars={setup_envs!r},
+                            stream_logs=True,
+                            with_ray=True,
+                        ) for i in range(total_num_nodes)]
+                    setup_returncodes, setup_pids = get_or_fail(setup_workers, setup_pg)
+                    success = True
+                    failed_workers_and_returncodes = []
+                    for i in range(len(setup_returncodes)):
+                        returncode = setup_returncodes[i]
+                        pid = setup_pids[i]
+                        if pid == None:
+                            pid = os.getpid()
+                        if returncode != 0 and returncode != CANCELLED_RETURN_CODE:
+                            success = False
+                            failed_workers_and_returncodes.append((pid, returncode))
+                    if not success:
+                        msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed. '
+                        msg += f'Failed workers: ' + ', '.join([f'(pid={{pid}}, returncode={{returncode}})' for pid, returncode in failed_workers_and_returncodes])
+                        msg += f'. See error logs above for more details.{colorama.Style.RESET_ALL}'
+                        print(msg, flush=True)
+                        job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+                        # This waits for all streaming logs to finish.
+                        time.sleep(1)
+                        # Need this to set the job status in ray job to be FAILED.
+                        sys.exit(1)
+                    """)
+            ]
+
+        self._code.append(f'job_lib.set_job_started({self.job_id!r})')
+        if setup_cmd is None:
+            # Need to call schedule_step() to make sure the scheduler
+            # schedule the next pending job.
+            self._code.append('job_lib.scheduler.schedule_step()')
+
+        # Export IP and node rank to the environment variables.
+        self._code += [
+            textwrap.dedent(f"""\
+                @ray.remote
+                def check_ip():
+                    return ray.util.get_node_ip_address()
+                gang_scheduling_id_to_ip = ray.get([
+                    check_ip.options(
+                        num_cpus={task_cpu_demand},
+                        scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
+                            placement_group=pg,
+                            placement_group_bundle_index=i
+                        )).remote()
+                    for i in range(pg.bundle_count)
+                ])
+
+                cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
+                job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
+                job_ip_rank_map = {{ip: i for i, ip in enumerate(job_ip_rank_list)}}
+                job_ip_list_str = '\\n'.join(job_ip_rank_list)
+                """),
+        ]
+
+    def add_task(self,
+                 num_nodes: int,
+                 bash_script: Optional[str],
+                 task_name: Optional[str],
+                 resources_dict: Dict[str, float],
+                 log_dir: str,
+                 env_vars: Optional[Dict[str, str]] = None) -> None:
+        # TODO(zhwu): The resources limitation for multi-node ray.tune and
+        # horovod should be considered.
+        for i in range(num_nodes):
+            # Ray's per-node resources, to constrain scheduling each command to
+            # the corresponding node, represented by private IPs.
+            self._add_ray_task(bash_script=bash_script,
+                               task_name=task_name,
+                               resources_dict=resources_dict.copy(),
+                               log_dir=log_dir,
+                               env_vars=env_vars,
+                               gang_scheduling_id=i)
+
+    def _add_ray_task(self,
+                      bash_script: Optional[str],
+                      task_name: Optional[str],
+                      resources_dict: Dict[str, float],
+                      log_dir: str,
+                      env_vars: Optional[Dict[str, str]] = None,
+                      gang_scheduling_id: int = 0) -> None:
+        """Generates code for a ray remote task that runs a bash command."""
+        assert self._has_setup, 'Call add_setup() before add_task().'
+
+        task_cpu_demand = resources_dict.pop('CPU')
+        # Build remote_task.options(...)
+        #   resources=...
+        #   num_gpus=...
+        options = []
+        options.append(f'num_cpus={task_cpu_demand}')
+
+        acc_name, acc_count = self._get_accelerator_details(resources_dict)
+        num_gpus = 0.0
+        if acc_name is not None:
+            assert resources_dict, ('There can only be one type of accelerator '
+                                    'per instance.')
+            options.append(f'resources={json.dumps(resources_dict)}')
+            if not accelerator_registry.is_schedulable_non_gpu_accelerator(
+                    acc_name):
+                num_gpus = acc_count
+                options.append(f'num_gpus={num_gpus}')
+        options.append(
+            'scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy('  # pylint: disable=line-too-long
+            'placement_group=pg, '
+            f'placement_group_bundle_index={gang_scheduling_id})')
+
+        sky_env_vars_dict_str = [
+            textwrap.dedent(f"""\
+                sky_env_vars_dict = {{}}
+                sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str
+                sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list)
+                """)
+        ]
+
+        if env_vars is not None:
+            sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
+                                         for k, v in env_vars.items())
+        sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
+
+        options_str = ', '.join(options)
+        logger.debug('Added Task with options: '
+                     f'{options_str}')
+        rclone_flush_script = self._get_rclone_flush_script()
+        unset_ray_env_vars = ' && '.join(
+            [f'unset {var}' for var in UNSET_RAY_ENV_VARS])
+        self._code += [
+            sky_env_vars_dict_str,
+            textwrap.dedent(f"""\
+                script = {bash_script!r}
+                rclone_flush_script = {rclone_flush_script!r}
+
+                if script is not None:
+                    script=f'{unset_ray_env_vars}; {{script}}'
+                    script += rclone_flush_script
+                    sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
+
+                    ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}]
+                    rank = job_ip_rank_map[ip]
+
+                    if len(cluster_ips_to_node_id) == 1: # Single-node task on single-node cluter
+                        name_str = '{task_name},' if {task_name!r} != None else 'task,'
+                        log_path = os.path.expanduser(os.path.join({log_dir!r}, 'run.log'))
+                    else: # Single-node or multi-node task on multi-node cluster
+                        idx_in_cluster = cluster_ips_to_node_id[ip]
+                        if cluster_ips_to_node_id[ip] == 0:
+                            node_name = 'head'
+                        else:
+                            node_name = f'worker{{idx_in_cluster}}'
+                        name_str = f'{{node_name}}, rank={{rank}},'
+                        log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log'))
+                    sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank
+
+                    sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
+
+                    futures.append(run_bash_command_with_log_and_return_pid \\
+                        .options(name=name_str, {options_str}) \\
+                        .remote(
+                            script,
+                            log_path,
+                            env_vars=sky_env_vars_dict,
+                            stream_logs=True,
+                            with_ray=True,
+                        ))""")
+        ]
+
+    def add_epilogue(self) -> None:
+        """Generates code that waits for all tasks, then exits."""
+        self._code.append('returncodes, _ = get_or_fail(futures, pg)')
+        super().add_epilogue()