skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/skylet/constants.py
CHANGED
@@ -1,5 +1,4 @@
 """Constants for SkyPilot."""
-import os
 from typing import List, Tuple
 
 from packaging import version
@@ -7,8 +6,26 @@ from packaging import version
 import sky
 from sky.setup_files import dependencies
 
+# The base directory for all SkyPilot runtime artifacts.
+# Historically, we have always used $HOME, but we couldn't
+# do that for Slurm, because $HOME typically points to a NFS
+# mounted directory, which does not work well with SQLite.
+# https://sqlite.org/faq.html#q5
+# Additionally, having the skypilot-runtime python venv be
+# on an NFS makes things very slow.
+SKY_RUNTIME_DIR = '${SKY_RUNTIME_DIR:-$HOME}'
+# Same as above but for use within python code instead of shell commands.
+# Example usage:
+# os.path.join(
+#     os.path.expanduser(os.environ.get(SKY_RUNTIME_DIR_ENV_VAR_KEY, '~')),
+#     '.sky/jobs.db')
+SKY_RUNTIME_DIR_ENV_VAR_KEY = 'SKY_RUNTIME_DIR'
+# We keep sky_logs and sky_workdir in $HOME, because
+# these are artifacts that users can access, and having
+# them be in $HOME makes it more convenient.
 SKY_LOGS_DIRECTORY = '~/sky_logs'
 SKY_REMOTE_WORKDIR = '~/sky_workdir'
+SKY_TEMPLATES_DIRECTORY = '~/sky_templates'
 SKY_IGNORE_FILE = '.skyignore'
 GIT_IGNORE_FILE = '.gitignore'
 
@@ -25,22 +42,23 @@ SKY_REMOTE_RAY_PORT_DICT_STR = (
     f'"ray_dashboard_port":{SKY_REMOTE_RAY_DASHBOARD_PORT}}}')
 # The file contains the ports of the Ray cluster that SkyPilot launched,
 # i.e. the PORT_DICT_STR above.
-SKY_REMOTE_RAY_PORT_FILE = '
+SKY_REMOTE_RAY_PORT_FILE = '.sky/ray_port.json'
 SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
 SKY_REMOTE_RAY_VERSION = '2.9.3'
 
+SKY_UNSET_PYTHONPATH = 'env -u PYTHONPATH'
 # We store the absolute path of the python executable (/opt/conda/bin/python3)
 # in this file, so that any future internal commands that need to use python
 # can use this path. This is useful for the case where the user has a custom
 # conda environment as a default environment, which is not the same as the one
 # used for installing SkyPilot runtime (ray and skypilot).
-SKY_PYTHON_PATH_FILE = '
-SKY_RAY_PATH_FILE = '
+SKY_PYTHON_PATH_FILE = f'{SKY_RUNTIME_DIR}/.sky/python_path'
+SKY_RAY_PATH_FILE = f'{SKY_RUNTIME_DIR}/.sky/ray_path'
 SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
                            f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
                            'which python3')
 # Python executable, e.g., /opt/conda/bin/python3
-SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
+SKY_PYTHON_CMD = f'{SKY_UNSET_PYTHONPATH} $({SKY_GET_PYTHON_PATH_CMD})'
 # Prefer SKY_UV_PIP_CMD, which is faster.
 # TODO(cooperc): remove remaining usage (GCP TPU setup).
 SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
@@ -50,23 +68,44 @@ SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
 # #!/opt/conda/bin/python3
 SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
                f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')
+
+# Use $(which env) to find env, falling back to /usr/bin/env if which is
+# unavailable. This works around a Slurm quirk where srun's execvp() doesn't
+# check execute permissions, failing when $HOME/.local/bin/env (non-executable,
+# from uv installation) shadows /usr/bin/env.
+SKY_SLURM_UNSET_PYTHONPATH = ('$(which env 2>/dev/null || echo /usr/bin/env) '
+                              '-u PYTHONPATH')
+SKY_SLURM_PYTHON_CMD = (f'{SKY_SLURM_UNSET_PYTHONPATH} '
+                        f'$({SKY_GET_PYTHON_PATH_CMD})')
+
 # Separate env for SkyPilot runtime dependencies.
 SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
-SKY_REMOTE_PYTHON_ENV: str = f'
+SKY_REMOTE_PYTHON_ENV: str = f'{SKY_RUNTIME_DIR}/{SKY_REMOTE_PYTHON_ENV_NAME}'
 ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
+# Place the conda root in the runtime directory, as installing to $HOME
+# on an NFS takes too long (1-2m slower).
+SKY_CONDA_ROOT = f'{SKY_RUNTIME_DIR}/miniconda3'
 # uv is used for venv and pip, much faster than python implementations.
 SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
-
+# set UV_SYSTEM_PYTHON to false in case the
+# user provided docker image set it to true.
+# unset PYTHONPATH in case the user provided docker image set it.
+SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
+              f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
 # This won't reinstall uv if it's already installed, so it's safe to re-run.
 SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
                       'curl -LsSf https://astral.sh/uv/install.sh '
                       f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
 SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
-
-
+SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run '
+                       '--no-project --no-config')
+# Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
+# VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
+# not work when conda is used.
 DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
     'export PATH='
-    f'$(echo $PATH | sed "s|$(echo 
+    f'$(echo $PATH | sed "s|$(echo {SKY_REMOTE_PYTHON_ENV})/bin:||") && '
+    'unset VIRTUAL_ENV && unset VIRTUAL_ENV_PROMPT')
 
 # Prefix for SkyPilot environment variables
 SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
@@ -91,14 +130,17 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '
+SKYLET_VERSION = '27'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
 SKYLET_LIB_VERSION = 4
-SKYLET_VERSION_FILE = '
+SKYLET_VERSION_FILE = '.sky/skylet_version'
+SKYLET_LOG_FILE = '.sky/skylet.log'
+SKYLET_PID_FILE = '.sky/skylet_pid'
+SKYLET_PORT_FILE = '.sky/skylet_port'
 SKYLET_GRPC_PORT = 46590
-SKYLET_GRPC_TIMEOUT_SECONDS = 
+SKYLET_GRPC_TIMEOUT_SECONDS = 10
 
 # Docker default options
 DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
@@ -134,6 +176,10 @@ DISABLE_GPU_ECC_COMMAND = (
     '{ sudo reboot || echo "Failed to reboot. ECC mode may not be disabled"; } '
     '|| true; ')
 
+SETUP_SKY_DIRS_COMMANDS = (f'mkdir -p ~/sky_workdir && '
+                           f'mkdir -p ~/.sky/sky_app && '
+                           f'mkdir -p {SKY_RUNTIME_DIR}/.sky;')
+
 # Install conda on the remote cluster if it is not already installed.
 # We use conda with python 3.10 to be consistent across multiple clouds with
 # best effort.
@@ -150,8 +196,9 @@ CONDA_INSTALLATION_COMMANDS = (
     # because for some images, conda is already installed, but not initialized.
     # In this case, we need to initialize conda and set auto_activate_base to
    # true.
-    '{ 
-    '
+    '{ '
+    f'bash Miniconda3-Linux.sh -b -p "{SKY_CONDA_ROOT}" || true; '
+    f'eval "$({SKY_CONDA_ROOT}/bin/conda shell.bash hook)" && conda init && '
     # Caller should replace {conda_auto_activate} with either true or false.
     'conda config --set auto_activate_base {conda_auto_activate} && '
     'conda activate base; }; '
@@ -172,7 +219,7 @@ CONDA_INSTALLATION_COMMANDS = (
     'fi;'
     # Install uv for venv management and pip installation.
     f'{SKY_UV_INSTALL_CMD};'
-    # Create a separate 
+    # Create a separate python environment for SkyPilot dependencies.
     f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
     # Do NOT use --system-site-packages here, because if users upgrade any
     # packages in the base env, they interfere with skypilot dependencies.
@@ -194,7 +241,7 @@ _sky_version = str(version.parse(sky.__version__))
 RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status'
 RAY_INSTALLATION_COMMANDS = (
     f'{SKY_UV_INSTALL_CMD};'
-    '
+    f'{SETUP_SKY_DIRS_COMMANDS}'
     # Print the PATH in provision.log to help debug PATH issues.
     'echo PATH=$PATH; '
     # Install setuptools<=69.5.1 to avoid the issue with the latest setuptools
@@ -217,7 +264,9 @@ RAY_INSTALLATION_COMMANDS = (
     f'{SKY_UV_PIP_CMD} list | grep "ray " | '
     f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
     f'|| {RAY_STATUS} || '
-
+    # The pydantic-core==2.41.3 for arm seems corrupted
+    # so we need to avoid that specific version.
+    f'{SKY_UV_PIP_CMD} install -U "ray[default]=={SKY_REMOTE_RAY_VERSION}" "pydantic-core==2.41.1"; '  # pylint: disable=line-too-long
     # In some envs, e.g. pip does not have permission to write under /opt/conda
     # ray package will be installed under ~/.local/bin. If the user's PATH does
     # not include ~/.local/bin (the pip install will have the output: `WARNING:
@@ -226,12 +275,32 @@ RAY_INSTALLATION_COMMANDS = (
     #
     # Here, we add ~/.local/bin to the end of the PATH to make sure the issues
     # mentioned above are resolved.
-    'export PATH=$PATH
+    f'export PATH=$PATH:{SKY_RUNTIME_DIR}/.local/bin; '
     # Writes ray path to file if it does not exist or the file is empty.
     f'[ -s {SKY_RAY_PATH_FILE} ] || '
-    f'{{ {
+    f'{{ {SKY_UV_RUN_CMD} '
     f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
 
+# Copy SkyPilot templates from the installed wheel to ~/sky_templates.
+# This must run after the skypilot wheel is installed.
+# Note: We remove ~/sky_templates first to avoid import conflicts where Python
+# would import from ~/sky_templates instead of site-packages (because
+# sky_templates itself is a package), leading to src == dst error when
+# launching on an existing cluster.
+COPY_SKYPILOT_TEMPLATES_COMMANDS = (
+    f'rm -rf {SKY_TEMPLATES_DIRECTORY}; '
+    f'{ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
+    f'{SKY_PYTHON_CMD} -c \''
+    'import sky_templates, shutil, os; '
+    'src = os.path.dirname(sky_templates.__file__); '
+    f'dst = os.path.expanduser(\"{SKY_TEMPLATES_DIRECTORY}\"); '
+    'print(f\"Copying templates from {src} to {dst}...\"); '
+    'shutil.copytree(src, dst); '
+    'print(f\"Templates copied successfully\")\'; '
+    # Make scripts executable.
+    f'find {SKY_TEMPLATES_DIRECTORY} -type f ! -name "*.py" ! -name "*.md" '
+    '-exec chmod +x {} + ; ')
+
 SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
     f'{SKY_UV_INSTALL_CMD};'
     f'{{ {SKY_UV_PIP_CMD} list | grep "skypilot " && '
@@ -322,6 +391,14 @@ FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.sky/tmp/'
 # controller_utils.translate_local_file_mounts_to_two_hop().
 FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.sky/tmp/controller'
 
+# For passing in CPU and memory limits to the controller pod when running
+# in k8s. Right now, we only use this for the jobs controller, but we may
+# use this for the serve controller as well in the future.
+# These files are written to disk by the skylet, who reads it from env vars
+# passed by the backend when starting the skylet (start_skylet_on_head_node).
+CONTROLLER_K8S_CPU_FILE = '~/.sky/_internal_k8s_pod_cpu'
+CONTROLLER_K8S_MEMORY_FILE = '~/.sky/_internal_k8s_pod_memory'
+
 # Used when an managed jobs are created and
 # files are synced up to the cloud.
 FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
@@ -353,6 +430,8 @@ SERVICE_ACCOUNT_TOKEN_ENV_VAR = (
 # SkyPilot environment variables
 SKYPILOT_NUM_NODES = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_NODES'
 SKYPILOT_NODE_IPS = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_IPS'
+SKYPILOT_SETUP_NUM_GPUS_PER_NODE = (
+    f'{SKYPILOT_ENV_VAR_PREFIX}SETUP_NUM_GPUS_PER_NODE')
 SKYPILOT_NUM_GPUS_PER_NODE = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_GPUS_PER_NODE'
 SKYPILOT_NODE_RANK = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_RANK'
 
@@ -371,7 +450,9 @@ RCLONE_CACHE_REFRESH_INTERVAL = 10
 OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
     ('docker', 'run_options'),
     ('nvidia_gpus', 'disable_ecc'),
+    ('ssh', 'custom_metadata'),
     ('ssh', 'pod_config'),
+    ('ssh', 'provision_timeout'),
     ('kubernetes', 'custom_metadata'),
     ('kubernetes', 'pod_config'),
     ('kubernetes', 'provision_timeout'),
@@ -381,13 +462,32 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
     ('gcp', 'enable_gvnic'),
     ('gcp', 'enable_gpu_direct'),
     ('gcp', 'placement_policy'),
+    ('vast', 'secure_only'),
+    ('active_workspace',),
 ]
 # When overriding the SkyPilot configs on the API server with the client one,
 # we skip the following keys because they are meant to be client-side configs.
-
-
-
-
+# Also, we skip the consolidation mode config as those should be only set on
+# the API server side.
+SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
+    ('api_server',),
+    ('allowed_clouds',),
+    ('workspaces',),
+    ('db',),
+    ('daemons',),
+    # TODO(kevin,tian): Override the whole controller config once our test
+    # infrastructure supports setting dynamic server side configs.
+    # Tests that are affected:
+    # - test_managed_jobs_ha_kill_starting
+    # - test_managed_jobs_ha_kill_running
+    # - all tests that use LOW_CONTROLLER_RESOURCE_ENV or
+    #   LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG (won't cause test failure,
+    #   but the configs won't be applied)
+    ('jobs', 'controller', 'consolidation_mode'),
+    ('serve', 'controller', 'consolidation_mode'),
+    ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
+    ('jobs', 'controller', 'task_logs_gc_retention_hours'),
+]
 
 # Constants for Azure blob storage
 WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
@@ -421,6 +521,11 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
 # TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
 # Environment variable that is set to 'true' if this is a skypilot server.
 ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
+OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
+IS_SKYPILOT_SERVE_CONTROLLER = 'IS_SKYPILOT_SERVE_CONTROLLER'
+
+SERVE_OVERRIDE_CONCURRENT_LAUNCHES = (
+    f'{SKYPILOT_ENV_VAR_PREFIX}SERVE_OVERRIDE_CONCURRENT_LAUNCHES')
 
 # Environment variable that is set to 'true' if metrics are enabled.
 ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
@@ -436,6 +541,7 @@ ENV_VAR_DB_CONNECTION_URI = (f'{SKYPILOT_ENV_VAR_PREFIX}DB_CONNECTION_URI')
 # authentication is enabled in the API server.
 ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
 SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
+SKYPILOT_INGRESS_BASIC_AUTH_ENABLED = 'SKYPILOT_INGRESS_BASIC_AUTH_ENABLED'
 ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
 
 # Enable debug logging for requests.
@@ -447,11 +553,12 @@ SKYPILOT_DEFAULT_WORKSPACE = 'default'
 # BEGIN constants used for service catalog.
 HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs'  # pylint: disable=line-too-long
 HOSTED_CATALOG_DIR_URL_S3_MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs'  # pylint: disable=line-too-long
-CATALOG_SCHEMA_VERSION = '
+CATALOG_SCHEMA_VERSION = 'v8'
 CATALOG_DIR = '~/.sky/catalogs'
 ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
               'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
-              'paperspace', 'do', 'nebius', 'ssh', '
+              'paperspace', 'primeintellect', 'do', 'nebius', 'ssh', 'slurm',
+              'hyperbolic', 'seeweb', 'shadeform')
 # END constants used for service catalog.
 
 # The user ID of the SkyPilot system.
@@ -503,8 +610,11 @@ DEFAULT_PRIORITY = 0
 GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
 COST_REPORT_DEFAULT_DAYS = 30
 
-# The directory for file locks.
-SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
-
 ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
                                  'DEBUG_LOOP_LAG_THRESHOLD_MS')
+
+ARM64_ARCH = 'arm64'
+X86_64_ARCH = 'x86_64'
+
+SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (
+    f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')
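The new SKY_RUNTIME_DIR constants above come in two forms: a shell form ('${SKY_RUNTIME_DIR:-$HOME}', spliced into provisioning commands) and an env-var key for Python code. A minimal Python sketch of the resolution the comment describes, reusing the '.sky/jobs.db' path from the comment's own example (the runtime_path helper is illustrative, not part of the diff):

import os

SKY_RUNTIME_DIR_ENV_VAR_KEY = 'SKY_RUNTIME_DIR'


def runtime_path(relative: str) -> str:
    # Mirrors the shell default '${SKY_RUNTIME_DIR:-$HOME}': fall back to
    # $HOME ('~') when SKY_RUNTIME_DIR is unset.
    base = os.path.expanduser(
        os.environ.get(SKY_RUNTIME_DIR_ENV_VAR_KEY, '~'))
    return os.path.join(base, relative)


print(runtime_path('.sky/jobs.db'))  # e.g. /scratch/alice/.sky/jobs.db

On Slurm, pointing SKY_RUNTIME_DIR at node-local storage keeps the SQLite databases and the skypilot-runtime venv off NFS, which is the motivation given in the comments above.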
sky/skylet/events.py
CHANGED
@@ -11,7 +11,8 @@ import psutil
 from sky import clouds
 from sky import sky_logging
 from sky.backends import cloud_vm_ray_backend
-from sky.jobs import 
+from sky.jobs import constants as managed_job_constants
+from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
 from sky.serve import serve_utils
@@ -21,6 +22,7 @@ from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import cluster_utils
 from sky.utils import registry
+from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 from sky.utils import yaml_utils
 
@@ -45,6 +47,9 @@ class SkyletEvent:
             EVENT_CHECKING_INTERVAL_SECONDS))
         self._n = 0
 
+    def start(self):
+        pass
+
     def run(self):
         self._n = (self._n + 1) % self._event_interval
         if self._n % self._event_interval == 0:
@@ -73,18 +78,60 @@ class ManagedJobEvent(SkyletEvent):
     """Skylet event for updating and scheduling managed jobs."""
     EVENT_INTERVAL_SECONDS = 300
 
+    def start(self):
+        cpus_env_var = os.environ.get('SKYPILOT_POD_CPU_CORE_LIMIT')
+        if cpus_env_var is not None:
+            with open(os.path.expanduser(constants.CONTROLLER_K8S_CPU_FILE),
+                      'w',
+                      encoding='utf-8') as f:
+                f.write(cpus_env_var)
+        memory_env_var = os.environ.get('SKYPILOT_POD_MEMORY_GB_LIMIT')
+        if memory_env_var is not None:
+            with open(os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE),
+                      'w',
+                      encoding='utf-8') as f:
+                f.write(memory_env_var)
+
     def _run(self):
+        if not os.path.exists(
+                os.path.expanduser(
+                    managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE)
+        ) and not managed_job_utils.is_consolidation_mode():
+            # Note: since the skylet is started before the user setup (in
+            # jobs-controller.yaml.j2) runs, it's possible that we hit this
+            # before the indicator file is written. However, since we will wait
+            # EVENT_INTERVAL_SECONDS before the first run, this should be very
+            # unlikely.
+            logger.info('No jobs controller indicator file found.')
+            all_job_ids = managed_job_state.get_all_job_ids_by_name(None)
+            if not all_job_ids:
+                logger.info('No jobs running. Stopping controllers.')
+                # TODO(cooperc): Move this to a shared function also called by
+                # sdk.api_stop(). (#7229)
+                try:
+                    records = scheduler.get_controller_process_records()
+                    if records is not None:
+                        for record in records:
+                            if managed_job_utils.controller_process_alive(
+                                    record, quiet=False):
+                                subprocess_utils.kill_children_processes(
+                                    parent_pids=[record.pid], force=True)
+                    os.remove(
+                        os.path.expanduser(
+                            scheduler.JOB_CONTROLLER_PID_PATH))
+                except Exception as e:  # pylint: disable=broad-except
+                    # in case we get perm issues or something is messed up, just
+                    # ignore it and assume the process is dead
+                    logger.error(
+                        f'Error looking at job controller pid file: {e}')
+                    pass
+            logger.info(f'{len(all_job_ids)} jobs running. Assuming the '
+                        'indicator file hasn\'t been written yet.')
+            return
+
         logger.info('=== Updating managed job status ===')
         managed_job_utils.update_managed_jobs_statuses()
-
-
-class ManagedJobSchedulingEvent(SkyletEvent):
-    """Skylet event for scheduling managed jobs."""
-    EVENT_INTERVAL_SECONDS = 20
-
-    def _run(self):
-        logger.info('=== Scheduling next jobs ===')
-        managed_job_scheduler.maybe_schedule_next_jobs()
+        scheduler.maybe_start_controllers()
 
 
 class ServiceUpdateEvent(SkyletEvent):
@@ -189,7 +236,7 @@ class AutostopEvent(SkyletEvent):
                 RAY_PROVISIONER_SKYPILOT_TERMINATOR):
             logger.info('Using new provisioner to stop the cluster.')
             self._stop_cluster_with_new_provisioner(autostop_config, config,
-                                                    provider_name)
+                                                    provider_name, cloud)
             return
         logger.info('Not using new provisioner to stop the cluster. '
                     f'Cloud of this cluster: {provider_name}')
@@ -267,7 +314,8 @@ class AutostopEvent(SkyletEvent):
         raise NotImplementedError
 
     def _stop_cluster_with_new_provisioner(self, autostop_config,
-                                           cluster_config, provider_name
+                                           cluster_config, provider_name,
+                                           cloud):
         # pylint: disable=import-outside-toplevel
         from sky import provision as provision_lib
         autostop_lib.set_autostopping_started()
@@ -275,13 +323,25 @@ class AutostopEvent(SkyletEvent):
         cluster_name_on_cloud = cluster_config['cluster_name']
         is_cluster_multinode = cluster_config['max_workers'] > 0
 
+        # Clear AWS credentials from environment to force boto3 to use IAM
+        # role attached to the instance (lowest priority in credential chain).
+        # This allows the cluster to stop/terminate itself using its IAM role.
         os.environ.pop('AWS_ACCESS_KEY_ID', None)
         os.environ.pop('AWS_SECRET_ACCESS_KEY', None)
+        os.environ.pop('AWS_SESSION_TOKEN', None)
+        # Point boto3 to /dev/null to skip reading credentials from files.
+        os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
+        os.environ['AWS_CONFIG_FILE'] = '/dev/null'
 
         # Stop the ray autoscaler to avoid scaling up, during
         # stopping/terminating of the cluster.
-
-
+        if not cloud.uses_ray():
+            logger.info('Skipping ray stop as cloud does not use Ray.')
+        else:
+            logger.info('Stopping the ray cluster.')
+            subprocess.run(f'{constants.SKY_RAY_CMD} stop',
+                           shell=True,
+                           check=True)
 
         operation_fn = provision_lib.stop_instances
         if autostop_config.down:
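The start() hook added to SkyletEvent gives each event a one-time initialization phase before the periodic run() ticks (ManagedJobEvent uses it to persist the k8s CPU/memory limit env vars to disk). A simplified sketch of the cadence implied by the counter logic above; the class, the 20-second checking interval, and the driving loop are illustrative stand-ins, not the actual skylet code:

class FakeEvent:
    # run() is ticked every EVENT_CHECKING_INTERVAL_SECONDS; _run() only
    # fires once the counter wraps, i.e. every EVENT_INTERVAL_SECONDS.
    EVENT_CHECKING_INTERVAL_SECONDS = 20
    EVENT_INTERVAL_SECONDS = 300

    def __init__(self):
        self._event_interval = max(
            1, self.EVENT_INTERVAL_SECONDS //
            self.EVENT_CHECKING_INTERVAL_SECONDS)
        self._n = 0

    def start(self):
        # One-time setup, e.g. writing SKYPILOT_POD_CPU_CORE_LIMIT /
        # SKYPILOT_POD_MEMORY_GB_LIMIT to the CONTROLLER_K8S_* files.
        pass

    def run(self):
        # Same wrap-around counter as SkyletEvent.run() above.
        self._n = (self._n + 1) % self._event_interval
        if self._n % self._event_interval == 0:
            self._run()

    def _run(self):
        print('event fired')


event = FakeEvent()
event.start()        # called once at skylet startup
for _ in range(31):  # 31 ticks of ~20s: _run() fires at ticks 15 and 30
    event.run()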
sky/skylet/executor/__init__.py
ADDED
@@ -0,0 +1 @@
+"""Task Executors"""
sky/skylet/executor/slurm.py
ADDED
@@ -0,0 +1,189 @@
+"""Slurm distributed task executor for SkyPilot.
+
+This module is invoked on each Slurm compute node via:
+    srun python -m sky.skylet.executor.slurm --script=... --log-dir=...
+"""
+import argparse
+import json
+import os
+import pathlib
+import socket
+import subprocess
+import sys
+import time
+
+import colorama
+
+from sky.skylet.log_lib import run_bash_command_with_log
+
+
+def _get_ip_address() -> str:
+    """Get the IP address of the current node."""
+    ip_result = subprocess.run(['hostname', '-I'],
+                               capture_output=True,
+                               text=True,
+                               check=False)
+    return ip_result.stdout.strip().split(
+    )[0] if ip_result.returncode == 0 else 'unknown'
+
+
+def _get_job_node_ips() -> str:
+    """Get IPs of all nodes in the current Slurm job."""
+    nodelist = os.environ.get('SLURM_JOB_NODELIST', '')
+    assert nodelist, 'SLURM_JOB_NODELIST is not set'
+
+    # Expand compressed nodelist (e.g., "node[1-3,5]"
+    # -> "node1\nnode2\nnode3\nnode5")
+    result = subprocess.run(['scontrol', 'show', 'hostnames', nodelist],
+                            capture_output=True,
+                            text=True,
+                            check=False)
+    if result.returncode != 0:
+        raise RuntimeError(f'Failed to get hostnames for: {nodelist}')
+
+    hostnames = result.stdout.strip().split('\n')
+    ips = []
+    for hostname in hostnames:
+        try:
+            ip = socket.gethostbyname(hostname)
+            ips.append(ip)
+        except socket.gaierror as e:
+            raise RuntimeError('Failed to get IP for hostname: '
+                               f'{hostname}') from e
+
+    return '\n'.join(ips)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='SkyPilot Slurm task runner for distributed execution')
+    parser.add_argument('--script', help='User script (inline, shell-quoted)')
+    parser.add_argument('--script-path',
+                        help='Path to script file (if too long for inline)')
+    parser.add_argument('--env-vars',
+                        default='{}',
+                        help='JSON-encoded environment variables')
+    parser.add_argument('--log-dir',
+                        required=True,
+                        help='Directory for log files')
+    parser.add_argument('--cluster-num-nodes',
+                        type=int,
+                        required=True,
+                        help='Total number of nodes in the cluster')
+    parser.add_argument('--cluster-ips',
+                        required=True,
+                        help='Comma-separated list of cluster node IPs')
+    parser.add_argument('--task-name',
+                        default=None,
+                        help='Task name for single-node log prefix')
+    parser.add_argument(
+        '--is-setup',
+        action='store_true',
+        help=
+        'Whether this is a setup command (affects logging prefix and filename)')
+    parser.add_argument('--alloc-signal-file',
+                        help='Path to allocation signal file')
+    parser.add_argument('--setup-done-signal-file',
+                        help='Path to setup-done signal file')
+    args = parser.parse_args()
+
+    assert args.script is not None or args.script_path is not None, (
+        'Either '
+        '--script or --script-path must be provided')
+
+    # Task rank, different from index of the node in the cluster.
+    rank = int(os.environ['SLURM_PROCID'])
+    num_nodes = int(os.environ.get('SLURM_NNODES', 1))
+    is_single_node_cluster = (args.cluster_num_nodes == 1)
+
+    # Determine node index from IP (like Ray's cluster_ips_to_node_id)
+    cluster_ips = args.cluster_ips.split(',')
+    ip_addr = _get_ip_address()
+    try:
+        node_idx = cluster_ips.index(ip_addr)
+    except ValueError as e:
+        raise RuntimeError(f'IP address {ip_addr} not found in '
+                           f'cluster IPs: {cluster_ips}') from e
+    node_name = 'head' if node_idx == 0 else f'worker{node_idx}'
+
+    # Log files are written to a shared filesystem, so each node must use a
+    # unique filename to avoid collisions.
+    if args.is_setup:
+        # TODO(kevin): This is inconsistent with other clouds, where it is
+        # simply called 'setup.log'. On Slurm that is obviously not possible,
+        # since the ~/sky_logs directory is shared by all nodes, so
+        # 'setup.log' will be overwritten by other nodes.
+        # Perhaps we should apply this naming convention to other clouds.
+        log_filename = f'setup-{node_name}.log'
+    elif is_single_node_cluster:
+        log_filename = 'run.log'
+    else:
+        log_filename = f'{rank}-{node_name}.log'
+    log_path = os.path.join(args.log_dir, log_filename)
+
+    if args.script_path:
+        with open(args.script_path, 'r', encoding='utf-8') as f:
+            script = f.read()
+    else:
+        script = args.script
+
+    # Parse env vars and add SKYPILOT environment variables
+    env_vars = json.loads(args.env_vars)
+    if not args.is_setup:
+        # For setup, env vars are set in CloudVmRayBackend._setup.
+        env_vars['SKYPILOT_NODE_RANK'] = str(rank)
+        env_vars['SKYPILOT_NUM_NODES'] = str(num_nodes)
+        env_vars['SKYPILOT_NODE_IPS'] = _get_job_node_ips()
+
+    # Signal file coordination for setup/run synchronization
+    # Rank 0 touches the allocation signal to indicate resources acquired
+    if args.alloc_signal_file is not None and rank == 0:
+        pathlib.Path(args.alloc_signal_file).touch()
+
+    # Wait for setup to complete.
+    while args.setup_done_signal_file is not None and not os.path.exists(
+            args.setup_done_signal_file):
+        time.sleep(0.1)
+
+    # Build log prefix
+    # For setup on head: (setup pid={pid})
+    # For setup on workers: (setup pid={pid}, ip=1.2.3.4)
+    # For single-node cluster: (task_name, pid={pid})
+    # For multi-node on head: (head, rank=0, pid={pid})
+    # For multi-node on workers: (worker1, rank=1, pid={pid}, ip=1.2.3.4)
+    # The {pid} placeholder will be replaced by run_with_log
+    if args.is_setup:
+        # Setup prefix: head (node_idx=0) shows no IP, workers show IP
+        if node_idx == 0:
+            prefix = (f'{colorama.Fore.CYAN}(setup pid={{pid}})'
+                      f'{colorama.Style.RESET_ALL} ')
+        else:
+            prefix = (f'{colorama.Fore.CYAN}(setup pid={{pid}}, ip={ip_addr})'
+                      f'{colorama.Style.RESET_ALL} ')
+    elif is_single_node_cluster:
+        # Single-node cluster: use task name
+        name_str = args.task_name if args.task_name else 'task'
+        prefix = (f'{colorama.Fore.CYAN}({name_str}, pid={{pid}})'
+                  f'{colorama.Style.RESET_ALL} ')
+    else:
+        # Multi-node cluster: head (node_idx=0) shows no IP, workers show IP
+        if node_idx == 0:
+            prefix = (
+                f'{colorama.Fore.CYAN}({node_name}, rank={rank}, pid={{pid}})'
+                f'{colorama.Style.RESET_ALL} ')
+        else:
+            prefix = (f'{colorama.Fore.CYAN}'
+                      f'({node_name}, rank={rank}, pid={{pid}}, ip={ip_addr})'
+                      f'{colorama.Style.RESET_ALL} ')
+
+    returncode = run_bash_command_with_log(script,
+                                           log_path,
+                                           env_vars=env_vars,
+                                           stream_logs=True,
+                                           streaming_prefix=prefix)
+
+    sys.exit(returncode)
+
+
+if __name__ == '__main__':
+    main()
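For reference, a hypothetical launch of this executor, assembled only from the docstring and argparse flags above. The actual srun options SkyPilot passes (node counts, partition, the --alloc-signal-file/--setup-done-signal-file paths) come from the Slurm provisioner; every value below is a placeholder:

import json
import subprocess

cmd = [
    'srun', 'python', '-m', 'sky.skylet.executor.slurm',
    '--script', 'echo "rank=$SKYPILOT_NODE_RANK of $SKYPILOT_NUM_NODES"',
    '--env-vars', json.dumps({'MY_ENV': '1'}),
    '--log-dir', '/home/alice/sky_logs/sky-2025-01-01-00-00-00-000000',
    '--cluster-num-nodes', '2',
    '--cluster-ips', '10.0.0.1,10.0.0.2',
    '--task-name', 'train',
]
subprocess.run(cmd, check=True)

Each srun task then resolves its rank from SLURM_PROCID, maps its own IP to a node index for the head/workerN log prefix, and streams output through run_bash_command_with_log into a per-node file under --log-dir.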