skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/task.py
CHANGED
@@ -1,6 +1,5 @@
 """Task: a coarse-grained stage in an application."""
 import collections
-import inspect
 import json
 import os
 import re
@@ -8,6 +7,7 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
                     Union)

 import colorama
+from pydantic import SecretStr

 from sky import clouds
 from sky import dag as dag_lib
@@ -20,6 +20,7 @@ from sky.provision import docker_utils
 from sky.serve import service_spec
 from sky.skylet import constants
 from sky.utils import common_utils
+from sky.utils import git
 from sky.utils import registry
 from sky.utils import schemas
 from sky.utils import ux_utils
@@ -28,10 +29,6 @@ from sky.utils import yaml_utils

 logger = sky_logging.init_logger(__name__)

-# A lambda generating commands (node rank_i, node addrs -> cmd_i).
-CommandGen = Callable[[int, List[str]], Optional[str]]
-CommandOrCommandGen = Union[str, CommandGen]
-
 _VALID_NAME_REGEX = '[a-zA-Z0-9]+(?:[._-]{1,2}[a-zA-Z0-9]+)*'
 _VALID_NAME_DESCR = ('ASCII characters and may contain lowercase and'
                      ' uppercase letters, digits, underscores, periods,'
@@ -116,7 +113,7 @@ def _fill_in_env_vars(


 def _check_docker_login_config(task_envs: Dict[str, str],
-                               task_secrets: Dict[str,
+                               task_secrets: Dict[str, SecretStr]) -> bool:
     """Validates a valid docker login config in task_envs and task_secrets.

     Docker login variables must be specified together either in envs OR secrets,
@@ -177,12 +174,13 @@ def _with_docker_login_config(
     resources: Union[Set['resources_lib.Resources'],
                      List['resources_lib.Resources']],
     task_envs: Dict[str, str],
-    task_secrets: Dict[str,
+    task_secrets: Dict[str, SecretStr],
 ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
     if not _check_docker_login_config(task_envs, task_secrets):
         return resources
     envs = task_envs.copy()
-
+    for key, value in task_secrets.items():
+        envs[key] = value.get_secret_value()
     docker_login_config = docker_utils.DockerLoginConfig.from_env_vars(envs)

     def _add_docker_login_config(resources: 'resources_lib.Resources'):
@@ -211,10 +209,11 @@ def _with_docker_username_for_runpod(
     resources: Union[Set['resources_lib.Resources'],
                      List['resources_lib.Resources']],
     task_envs: Dict[str, str],
-    task_secrets: Dict[str,
+    task_secrets: Dict[str, SecretStr],
 ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
     envs = task_envs.copy()
-
+    for key, value in task_secrets.items():
+        envs[key] = value.get_secret_value()
     docker_username_for_runpod = envs.get(
         constants.RUNPOD_DOCKER_USERNAME_ENV_VAR)

@@ -227,6 +226,18 @@ def _with_docker_username_for_runpod(
                 for r in resources))


+def get_plaintext_envs_and_secrets(
+        envs_and_secrets: Dict[str, Union[str, SecretStr]],) -> Dict[str, str]:
+    return {
+        k: v.get_secret_value() if isinstance(v, SecretStr) else v
+        for k, v in envs_and_secrets.items()
+    }
+
+
+def get_plaintext_secrets(secrets: Dict[str, SecretStr]) -> Dict[str, str]:
+    return {k: v.get_secret_value() for k, v in secrets.items()}
+
+
 class Task:
     """Task: a computation to be run on the cloud."""

@@ -235,14 +246,14 @@ class Task:
         name: Optional[str] = None,
         *,
         setup: Optional[Union[str, List[str]]] = None,
-        run: Optional[Union[
+        run: Optional[Union[str, List[str]]] = None,
         envs: Optional[Dict[str, str]] = None,
         secrets: Optional[Dict[str, str]] = None,
         workdir: Optional[Union[str, Dict[str, Any]]] = None,
         num_nodes: Optional[int] = None,
         file_mounts: Optional[Dict[str, str]] = None,
         storage_mounts: Optional[Dict[str, storage_lib.Storage]] = None,
-        volumes: Optional[Dict[str, str]] = None,
+        volumes: Optional[Dict[str, Union[str, Dict[str, Any]]]] = None,
         resources: Optional[Union['resources_lib.Resources',
                                   List['resources_lib.Resources'],
                                   Set['resources_lib.Resources']]] = None,
@@ -321,7 +332,10 @@ class Task:
             object}``, where mount_path is the path inside the remote VM(s)
             where the Storage object will be mounted on.
           volumes: A dict of volumes to be mounted for the task. The dict has
-            the form of ``{mount_path: volume_name}
+            the form of ``{mount_path: volume_name}`` for external persistent
+            volumes, or ``{mount_path: volume_config}`` for ephemeral volumes
+            where volume_config is a dict with 'size', and optional type,
+            labels, and 'config' fields, etc.
           resources: either a sky.Resources, a set of them, or a list of them.
             A set or a list of resources asks the optimizer to "pick the
             best of these resources" to run this task.
@@ -344,11 +358,13 @@ class Task:
         self.storage_plans: Dict[storage_lib.Storage,
                                  storage_lib.StoreType] = {}
         self._envs = envs or {}
-        self._secrets =
+        self._secrets = {}
+        if secrets is not None:
+            self._secrets = {k: SecretStr(v) for k, v in secrets.items()}
         self._volumes = volumes or {}

         # concatenate commands if given as list
-        def _concat(commands):
+        def _concat(commands: Optional[Union[str, List[str]]]) -> Optional[str]:
             if isinstance(commands, list):
                 return '\n'.join(commands)
             return commands
@@ -446,42 +462,9 @@ class Task:

     def validate_run(self):
         """Validates if the run command is valid."""
-        if
-            run_sig = inspect.signature(self.run)
-            # Check that run is a function with 2 arguments.
-            if len(run_sig.parameters) != 2:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(_RUN_FN_CHECK_FAIL_MSG.format(run_sig))
-
-            type_list = [int, List[str]]
-            # Check annotations, if exists
-            for i, param in enumerate(run_sig.parameters.values()):
-                if param.annotation != inspect.Parameter.empty:
-                    if param.annotation != type_list[i]:
-                        with ux_utils.print_exception_no_traceback():
-                            raise ValueError(
-                                _RUN_FN_CHECK_FAIL_MSG.format(run_sig))
-
-            # Check self containedness.
-            run_closure = inspect.getclosurevars(self.run)
-            if run_closure.nonlocals:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'run command generator must be self contained. '
-                        f'Found nonlocals: {run_closure.nonlocals}')
-            if run_closure.globals:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'run command generator must be self contained. '
-                        f'Found globals: {run_closure.globals}')
-            if run_closure.unbound:
-                # Do not raise an error here. Import statements, which are
-                # allowed, will be considered as unbounded.
-                pass
-        elif self.run is not None and not isinstance(self.run, str):
+        if self.run is not None and not isinstance(self.run, str):
             with ux_utils.print_exception_no_traceback():
-                raise ValueError('run must be
-                                 f'a command generator ({CommandGen}). '
+                raise ValueError('run must be a shell script (str). '
                                  f'Got {type(self.run)}')

     def expand_and_validate_file_mounts(self):
@@ -648,6 +631,10 @@ class Task:
             config['workdir'] = _fill_in_env_vars(config['workdir'],
                                                   env_and_secrets)

+        if config.get('volumes') is not None:
+            config['volumes'] = _fill_in_env_vars(config['volumes'],
+                                                  env_and_secrets)
+
         task = Task(
             config.pop('name', None),
             run=config.pop('run', None),
@@ -737,34 +724,9 @@ class Task:
         task.set_outputs(outputs=outputs,
                          estimated_size_gigabytes=estimated_size_gigabytes)

-        # Experimental configs.
-        experimental_configs = config.pop('experimental', None)
-
         # Handle the top-level config field
         config_override = config.pop('config', None)

-        # Handle backward compatibility with experimental.config_overrides
-        # TODO: Remove experimental.config_overrides in 0.11.0.
-        if experimental_configs is not None:
-            exp_config_override = experimental_configs.pop(
-                'config_overrides', None)
-            if exp_config_override is not None:
-                logger.warning(
-                    f'{colorama.Fore.YELLOW}`experimental.config_overrides` '
-                    'field is deprecated in the task YAML. Use the `config` '
-                    f'field to set config overrides.{colorama.Style.RESET_ALL}')
-                if config_override is not None:
-                    logger.warning(
-                        f'{colorama.Fore.YELLOW}Both top-level `config` and '
-                        f'`experimental.config_overrides` are specified. '
-                        f'Using top-level `config`.{colorama.Style.RESET_ALL}')
-                else:
-                    config_override = exp_config_override
-                logger.debug('Overriding skypilot config with task-level config: '
-                             f'{config_override}')
-            assert not experimental_configs, ('Invalid task args: '
-                                              f'{experimental_configs.keys()}')
-
         # Store the final config override for use in resource setup
         cluster_config_override = config_override

@@ -830,16 +792,27 @@ class Task:
             # https://github.com/yaml/pyyaml/issues/165#issuecomment-430074049
             # to raise errors on duplicate keys.
             user_specified_yaml = f.read()
-
+        return Task.from_yaml_str(user_specified_yaml)
+
+    @staticmethod
+    def from_yaml_str(yaml_str: str) -> 'Task':
+        """Initializes a task from a task YAML string.
+
+        Example:
+            .. code-block:: python
+
+                task = sky.Task.from_yaml_str('yaml_str')
+        """
+        config = yaml_utils.safe_load(yaml_str)

         if isinstance(config, str):
             with ux_utils.print_exception_no_traceback():
                 raise ValueError('YAML loaded as str, not as dict. '
-                                 f'Is it correct?
+                                 f'Is it correct? content:\n{yaml_str}')

         if config is None:
             config = {}
-        config['_user_specified_yaml'] =
+        config['_user_specified_yaml'] = yaml_str
         return Task.from_yaml_config(config)

     def resolve_and_validate_volumes(self) -> None:
@@ -860,13 +833,26 @@ class Task:
         volume_mounts: List[volume_lib.VolumeMount] = []
         for dst_path, vol in self._volumes.items():
             self._validate_mount_path(dst_path, location='volumes')
-            # Shortcut for `dst_path: volume_name`
+            # Shortcut for `dst_path: volume_name` (external persistent volume)
             if isinstance(vol, str):
                 volume_mount = volume_lib.VolumeMount.resolve(dst_path, vol)
             elif isinstance(vol, dict):
-
-
-
+                # Check if this is an ephemeral volume config or external volume
+                # with 'size' field
+                if 'size' in vol:
+                    # This is an ephemeral volume config
+                    volume_mount = (
+                        volume_lib.VolumeMount.resolve_ephemeral_config(
+                            dst_path, vol))
+                elif 'name' in vol:
+                    # External volume with 'name' field
+                    volume_mount = volume_lib.VolumeMount.resolve(
+                        dst_path, vol['name'])
+                else:
+                    raise ValueError(
+                        f'Invalid volume config: {dst_path}: {vol}. '
+                        'Either "size" (for ephemeral volume) or "name" '
+                        '(for external volume) must be set.')
             else:
                 raise ValueError(f'Invalid volume config: {dst_path}: {vol}')
             volume_mounts.append(volume_mount)
@@ -895,6 +881,9 @@ class Task:
                 if access_mode in disabled_modes:
                     raise ValueError(f'Volume {vol.volume_name} with '
                                      f'{disabled_modes[access_mode]}')
+            # Skip ephemeral volumes for topology check
+            if vol.is_ephemeral:
+                continue
             # Check topology
             for key, (vol_name, previous_req) in topology.items():
                 req = getattr(vol.volume_config, key)
@@ -931,6 +920,8 @@ class Task:
                                 vol_req)
                 else:
                     override_params[key] = vol_req
+        logger.debug(
+            f'Override resources with volume constraints: {override_params}')
         self.set_resources_override(override_params)
         self.volume_mounts = volume_mounts

@@ -961,22 +952,26 @@ class Task:
         return self._envs

     @property
-    def secrets(self) -> Dict[str,
+    def secrets(self) -> Dict[str, SecretStr]:
         return self._secrets

     @property
-    def volumes(self) -> Dict[str, str]:
+    def volumes(self) -> Dict[str, Union[str, Dict[str, Any]]]:
         return self._volumes

-    def set_volumes(self, volumes: Dict[str, str
+    def set_volumes(self, volumes: Dict[str, Union[str, Dict[str,
+                                                              Any]]]) -> None:
         """Sets the volumes for this task.

         Args:
-            volumes: a dict of ``{mount_path: volume_name}
+            volumes: a dict of ``{mount_path: volume_name}`` for external
+                persistent volumes, or ``{mount_path: volume_config}`` for
+                ephemeral volumes.
         """
         self._volumes = volumes

-    def update_volumes(self, volumes: Dict[str, str
+    def update_volumes(self, volumes: Dict[str, Union[str, Dict[str,
+                                                                Any]]]) -> None:
         """Updates the volumes for this task."""
         self._volumes.update(volumes)

@@ -1064,7 +1059,8 @@ class Task:
                 raise ValueError(
                     'secrets must be List[Tuple[str, str]] or Dict[str, str]: '
                     f'{secrets}')
-
+        for key, value in secrets.items():
+            self._secrets[key] = SecretStr(value)
         # Validate Docker login configuration if needed
         if _check_docker_login_config(self._envs, self._secrets):
             self.resources = _with_docker_login_config(self.resources,
@@ -1079,7 +1075,7 @@ class Task:
         return any(r.use_spot for r in self.resources)

     @property
-    def envs_and_secrets(self) -> Dict[str, str]:
+    def envs_and_secrets(self) -> Dict[str, Union[str, SecretStr]]:
         envs = self.envs.copy()
         envs.update(self.secrets)
         return envs
@@ -1125,7 +1121,7 @@ class Task:
     def set_resources(
         self, resources: Union['resources_lib.Resources',
                                List['resources_lib.Resources'],
-                               Set['resources_lib.Resources']]
+                               Set['resources_lib.Resources'], Dict[str, Any]]
     ) -> 'Task':
         """Sets the required resources to execute this task.

@@ -1139,7 +1135,9 @@ class Task:
         Returns:
           self: The current task, with resources set.
         """
-        if isinstance(resources,
+        if isinstance(resources, dict):
+            resources = resources_lib.Resources.from_yaml_config(resources)
+        elif isinstance(resources, resources_lib.Resources):
             resources = {resources}
         # TODO(woosuk): Check if the resources are None.
         self.resources = _with_docker_login_config(resources, self.envs,
@@ -1167,6 +1165,10 @@ class Task:
         self.set_resources(type(self.resources)(new_resources_list))
         return self

+    def get_resource_config(self) -> Dict[str, Any]:
+        return _resources_to_config(self.resources,
+                                    factor_out_common_fields=True)
+
     @property
     def service(self) -> Optional[service_spec.SkyServiceSpec]:
         return self._service
@@ -1547,6 +1549,16 @@ class Task:
                 self.update_file_mounts({
                     mnt_path: blob_path,
                 })
+            elif store_type is storage_lib.StoreType.COREWEAVE:
+                if storage.source is not None and not isinstance(
+                        storage.source,
+                        list) and storage.source.startswith('cw://'):
+                    blob_path = storage.source
+                else:
+                    blob_path = 'cw://' + storage.name
+                self.update_file_mounts({
+                    mnt_path: blob_path,
+                })
             else:
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(f'Storage Type {store_type} '
@@ -1596,6 +1608,69 @@ class Task:
                 d[k] = v
         return d

+    def update_workdir(self, workdir: Optional[str], git_url: Optional[str],
+                       git_ref: Optional[str]) -> 'Task':
+        """Updates the task workdir.
+
+        Args:
+            workdir: The workdir to update.
+            git_url: The git url to update.
+            git_ref: The git ref to update.
+        """
+        if self.workdir is None or isinstance(self.workdir, str):
+            if workdir is not None:
+                self.workdir = workdir
+                return self
+            if git_url is not None:
+                self.workdir = {}
+                self.workdir['url'] = git_url
+                if git_ref is not None:
+                    self.workdir['ref'] = git_ref
+                return self
+            return self
+        if git_url is not None:
+            self.workdir['url'] = git_url
+        if git_ref is not None:
+            self.workdir['ref'] = git_ref
+        return self
+
+    def update_envs_and_secrets_from_workdir(self) -> 'Task':
+        """Updates the task envs and secrets from the workdir."""
+        if self.workdir is None:
+            return self
+        if not isinstance(self.workdir, dict):
+            return self
+        url = self.workdir['url']
+        ref = self.workdir.get('ref', '')
+        token = os.environ.get(git.GIT_TOKEN_ENV_VAR)
+        ssh_key_path = os.environ.get(git.GIT_SSH_KEY_PATH_ENV_VAR)
+        try:
+            git_repo = git.GitRepo(url, ref, token, ssh_key_path)
+            clone_info = git_repo.get_repo_clone_info()
+            if clone_info is None:
+                return self
+            self.envs[git.GIT_URL_ENV_VAR] = clone_info.url
+            if ref:
+                ref_type = git_repo.get_ref_type()
+                if ref_type == git.GitRefType.COMMIT:
+                    self.envs[git.GIT_COMMIT_HASH_ENV_VAR] = ref
+                elif ref_type == git.GitRefType.BRANCH:
+                    self.envs[git.GIT_BRANCH_ENV_VAR] = ref
+                elif ref_type == git.GitRefType.TAG:
+                    self.envs[git.GIT_TAG_ENV_VAR] = ref
+            if clone_info.token is None and clone_info.ssh_key is None:
+                return self
+            if clone_info.token is not None:
+                self.secrets[git.GIT_TOKEN_ENV_VAR] = SecretStr(
+                    clone_info.token)
+            if clone_info.ssh_key is not None:
+                self.secrets[git.GIT_SSH_KEY_ENV_VAR] = SecretStr(
+                    clone_info.ssh_key)
+        except exceptions.GitError as e:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(f'{str(e)}') from None
+        return self
+
     def to_yaml_config(self,
                        use_user_specified_yaml: bool = False) -> Dict[str, Any]:
         """Returns a yaml-style dict representation of the task.
@@ -1622,16 +1697,7 @@ class Task:

         add_if_not_none('name', self.name)

-        tmp_resource_config
-            Dict[str, List[Dict[str, Union[str, int]]]]]
-        if len(self.resources) > 1:
-            resource_list = []
-            for r in self.resources:
-                resource_list.append(r.to_yaml_config())
-            key = 'ordered' if isinstance(self.resources, list) else 'any_of'
-            tmp_resource_config = {key: resource_list}
-        else:
-            tmp_resource_config = list(self.resources)[0].to_yaml_config()
+        tmp_resource_config = _resources_to_config(self.resources)

         add_if_not_none('resources', tmp_resource_config)

@@ -1657,8 +1723,10 @@ class Task:
         add_if_not_none('envs', self.envs, no_empty=True)

         secrets = self.secrets
-        if secrets and redact_secrets:
-            secrets = {k:
+        if secrets and not redact_secrets:
+            secrets = {k: v.get_secret_value() for k, v in secrets.items()}
+        elif secrets and redact_secrets:
+            secrets = {k: '<redacted>' for k, v in secrets.items()}
         add_if_not_none('secrets', secrets, no_empty=True)

         add_if_not_none('file_mounts', {})
@@ -1709,7 +1777,12 @@ class Task:
         return required_features

     def __rshift__(self, b):
-        dag_lib.get_current_dag()
+        dag = dag_lib.get_current_dag()
+        if dag is None:
+            raise RuntimeError(
+                'Cannot use >> operator outside of a DAG context. '
+                'Please use "with sky.Dag() as dag:" to create a DAG context.')
+        dag.add_edge(self, b)

     def __repr__(self):
         if isinstance(self.run, str):
@@ -1744,3 +1817,47 @@ class Task:
         else:
             s += '\n resources: default instances'
         return s
+
+
+def _resources_to_config(
+    resources: Union[List['resources_lib.Resources'],
+                     Set['resources_lib.Resources']],
+    factor_out_common_fields: bool = False) -> Dict[str, Any]:
+    if len(resources) > 1:
+        resource_list: List[Dict[str, Union[str, int]]] = []
+        for r in resources:
+            resource_list.append(r.to_yaml_config())
+        group_key = 'ordered' if isinstance(resources, list) else 'any_of'
+        if factor_out_common_fields:
+            return _factor_out_common_resource_fields(resource_list, group_key)
+        return {group_key: resource_list}
+    else:
+        return list(resources)[0].to_yaml_config()
+
+
+def _factor_out_common_resource_fields(configs: List[Dict[str, Union[str,
+                                                                      int]]],
+                                       group_key: str) -> Dict[str, Any]:
+    """Factors out the fields that are common to all resources."""
+    return_config: Dict[str, Any] = configs[0].copy()
+    if len(configs) > 1:
+        for config in configs[1:]:
+            for key, value in config.items():
+                if key in return_config and return_config[key] != value:
+                    del return_config[key]
+    num_empty_configs = 0
+    for config in configs:
+        keys_to_delete = []
+        for key, value in config.items():
+            if key in return_config:
+                keys_to_delete.append(key)
+        for key in keys_to_delete:
+            del config[key]
+        if not config:
+            num_empty_configs += 1
+
+    if num_empty_configs == len(configs):
+        return return_config
+    if len(configs) > 0:
+        return_config[group_key] = configs
+    return return_config
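The task.py changes above drop the old run-command-generator support in favor of plain shell-script run commands, wrap secrets in pydantic SecretStr, add Task.from_yaml_str, and extend volumes to accept ephemeral volume configs alongside named external volumes. A minimal sketch of how this new surface might be exercised; the YAML values, the secret key, and the ephemeral-volume size format below are illustrative assumptions, not taken from the package:

    import sky
    from sky import task as task_lib

    yaml_str = """
    name: demo
    run: |
      python train.py
    volumes:
      /data: my-volume     # external persistent volume: {mount_path: volume_name}
      /scratch:            # ephemeral volume config: {mount_path: volume_config}
        size: 100          # size value/format is an assumption
    """

    # New in this release: build a Task directly from a YAML string.
    task = sky.Task.from_yaml_str(yaml_str)

    # Secrets passed as plain strings are stored as pydantic SecretStr values;
    # the new module-level helper recovers plaintext only when explicitly asked.
    secret_task = sky.Task(run='echo hello', secrets={'MY_TOKEN': 'dummy-value'})
    plaintext = task_lib.get_plaintext_secrets(secret_task.secrets)
    assert plaintext == {'MY_TOKEN': 'dummy-value'}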
sky/templates/aws-ray.yml.j2
CHANGED
@@ -190,6 +190,7 @@ setup_commands:
   {{ conda_installation_commands }}
   conda config --remove channels "https://aws-ml-conda-ec2.s3.us-west-2.amazonaws.com" || true;
   {{ ray_skypilot_installation_commands }}
+  {{ copy_skypilot_templates_commands }}
   sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
   {%- if docker_image is none %}
   sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
sky/templates/azure-ray.yml.j2
CHANGED
@@ -118,6 +118,7 @@ setup_commands:
   - mkdir -p ~/.ssh; touch ~/.ssh/config;
     {{ conda_installation_commands }}
     {{ ray_skypilot_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
     touch ~/.sudo_as_admin_successful;
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     {%- if docker_image is none %}
sky/templates/cudo-ray.yml.j2
CHANGED
@@ -68,6 +68,7 @@ setup_commands:
   mkdir -p ~/.ssh; touch ~/.ssh/config;
   {{ conda_installation_commands }}
   {{ ray_skypilot_installation_commands }}
+  {{ copy_skypilot_templates_commands }}
   touch ~/.sudo_as_admin_successful;
   sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
   sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
sky/templates/do-ray.yml.j2
CHANGED
@@ -89,6 +89,7 @@ setup_commands:
   mkdir -p ~/.ssh; touch ~/.ssh/config;
   {{ conda_installation_commands }}
   {{ ray_skypilot_installation_commands }}
+  {{ copy_skypilot_templates_commands }}
   sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
   sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
   mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
@@ -69,6 +69,7 @@ setup_commands:
   mkdir -p ~/.ssh; touch ~/.ssh/config;
   {{ conda_installation_commands }}
   {{ ray_skypilot_installation_commands }}
+  {{ copy_skypilot_templates_commands }}
   touch ~/.sudo_as_admin_successful;
   sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
   sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
sky/templates/gcp-ray.yml.j2
CHANGED
@@ -276,6 +276,7 @@ setup_commands:
   grep "export TPU_NAME=" ~/.bashrc && echo "TPU_NAME already set" || echo "export TPU_NAME={{tpu_node_name}}" >> ~/.bashrc;
   {%- endif %}
   {{ ray_skypilot_installation_commands }}
+  {{ copy_skypilot_templates_commands }}
   sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
   {%- if docker_image is none %}
   sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
@@ -62,6 +62,7 @@ setup_commands:
   which patch > /dev/null || sudo apt install -y patch;
   {{ conda_installation_commands }}
   {{ ray_skypilot_installation_commands }}
+  {{ copy_skypilot_templates_commands }}
   sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
   sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
   {{ ssh_max_sessions_config }}
sky/templates/ibm-ray.yml.j2
CHANGED
@@ -102,6 +102,7 @@ setup_commands:
   mkdir -p ~/.ssh; touch ~/.ssh/config;
   {{ conda_installation_commands }}
   {{ ray_skypilot_installation_commands }}
+  {{ copy_skypilot_templates_commands }}
   sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
   sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
   mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
@@ -121,7 +122,7 @@ head_start_ray_commands:
   # all the sessions to be reloaded. This is a workaround.
   - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
     which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-    {{dump_port_command}}
+    {{dump_port_command}} {{ray_head_wait_initialized_command}}

 {%- if num_nodes > 1 %}
 worker_start_ray_commands:
@@ -36,6 +36,9 @@ setup: |
   grep -q 'alias sky-env=' ~/.bashrc || echo 'alias sky-env="{{ sky_activate_python_env }}"' >> ~/.bashrc
   {% endif %}

+  # This is used by the skylet events to check if we are a jobs controller.
+  touch {{job_controller_indicator_file}}
+
 run: |
   {%- if consolidation_mode_job_id is none %}
   {{ sky_activate_python_env }}