skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/jobs/scheduler.py
CHANGED
@@ -42,144 +42,220 @@ Nomenclature:
 """
 
 from argparse import ArgumentParser
+import asyncio
 import contextlib
 import os
+import pathlib
+import shutil
 import sys
-import
-from typing import Optional
+import typing
+from typing import List, Optional, Set
+import uuid
 
 import filelock
 
-from sky import exceptions
 from sky import sky_logging
+from sky import skypilot_config
+from sky.adaptors import common as adaptors_common
+from sky.client import sdk
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
-from sky.
+from sky.jobs import utils as managed_job_utils
 from sky.skylet import constants
-from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 
+if typing.TYPE_CHECKING:
+    import logging
+
+    import psutil
+else:
+    psutil = adaptors_common.LazyImport('psutil')
+
 logger = sky_logging.init_logger('sky.jobs.controller')
 
-
+# Job controller lock. This is used to synchronize writing/reading the
+# controller pid file.
+JOB_CONTROLLER_PID_LOCK = os.path.expanduser(
+    '~/.sky/locks/job_controller_pid.lock')
+
+JOB_CONTROLLER_PID_PATH = os.path.expanduser('~/.sky/job_controller_pid')
+JOB_CONTROLLER_ENV_PATH = os.path.expanduser('~/.sky/job_controller_env')
+
+CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash')
+
+
+def _parse_controller_pid_entry(
+        entry: str) -> Optional[state.ControllerPidRecord]:
+    entry = entry.strip()
+    if not entry:
+        return None
+    # The entry should be like <pid>,<started_at>
+    # pid is an integer, started_at is a float
+    # For backwards compatibility, we also support just <pid>
+    entry_parts = entry.split(',')
+    if len(entry_parts) == 2:
+        [raw_pid, raw_started_at] = entry_parts
+    elif len(entry_parts) == 1:
+        # Backwards compatibility, pre-#7847
+        # TODO(cooperc): Remove for 0.13.0
+        raw_pid = entry_parts[0]
+        raw_started_at = None
+    else:
+        # Unknown format
+        return None
+
+    try:
+        pid = int(raw_pid)
+    except ValueError:
+        return None
+
+    started_at: Optional[float] = None
+    if raw_started_at:
+        try:
+            started_at = float(raw_started_at)
+        except ValueError:
+            started_at = None
+    return state.ControllerPidRecord(pid=pid, started_at=started_at)
+
+
+def get_controller_process_records(
+) -> Optional[List[state.ControllerPidRecord]]:
+    """Return recorded controller processes if the file can be read."""
+    if not os.path.exists(JOB_CONTROLLER_PID_PATH):
+        # If the file doesn't exist, it means the controller server is not
+        # running, so we return an empty list
+        return []
+    try:
+        with open(JOB_CONTROLLER_PID_PATH, 'r', encoding='utf-8') as f:
+            lines = f.read().splitlines()
+    except (FileNotFoundError, OSError):
+        return None
+
+    records: List[state.ControllerPidRecord] = []
+    for line in lines:
+        record = _parse_controller_pid_entry(line)
+        if record is not None:
+            records.append(record)
+    return records
 
 
-def
-
-
-
-
-
-
-    f
-    f'{dag_yaml_path} --job-id {job_id} {maybe_pool_arg};')
-
-    # If the command line here is changed, please also update
-    # utils._controller_process_alive. The substring `--job-id X`
-    # should be in the command.
-    run_cmd = (f'{activate_python_env_cmd}'
-               f'{source_environment_cmd}'
-               f'{run_controller_cmd}')
+def _append_controller_pid_record(pid: int,
+                                  started_at: Optional[float]) -> None:
+    # Note: started_at is a float, but converting to a string will not lose any
+    # precision. See https://docs.python.org/3/tutorial/floatingpoint.html and
+    # https://github.com/python/cpython/issues/53583
+    entry = str(pid) if started_at is None else f'{pid},{started_at}'
+    with open(JOB_CONTROLLER_PID_PATH, 'a', encoding='utf-8') as f:
+        f.write(entry + '\n')
 
+
+def start_controller() -> None:
+    """Start the job controller process.
+
+    This requires that the env file is already set up.
+    """
+    os.environ[constants.OVERRIDE_CONSOLIDATION_MODE] = 'true'
     logs_dir = os.path.expanduser(
         managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
     os.makedirs(logs_dir, exist_ok=True)
-
+    controller_uuid = str(uuid.uuid4())
+    log_path = os.path.join(logs_dir, f'controller_{controller_uuid}.log')
+
+    activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
+    run_controller_cmd = (f'{sys.executable} -u -m'
+                          f'sky.jobs.controller {controller_uuid}')
+
+    run_cmd = (f'{activate_python_env_cmd}'
+               f'{run_controller_cmd}')
+
+    logger.info(f'Running controller with command: {run_cmd}')
 
     pid = subprocess_utils.launch_new_process_tree(run_cmd, log_output=log_path)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    the
-
-
-    If this function obtains the lock, it will launch as many jobs as possible
-    before releasing the lock. This is what allows other calls to exit
-    immediately if the lock is held, while ensuring that all jobs are started as
-    soon as possible.
-
-    This uses subprocess_utils.launch_new_process_tree() to start the controller
-    processes, which should be safe to call from pretty much any code running on
-    the jobs controller instance. New job controller processes will be detached
-    from the current process and there will not be a parent/child relationship.
-    See launch_new_process_tree for more.
-
-    After adding the pool support, this function will be called in a per-pool
-    basis. We employ resources limitation for each pool given the number of
-    ready workers in the pool. Each pool will have its own scheduler queue,
-    indicating by the argument `pool`. Finished job in pool 1 will only trigger
-    another jobs in pool 1, but the job in pool 2 will still be waiting. When
-    the `pool` argument is None, it schedules a job regardless of the pool.
+    pid_started_at = psutil.Process(pid).create_time()
+    _append_controller_pid_record(pid, pid_started_at)
+
+
+def get_alive_controllers() -> Optional[int]:
+    records = get_controller_process_records()
+    if records is None:
+        # If we cannot read the file reliably, avoid starting extra controllers.
+        return None
+    if not records:
+        return 0
+
+    alive = 0
+    for record in records:
+        if managed_job_utils.controller_process_alive(record, quiet=False):
+            alive += 1
+    return alive
+
+
+def maybe_start_controllers(from_scheduler: bool = False) -> None:
+    """Start the job controller process.
+
+    If the process is already running, it will not start a new one.
+    Will also add the job_id, dag_yaml_path, and env_file_path to the
+    controllers list of processes.
     """
+    # In consolidation mode, during rolling update, two API servers may be
+    # running. If we are on the new API server, and we haven't finished the
+    # recovery process, we should avoid starting new controllers. The old API
+    # server/consolidated jobs controller could run update_managed_jobs_statuses
+    # and if there are jobs running on the new API server, the old one will not
+    # see the corresponding processes and may mark them as FAILED_CONTROLLER.
+    if from_scheduler and managed_job_utils.is_consolidation_mode(
+    ) and os.path.exists(
+            os.path.expanduser(
+                constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
+        # This could happen during an API server rolling update, or during
+        # normal running while managed-job-status-refresh-daemon is running. In
+        # either case, the controllers should be already started or will be
+        # started by the recovery process.
+        logger.info('Recovery is still in progress, skipping controller start.')
+        return
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            job_id = maybe_next_job['job_id']
-            dag_yaml_path = maybe_next_job['dag_yaml_path']
-            env_file_path = maybe_next_job['env_file_path']
-
-            _start_controller(job_id, dag_yaml_path, env_file_path,
-                              actual_pool)
+        with filelock.FileLock(JOB_CONTROLLER_PID_LOCK, blocking=False):
+            if from_scheduler and not managed_job_utils.is_consolidation_mode():
+                cur = pathlib.Path(CURRENT_HASH)
+                old = pathlib.Path(f'{CURRENT_HASH}.old')
+
+                if old.exists() and cur.exists():
+                    if (old.read_text(encoding='utf-8') !=
+                            cur.read_text(encoding='utf-8')):
+                        # TODO(luca): there is a 1/2^160 chance that there will
+                        # be a collision. using a geometric distribution and
+                        # assuming one update a day, we expect a bug slightly
+                        # before the heat death of the universe. should get
+                        # this fixed before then.
+                        try:
+                            # this will stop all the controllers and the api
+                            # server.
+                            sdk.api_stop()
+                            # All controllers should be dead. Remove the PIDs so
+                            # that update_managed_jobs_statuses won't think they
+                            # have failed.
+                            state.reset_jobs_for_recovery()
+                        except Exception as e:  # pylint: disable=broad-except
+                            logger.error(f'Failed to stop the api server: {e}')
+                            pass
+                    else:
+                        shutil.copyfile(cur, old)
+                if not old.exists():
+                    shutil.copyfile(cur, old)
+
+            alive = get_alive_controllers()
+            if alive is None:
+                return
+            wanted = controller_utils.get_number_of_jobs_controllers()
+            started = 0
+
+            while alive + started < wanted:
+                start_controller()
+                started += 1
+
+            if started > 0:
+                logger.info(f'Started {started} controllers')
 
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
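The hunk above replaces the old one-process-per-job launch path with a pool of long-lived controller processes tracked in ~/.sky/job_controller_pid, one line per process. Below is a minimal standalone sketch of that file format, assuming a simplified ControllerPidRecord stand-in for the real sky.jobs.state type: each line is either '<pid>' (legacy) or '<pid>,<started_at>', and malformed lines are skipped rather than failing the whole read. Recording the process start time next to the PID plausibly lets the liveness check tell a still-running controller apart from an unrelated process that reused the PID.

# Standalone sketch of the controller PID file format; ControllerPidRecord
# here is a stand-in for the real type in sky.jobs.state.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class ControllerPidRecord:
    pid: int
    started_at: Optional[float]


def parse_pid_file(text: str) -> List[ControllerPidRecord]:
    records: List[ControllerPidRecord] = []
    for line in text.splitlines():
        parts = line.strip().split(',')
        if not parts[0] or len(parts) > 2:
            continue  # blank line or unknown format: skip
        try:
            pid = int(parts[0])
        except ValueError:
            continue  # malformed pid: skip the entry
        started_at: Optional[float] = None
        if len(parts) == 2:
            try:
                started_at = float(parts[1])
            except ValueError:
                started_at = None
        records.append(ControllerPidRecord(pid, started_at))
    return records


print(parse_pid_file('1234\n5678,1733868000.25\nnot-a-pid\n'))
# Keeps 1234 (legacy entry) and 5678 with its start time; drops the bad line.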
@@ -188,30 +264,63 @@ def maybe_schedule_next_jobs() -> None:
 
 
 def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
-               env_file_path: str, priority: int
+               env_file_path: str, priority: int) -> None:
     """Submit an existing job to the scheduler.
 
     This should be called after a job is created in the `spot` table as
     PENDING. It will tell the scheduler to try and start the job controller, if
-    there are resources available.
-    should not be on the critical path for `sky jobs launch -d`.
+    there are resources available.
 
     The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    controller_process = state.get_job_controller_process(job_id)
+    if controller_process is not None:
+        # why? TODO(cooperc): figure out why this is needed, fix it, and remove
+        if managed_job_utils.controller_process_alive(controller_process,
+                                                      job_id):
+            # This can happen when HA recovery runs for some reason but the job
+            # controller is still alive.
+            logger.warning(f'Job {job_id} is still alive with controller '
+                           f'{controller_process}, skipping submission')
+            maybe_start_controllers(from_scheduler=True)
+            return
+
+    with open(dag_yaml_path, 'r', encoding='utf-8') as dag_file:
+        dag_yaml_content = dag_file.read()
+    with open(original_user_yaml_path, 'r',
+              encoding='utf-8') as original_user_yaml_file:
+        original_user_yaml_content = original_user_yaml_file.read()
+    with open(env_file_path, 'r', encoding='utf-8') as env_file:
+        env_file_content = env_file.read()
+
+    # Read config file if SKYPILOT_CONFIG env var is set
+    config_file_content: Optional[str] = None
+    config_file_path = os.environ.get(skypilot_config.ENV_VAR_SKYPILOT_CONFIG)
+    if config_file_path:
+        config_file_path = os.path.expanduser(config_file_path)
+        if os.path.exists(config_file_path):
+            with open(config_file_path, 'r', encoding='utf-8') as config_file:
+                config_file_content = config_file.read()
+
+    config_bytes = (len(config_file_content) if config_file_content else 0)
+    logger.debug(f'Storing job {job_id} file contents in database '
+                 f'(DAG bytes={len(dag_yaml_content)}, '
+                 f'original user yaml bytes={len(original_user_yaml_content)}, '
+                 f'env bytes={len(env_file_content)}, '
+                 f'config bytes={config_bytes}).')
+    state.scheduler_set_waiting(job_id, dag_yaml_content,
+                                original_user_yaml_content, env_file_content,
+                                config_file_content, priority)
+    maybe_start_controllers(from_scheduler=True)
+
+
+@contextlib.asynccontextmanager
+async def scheduled_launch(
+    job_id: int,
+    starting: Set[int],
+    starting_lock: asyncio.Lock,
+    starting_signal: asyncio.Condition,
+):
     """Launch as part of an ongoing job.
 
     A newly started job will already be LAUNCHING, and this will immediately
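submit_job now ends by calling maybe_start_controllers(from_scheduler=True), which (per the first hunk) serializes controller startup behind a non-blocking file lock and simply exits if another process already holds it. A minimal sketch of that try-lock-or-bail pattern follows; the lock path is illustrative, not the real one.

# Sketch of the non-blocking lock pattern maybe_start_controllers relies on:
# whoever holds the lock is already doing the startup work, so losing the
# race means there is nothing left to do.
import filelock

LOCK_PATH = '/tmp/job_controller_pid.lock'  # illustrative path


def maybe_do_singleton_work() -> None:
    try:
        with filelock.FileLock(LOCK_PATH, blocking=False):
            print('Got the lock; start any missing controllers here.')
    except filelock.Timeout:
        # Another process holds the lock and will start the controllers;
        # returning immediately keeps callers off the critical path.
        print('Lock held elsewhere; nothing to do.')


maybe_do_singleton_work()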
@@ -240,30 +349,34 @@ def scheduled_launch(job_id: int):
         yield
         return
 
-
-
-
-
-
-
+    assert starting_lock == starting_signal._lock, (  # type: ignore #pylint: disable=protected-access
+        'starting_lock and starting_signal must use the same lock')
+
+    while True:
+        async with starting_lock:
+            starting_count = len(starting)
+            if starting_count < controller_utils.LAUNCHES_PER_WORKER:
+                break
+            logger.info('Too many jobs starting, waiting for a slot')
+            await starting_signal.wait()
 
-
-
-
+    logger.info(f'Starting job {job_id}')
+
+    async with starting_lock:
+        starting.add(job_id)
+
+    await state.scheduler_set_launching_async(job_id)
 
     try:
         yield
-    except
-
-        # We should transition to ALIVE_BACKOFF instead of ALIVE.
-        with filelock.FileLock(controller_utils.get_resources_lock_path()):
-            state.scheduler_set_alive_backoff(job_id)
-        raise
+    except Exception as e:
+        raise e
     else:
-
-        state.scheduler_set_alive(job_id)
+        await state.scheduler_set_alive_async(job_id)
     finally:
-
+        async with starting_lock:
+            starting.remove(job_id)
+            starting_signal.notify()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
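scheduled_launch is now an async context manager that caps concurrent launches per worker using a shared `starting` set guarded by a lock, with an asyncio.Condition to wake waiters when a slot frees up. Here is a runnable sketch of the same pattern under simplified assumptions: MAX_STARTING stands in for controller_utils.LAUNCHES_PER_WORKER, a sleep stands in for the launch, and a single Condition owns its lock (which is why the real code asserts that the separately passed starting_lock and starting_signal share one lock).

# Sketch of the bounded-launch pattern used by scheduled_launch.
import asyncio
import contextlib
from typing import Set

MAX_STARTING = 2  # stand-in for controller_utils.LAUNCHES_PER_WORKER


@contextlib.asynccontextmanager
async def launch_slot(job_id: int, starting: Set[int],
                      cond: asyncio.Condition):
    async with cond:
        # Block until a launch slot is free, then claim it.
        await cond.wait_for(lambda: len(starting) < MAX_STARTING)
        starting.add(job_id)
    try:
        yield  # the actual launch happens here
    finally:
        async with cond:
            starting.remove(job_id)
            cond.notify()  # wake one waiter to re-check the predicate


async def launch(job_id: int, starting: Set[int], cond: asyncio.Condition):
    async with launch_slot(job_id, starting, cond):
        await asyncio.sleep(0.1)  # simulate the launch work


async def main():
    starting: Set[int] = set()
    cond = asyncio.Condition()
    await asyncio.gather(*(launch(i, starting, cond) for i in range(5)))


asyncio.run(main())  # at most 2 of the 5 launches overlap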
@@ -274,38 +387,23 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
 
     The job could be in any terminal ManagedJobStatus. However, once DONE, it
     should never transition back to another state.
+
+    This is only called by utils.update_managed_jobs_statuses which is sync.
     """
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
 
-
-    state.scheduler_set_done(job_id, idempotent)
-    maybe_schedule_next_jobs()
-
-
-def _set_alive_waiting(job_id: int) -> None:
-    """Should use wait_until_launch_okay() to transition to this state."""
-    with filelock.FileLock(controller_utils.get_resources_lock_path()):
-        state.scheduler_set_alive_waiting(job_id)
-    maybe_schedule_next_jobs()
+    state.scheduler_set_done(job_id, idempotent)
 
 
-def
-
-
-
-
-        return False
-
-    # Check if there are available workers in the pool
-    if pool is not None:
-        alive_jobs_in_pool = state.get_num_alive_jobs(pool)
-        if alive_jobs_in_pool >= len(serve_utils.get_ready_replicas(pool)):
-            logger.debug(f'No READY workers available in pool {pool}')
-            return False
+async def job_done_async(job_id: int, idempotent: bool = False):
+    """Async version of job_done."""
+    if idempotent and (await state.get_job_schedule_state_async(job_id)
+                       == state.ManagedJobScheduleState.DONE):
+        return
 
-
+    await state.scheduler_set_done_async(job_id, idempotent)
 
 
 if __name__ == '__main__':
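Both job_done and the new job_done_async guard the terminal transition with an idempotent early return, so repeated status sweeps cannot re-fire the transition once a job is DONE. A toy sketch of that guard, with an in-memory dict standing in for the jobs database and an explicit error making the non-idempotent path visible (the real scheduler just delegates to state.scheduler_set_done):

# Sketch of the idempotent DONE transition guard.
import enum
from typing import Dict


class ScheduleState(enum.Enum):
    ALIVE = 'ALIVE'
    DONE = 'DONE'


_STATES: Dict[int, ScheduleState] = {1: ScheduleState.ALIVE}


def job_done(job_id: int, idempotent: bool = False) -> None:
    if idempotent and _STATES.get(job_id) == ScheduleState.DONE:
        return  # already terminal; repeating the call is a no-op
    if _STATES.get(job_id) == ScheduleState.DONE:
        raise RuntimeError(f'Job {job_id} is already DONE')
    _STATES[job_id] = ScheduleState.DONE


job_done(1)
job_done(1, idempotent=True)  # safe to repeat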
@@ -337,4 +435,4 @@ if __name__ == '__main__':
        f' Default: {constants.DEFAULT_PRIORITY}.')
     args = parser.parse_args()
     submit_job(args.job_id, args.dag_yaml, args.user_yaml_path, args.env_file,
-               args.priority
+               args.priority)