skypilot-nightly 1.0.0.dev20250905 → 1.0.0.dev20251210 (py3-none-any.whl)
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in those public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/jobs/server/core.py
CHANGED
@@ -1,9 +1,13 @@
 """SDK functions for managed jobs."""
+import concurrent.futures
+import copy
+import ipaddress
 import os
 import pathlib
 import tempfile
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
+from urllib import parse as urlparse
 import uuid

 import colorama
@@ -17,16 +21,21 @@ from sky import provision as provision_lib
 from sky import sky_logging
 from sky import skypilot_config
 from sky import task as task_lib
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import cloud_vm_ray_backend
 from sky.catalog import common as service_catalog_common
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.metrics import utils as metrics_lib
 from sky.provision import common as provision_common
+from sky.schemas.api import responses
 from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.serve.server import impl
+from sky.server.requests import request_names
 from sky.skylet import constants as skylet_constants
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
@@ -42,11 +51,47 @@ from sky.utils import ux_utils
 from sky.workspaces import core as workspaces_core

 if typing.TYPE_CHECKING:
+    from google.protobuf import json_format
+
     import sky
-    from sky.
+    from sky.schemas.generated import managed_jobsv1_pb2
+else:
+    json_format = adaptors_common.LazyImport('google.protobuf.json_format')
+
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')

 logger = sky_logging.init_logger(__name__)

+_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
+    'job_id',
+    'task_id',
+    'workspace',
+    'job_name',
+    'task_name',
+    'resources',
+    'submitted_at',
+    'end_at',
+    'job_duration',
+    'recovery_count',
+    'status',
+    'pool',
+    'current_cluster_name',
+    'job_id_on_pool_cluster',
+    'start_at',
+    'infra',
+    'cloud',
+    'region',
+    'zone',
+    'cluster_resources',
+    'schedule_state',
+    'details',
+    'failure_reason',
+    'metadata',
+    'user_name',
+    'user_hash',
+]
+

 def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     """Upload files to the controller.
@@ -129,7 +174,8 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
             force_user_workspace=True),
         entrypoint=common_utils.get_current_command(),
         pool=pool,
-        pool_hash=pool_hash
+        pool_hash=pool_hash,
+        user_hash=common_utils.get_user_hash()))
     for task_id, task in enumerate(dag.tasks):
         resources_str = backend_utils.get_task_resources_str(
             task, is_managed_job=True)
@@ -188,10 +234,12 @@ def launch(

     dag_uuid = str(uuid.uuid4().hex[:4])
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
+
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
-    dag, mutated_user_config = admin_policy_utils.apply(
+    dag, mutated_user_config = admin_policy_utils.apply(
+        dag, request_name=request_names.AdminPolicyRequestName.JOBS_LAUNCH)
     dag.resolve_and_validate_volumes()
     if not dag.is_chain():
         with ux_utils.print_exception_no_traceback():
@@ -202,6 +250,21 @@ def launch(
     # pre-mount operations when submitting jobs.
     dag.pre_mount_volumes()

+    # If there is a local postgres db, when the api server tries launching on
+    # the remote jobs controller it will fail. therefore, we should remove this
+    # before sending the config to the jobs controller.
+    # TODO(luca) there are a lot of potential problems with postgres being sent
+    # to the jobs controller. for example if the postgres is whitelisted to
+    # only the API server, this will then break. the simple solution to that is
+    # telling the user to add the jobs controller to the postgres whitelist.
+    if not managed_job_utils.is_consolidation_mode():
+        db_path = mutated_user_config.get('db', None)
+        if db_path is not None:
+            parsed = urlparse.urlparse(db_path)
+            if ((parsed.hostname == 'localhost' or
+                 ipaddress.ip_address(parsed.hostname).is_loopback)):
+                mutated_user_config.pop('db', None)
+
     user_dag_str_user_specified = dag_utils.dump_chain_dag_to_yaml_str(
         dag, use_user_specified_yaml=True)

@@ -263,15 +326,13 @@ def launch(
     # Check whether cached jobs controller cluster is accessible
     cluster_name = (
         controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
-
-    if record is not None:
+    if global_user_state.cluster_with_name_exists(cluster_name):
         # there is a cached jobs controller cluster
         try:
             # TODO: do something with returned status?
             _, _ = backend_utils.refresh_cluster_status_handle(
                 cluster_name=cluster_name,
-                force_refresh_statuses=set(status_lib.ClusterStatus)
-                acquire_per_cluster_status_lock=False)
+                force_refresh_statuses=set(status_lib.ClusterStatus))
         except (exceptions.ClusterOwnerIdentityMismatchError,
                 exceptions.CloudUserIdentityError,
                 exceptions.ClusterStatusFetchingError) as e:
@@ -309,6 +370,7 @@ def launch(
     def _submit_one(
         consolidation_mode_job_id: Optional[int] = None,
         job_rank: Optional[int] = None,
+        num_jobs: Optional[int] = None,
     ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
         rank_suffix = '' if job_rank is None else f'-{job_rank}'
         remote_original_user_yaml_path = (
@@ -328,11 +390,16 @@ def launch(
         ) as original_user_yaml_path:
             original_user_yaml_path.write(user_dag_str_user_specified)
             original_user_yaml_path.flush()
-
+            # Copy tasks to avoid race conditions when multiple threads modify
+            # the same dag object concurrently. Each thread needs its own copy.
+            dag_copy = copy.deepcopy(dag)
+            for task_ in dag_copy.tasks:
                 if job_rank is not None:
                     task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
+                if num_jobs is not None:
+                    task_.update_envs({'SKYPILOT_NUM_JOBS': str(num_jobs)})

-            dag_utils.dump_chain_dag_to_yaml(
+            dag_utils.dump_chain_dag_to_yaml(dag_copy, f.name)

             vars_to_fill = {
                 'remote_original_user_yaml_path':
@@ -351,6 +418,8 @@ def launch(
                 'priority': priority,
                 'consolidation_mode_job_id': consolidation_mode_job_id,
                 'pool': pool,
+                'job_controller_indicator_file':
+                    managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE,
                 **controller_utils.shared_controller_vars_to_fill(
                     controller,
                     remote_user_config_path=remote_user_config_path,
@@ -363,7 +432,8 @@ def launch(

             yaml_path = os.path.join(
                 managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
-                f'{name}-{dag_uuid}-{consolidation_mode_job_id}.yaml'
+                f'{name}-{dag_uuid}-{consolidation_mode_job_id}-{job_rank}.yaml'
+            )
             common_utils.fill_template(
                 managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
                 vars_to_fill,
@@ -371,16 +441,19 @@ def launch(
             controller_task = task_lib.Task.from_yaml(yaml_path)
             controller_task.set_resources(controller_resources)

-            controller_task.managed_job_dag =
+            controller_task.managed_job_dag = dag_copy
             # pylint: disable=protected-access
             controller_task._metadata = metadata

             job_identity = ''
             if job_rank is not None:
                 job_identity = f' (rank: {job_rank})'
-
-
-
+            job_controller_postfix = (' from jobs controller' if
+                                      consolidation_mode_job_id is None else '')
+            logger.info(
+                f'{colorama.Fore.YELLOW}'
+                f'Launching managed job {dag.name!r}{job_identity}'
+                f'{job_controller_postfix}...{colorama.Style.RESET_ALL}')

             # Launch with the api server's user hash, so that sky status does
             # not show the owner of the controller as whatever user launched
@@ -395,19 +468,24 @@ def launch(
             # intermediate bucket and newly created bucket should be in
             # workspace A.
             if consolidation_mode_job_id is None:
-                return execution.launch(
-
-
-
-
-
+                return execution.launch(
+                    task=controller_task,
+                    cluster_name=controller_name,
+                    stream_logs=stream_logs,
+                    retry_until_up=True,
+                    fast=True,
+                    _request_name=request_names.AdminPolicyRequestName.
+                    JOBS_LAUNCH_CONTROLLER,
+                    _disable_controller_check=True)
             # Manually launch the scheduler in consolidation mode.
             local_handle = backend_utils.is_controller_accessible(
                 controller=controller, stopped_message='')
             backend = backend_utils.get_backend_from_handle(
                 local_handle)
             assert isinstance(backend, backends.CloudVmRayBackend)
-
+            # Suppress file mount logs when submitting multiple jobs.
+            should_silence = num_jobs is not None and num_jobs > 1
+            with sky_logging.silent(should_silence):
                 backend.sync_file_mounts(
                     handle=local_handle,
                     all_file_mounts=controller_task.file_mounts,
@@ -423,12 +501,16 @@ def launch(
                     for k, v in controller_task.envs.items()
                 ]
                 run_script = '\n'.join(env_cmds + [run_script])
-
-
-
-
-
-                backend.run_on_head(local_handle,
+                log_dir = os.path.join(skylet_constants.SKY_LOGS_DIRECTORY,
+                                       'managed_jobs')
+                os.makedirs(log_dir, exist_ok=True)
+                log_path = os.path.join(
+                    log_dir, f'submit-job-{consolidation_mode_job_id}.log')
+                backend.run_on_head(local_handle,
+                                    run_script,
+                                    log_path=log_path)
+                ux_utils.starting_message(
+                    f'Job submitted, ID: {consolidation_mode_job_id}')
             return consolidation_mode_job_id, local_handle

     if pool is None:
@@ -437,15 +519,49 @@ def launch(
         assert len(consolidation_mode_job_ids) == 1
         return _submit_one(consolidation_mode_job_ids[0])

-    ids = []
-    all_handle = None
-
-
+    ids: List[int] = []
+    all_handle: Optional[backends.ResourceHandle] = None
+
+    if num_jobs == 1:
+        job_id = (consolidation_mode_job_ids[0]
                   if consolidation_mode_job_ids is not None else None)
-        jid, handle = _submit_one(job_id,
+        jid, handle = _submit_one(job_id, 0, num_jobs=num_jobs)
         assert jid is not None, (job_id, handle)
         ids.append(jid)
         all_handle = handle
+    else:
+        # Submit jobs in parallel using ThreadPoolExecutor
+        with concurrent.futures.ThreadPoolExecutor(
+                max_workers=min(num_jobs,
+                                os.cpu_count() or 1)) as executor:
+            # Submit jobs concurrently
+            future_to_rank = {}
+            for job_rank in range(num_jobs):
+                job_id = (consolidation_mode_job_ids[job_rank]
+                          if consolidation_mode_job_ids is not None else None)
+                future = executor.submit(_submit_one, job_id, job_rank,
+                                         num_jobs)
+                future_to_rank[future] = job_rank
+
+            # Collect results in order of job_rank to maintain consistent order.
+            results: List[Optional[Tuple[
+                int, Optional[backends.ResourceHandle]]]] = [None] * num_jobs
+            for future in concurrent.futures.as_completed(future_to_rank):
+                job_rank = future_to_rank[future]
+                try:
+                    jid, handle = future.result()
+                    assert jid is not None, (job_id, handle)
+                    results[job_rank] = (jid, handle)
+                    all_handle = handle  # Keep the last handle.
+                except Exception as e:
+                    logger.error(f'Error launching job {job_rank}: {e}')
+                    raise e
+
+            # Extract job IDs in order
+            for res in results:
+                if res is not None:
+                    ids.append(res[0])
+
     return ids, all_handle


@@ -498,7 +614,8 @@ def queue_from_kubernetes_pod(
         'kubernetes', cluster_info)[0]

     code = managed_job_utils.ManagedJobCodeGen.get_job_table(
-        skip_finished=skip_finished
+        skip_finished=skip_finished,
+        fields=_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -575,8 +692,49 @@ def _maybe_restart_controller(
     return handle


+# For backwards compatibility
+# TODO(hailong): Remove before 0.12.0.
+@usage_lib.entrypoint
+def queue(refresh: bool,
+          skip_finished: bool = False,
+          all_users: bool = False,
+          job_ids: Optional[List[int]] = None) -> List[Dict[str, Any]]:
+    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
+    """Gets statuses of managed jobs.
+
+    Please refer to sky.cli.job_queue for documentation.
+
+    Returns:
+        [
+            {
+                'job_id': int,
+                'job_name': str,
+                'resources': str,
+                'submitted_at': (float) timestamp of submission,
+                'end_at': (float) timestamp of end,
+                'job_duration': (float) duration in seconds,
+                'recovery_count': (int) Number of retries,
+                'status': (sky.jobs.ManagedJobStatus) of the job,
+                'cluster_resources': (str) resources of the cluster,
+                'region': (str) region of the cluster,
+                'user_name': (Optional[str]) job creator's user name,
+                'user_hash': (str) job creator's user hash,
+                'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
+                'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
+            }
+        ]
+    Raises:
+        sky.exceptions.ClusterNotUpError: the jobs controller is not up or
+            does not exist.
+        RuntimeError: if failed to get the managed jobs with ssh.
+    """
+    jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids)
+
+    return jobs
+
+
 @usage_lib.entrypoint
-def
+def queue_v2_api(
     refresh: bool,
     skip_finished: bool = False,
     all_users: bool = False,
@@ -588,9 +746,34 @@ def queue(
     page: Optional[int] = None,
     limit: Optional[int] = None,
     statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
+) -> Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]:
+    """Gets statuses of managed jobs and parse the
+    jobs to responses.ManagedJobRecord."""
+    jobs, total, status_counts, total_no_filter = queue_v2(
+        refresh, skip_finished, all_users, job_ids, user_match, workspace_match,
+        name_match, pool_match, page, limit, statuses, fields)
+    return [responses.ManagedJobRecord(**job) for job in jobs
+           ], total, status_counts, total_no_filter
+
+
+@metrics_lib.time_me
+def queue_v2(
+    refresh: bool,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    job_ids: Optional[List[int]] = None,
+    user_match: Optional[str] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
 ) -> Tuple[List[Dict[str, Any]], int, Dict[str, int], int]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Gets statuses of managed jobs.
+    """Gets statuses of managed jobs with filtering.

     Please refer to sky.cli.job_queue for documentation.

@@ -633,20 +816,23 @@ def queue(
     if page is not None:
         raise ValueError('Limit must be specified when page is specified')

-
-
-
-
-
+    with metrics_lib.time_it('jobs.queue.restart_controller', group='jobs'):
+        handle = _maybe_restart_controller(refresh,
+                                           stopped_message='No in-progress '
+                                           'managed jobs.',
+                                           spinner_message='Checking '
+                                           'managed jobs')
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)

     user_hashes: Optional[List[Optional[str]]] = None
+    show_jobs_without_user_hash = False
     if not all_users:
         user_hashes = [common_utils.get_user_hash()]
         # For backwards compatibility, we show jobs that do not have a
         # user_hash. TODO(cooperc): Remove before 0.12.0.
         user_hashes.append(None)
+        show_jobs_without_user_hash = True
     elif user_match is not None:
         users = global_user_state.get_user_by_name_match(user_match)
         if not users:
@@ -654,70 +840,109 @@ def queue(
         user_hashes = [user.id for user in users]

     accessible_workspaces = list(workspaces_core.get_workspaces().keys())
-
-
-
-
-
-
-
-
-
+
+    if handle.is_grpc_enabled_with_flag:
+        try:
+            request = managed_jobsv1_pb2.GetJobTableRequest(
+                skip_finished=skip_finished,
+                accessible_workspaces=(managed_jobsv1_pb2.Workspaces(
+                    workspaces=accessible_workspaces)),
+                job_ids=managed_jobsv1_pb2.JobIds(
+                    ids=job_ids) if job_ids is not None else None,
+                workspace_match=workspace_match,
+                name_match=name_match,
+                pool_match=pool_match,
+                page=page,
+                limit=limit,
+                # Remove None from user_hashes, as the gRPC server uses the
+                # show_jobs_without_user_hash flag instead.
+                user_hashes=managed_jobsv1_pb2.UserHashes(hashes=[
+                    user_hash for user_hash in user_hashes
+                    if user_hash is not None
+                ]) if user_hashes is not None else None,
+                statuses=managed_jobsv1_pb2.Statuses(
+                    statuses=statuses) if statuses is not None else None,
+                fields=managed_jobsv1_pb2.Fields(
+                    fields=fields) if fields is not None else None,
+                show_jobs_without_user_hash=show_jobs_without_user_hash,
+            )
+            response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel()).get_managed_job_table(request))
+            jobs = managed_job_utils.decode_managed_job_protos(response.jobs)
+            return jobs, response.total, dict(
+                response.status_counts), response.total_no_filter
+        except exceptions.SkyletMethodNotImplementedError:
+            pass
+
+    with metrics_lib.time_it('jobs.queue.generate_code', group='jobs'):
+        code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+            skip_finished, accessible_workspaces, job_ids, workspace_match,
+            name_match, pool_match, page, limit, user_hashes, statuses, fields)
+    with metrics_lib.time_it('jobs.queue.run_on_head', group='jobs'):
+        returncode, job_table_payload, stderr = backend.run_on_head(
+            handle,
+            code,
+            require_outputs=True,
+            stream_logs=False,
+            separate_stderr=True)

     if returncode != 0:
         logger.error(job_table_payload + stderr)
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
                            f'{returncode}.\n{job_table_payload + stderr}')

-    (jobs,
-
+    with metrics_lib.time_it('jobs.queue.load_job_queue', group='jobs'):
+        (jobs, total, result_type, total_no_filter, status_counts
+        ) = managed_job_utils.load_managed_job_queue(job_table_payload)

     if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
         return jobs, total, status_counts, total_no_filter

     # Backward compatibility for old jobs controller without filtering
     # TODO(hailong): remove this after 0.12.0
-
+    with metrics_lib.time_it('jobs.queue.filter_and_process', group='jobs'):
+        if not all_users:

-
-
-
-
-
-
-
+            def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
+                user_hash = job.get('user_hash', None)
+                if user_hash is None:
+                    # For backwards compatibility, we show jobs that do not have
+                    # a user_hash. TODO(cooperc): Remove before 0.12.0.
+                    return True
+                return user_hash == common_utils.get_user_hash()

-
+            jobs = list(filter(user_hash_matches_or_missing, jobs))

-    jobs = list(
-        filter(
-            lambda job: job.get('workspace', skylet_constants.
-                SKYPILOT_DEFAULT_WORKSPACE) in
-            accessible_workspaces, jobs))
-
-    if skip_finished:
-        # Filter out the finished jobs. If a multi-task job is partially
-        # finished, we will include all its tasks.
-        non_finished_tasks = list(
-            filter(lambda job: not job['status'].is_terminal(), jobs))
-        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
             jobs = list(
-            filter(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                filter(
+                    lambda job: job.get('workspace', skylet_constants.
+                                SKYPILOT_DEFAULT_WORKSPACE) in
+                    accessible_workspaces, jobs))
+
+        if skip_finished:
+            # Filter out the finished jobs. If a multi-task job is partially
+            # finished, we will include all its tasks.
+            non_finished_tasks = list(
+                filter(lambda job: not job['status'].is_terminal(), jobs))
+            non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+            jobs = list(
+                filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+
+        if job_ids:
+            jobs = [job for job in jobs if job['job_id'] in job_ids]
+
+        filtered_jobs, total, status_counts = managed_job_utils.filter_jobs(
+            jobs,
+            workspace_match,
+            name_match,
+            pool_match,
+            page=page,
+            limit=limit,
+            user_match=user_match,
+            enable_user_match=True,
+            statuses=statuses,
+        )
     return filtered_jobs, total, status_counts, total_no_filter


@@ -760,33 +985,60 @@ def cancel(name: Optional[str] = None,
             'Can only specify one of JOB_IDS, name, pool, or all/'
             f'all_users. Provided {" ".join(arguments)!r}.')

+    job_ids = None if (all_users or all) else job_ids
+
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    use_legacy = not handle.is_grpc_enabled_with_flag
+
+    if not use_legacy:
+        current_workspace = skypilot_config.get_active_workspace()
+        try:
+            request = managed_jobsv1_pb2.CancelJobsRequest(
+                current_workspace=current_workspace)
+
+            if all_users or all or job_ids:
+                request.all_users = all_users
+                if all:
+                    request.user_hash = common_utils.get_user_hash()
+                if job_ids is not None:
+                    request.job_ids.CopyFrom(
+                        managed_jobsv1_pb2.JobIds(ids=job_ids))
+            elif name is not None:
+                request.job_name = name
+            else:
+                assert pool is not None, (job_ids, name, pool, all)
+                request.pool_name = pool
+
+            response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel()).cancel_managed_jobs(request))
+            stdout = response.message
+        except exceptions.SkyletMethodNotImplementedError:
+            use_legacy = True
+
+    if use_legacy:
+        if all_users or all or job_ids:
+            code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
+                job_ids, all_users=all_users)
+        elif name is not None:
+            code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(
+                name)
+        else:
+            assert pool is not None, (job_ids, name, pool, all)
+            code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_pool(
+                pool)
+        # The stderr is redirected to stdout
+        returncode, stdout, stderr = backend.run_on_head(
+            handle, code, require_outputs=True, stream_logs=False)
+        try:
+            subprocess_utils.handle_returncode(
+                returncode, code, 'Failed to cancel managed job',
+                stdout + stderr)
+        except exceptions.CommandError as e:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(e.error_msg) from e

     logger.info(stdout)
     if 'Multiple jobs found with name' in stdout:
@@ -901,9 +1153,10 @@ def pool_apply(
     task: 'sky.Task',
     pool_name: str,
     mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
+    workers: Optional[int] = None,
 ) -> None:
     """Apply a config to a pool."""
-    return impl.apply(task, pool_name, mode, pool=True)
+    return impl.apply(task, workers, pool_name, mode, pool=True)


 @usage_lib.entrypoint