skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/utils.py
CHANGED
|
@@ -4,21 +4,23 @@ NOTE: whenever an API change is made in this file, we need to bump the
|
|
|
4
4
|
jobs.constants.MANAGED_JOBS_VERSION and handle the API change in the
|
|
5
5
|
ManagedJobCodeGen.
|
|
6
6
|
"""
|
|
7
|
+
import asyncio
|
|
7
8
|
import collections
|
|
8
|
-
import datetime
|
|
9
|
+
from datetime import datetime
|
|
9
10
|
import enum
|
|
10
11
|
import os
|
|
11
12
|
import pathlib
|
|
13
|
+
import re
|
|
12
14
|
import shlex
|
|
13
15
|
import textwrap
|
|
14
16
|
import time
|
|
15
17
|
import traceback
|
|
16
18
|
import typing
|
|
17
|
-
from typing import Any, Deque, Dict,
|
|
19
|
+
from typing import (Any, Deque, Dict, Iterable, List, Literal, Optional, Set,
|
|
20
|
+
TextIO, Tuple, Union)
|
|
18
21
|
|
|
19
22
|
import colorama
|
|
20
23
|
import filelock
|
|
21
|
-
from typing_extensions import Literal
|
|
22
24
|
|
|
23
25
|
from sky import backends
|
|
24
26
|
from sky import exceptions
|
|
@@ -27,16 +29,18 @@ from sky import sky_logging
|
|
|
27
29
|
from sky import skypilot_config
|
|
28
30
|
from sky.adaptors import common as adaptors_common
|
|
29
31
|
from sky.backends import backend_utils
|
|
32
|
+
from sky.backends import cloud_vm_ray_backend
|
|
30
33
|
from sky.jobs import constants as managed_job_constants
|
|
31
34
|
from sky.jobs import scheduler
|
|
32
35
|
from sky.jobs import state as managed_job_state
|
|
36
|
+
from sky.schemas.api import responses
|
|
33
37
|
from sky.skylet import constants
|
|
34
38
|
from sky.skylet import job_lib
|
|
35
39
|
from sky.skylet import log_lib
|
|
36
40
|
from sky.usage import usage_lib
|
|
37
41
|
from sky.utils import annotations
|
|
38
|
-
from sky.utils import command_runner
|
|
39
42
|
from sky.utils import common_utils
|
|
43
|
+
from sky.utils import context_utils
|
|
40
44
|
from sky.utils import controller_utils
|
|
41
45
|
from sky.utils import infra_utils
|
|
42
46
|
from sky.utils import log_utils
|
|
@@ -47,18 +51,29 @@ from sky.utils import subprocess_utils
|
|
|
47
51
|
from sky.utils import ux_utils
|
|
48
52
|
|
|
49
53
|
if typing.TYPE_CHECKING:
|
|
54
|
+
from google.protobuf import descriptor
|
|
55
|
+
from google.protobuf import json_format
|
|
56
|
+
import grpc
|
|
50
57
|
import psutil
|
|
51
58
|
|
|
52
59
|
import sky
|
|
53
60
|
from sky import dag as dag_lib
|
|
61
|
+
from sky.schemas.generated import jobsv1_pb2
|
|
62
|
+
from sky.schemas.generated import managed_jobsv1_pb2
|
|
54
63
|
else:
|
|
64
|
+
json_format = adaptors_common.LazyImport('google.protobuf.json_format')
|
|
65
|
+
descriptor = adaptors_common.LazyImport('google.protobuf.descriptor')
|
|
55
66
|
psutil = adaptors_common.LazyImport('psutil')
|
|
67
|
+
grpc = adaptors_common.LazyImport('grpc')
|
|
68
|
+
jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
|
|
69
|
+
managed_jobsv1_pb2 = adaptors_common.LazyImport(
|
|
70
|
+
'sky.schemas.generated.managed_jobsv1_pb2')
|
|
56
71
|
|
|
57
72
|
logger = sky_logging.init_logger(__name__)
|
|
58
73
|
|
|
59
|
-
SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
|
|
60
74
|
# Controller checks its job's status every this many seconds.
|
|
61
|
-
|
|
75
|
+
# This is a tradeoff between the latency and the resource usage.
|
|
76
|
+
JOB_STATUS_CHECK_GAP_SECONDS = 15
|
|
62
77
|
|
|
63
78
|
# Controller checks if its job has started every this many seconds.
|
|
64
79
|
JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
|
|
@@ -67,6 +82,7 @@ _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
|
|
|
67
82
|
|
|
68
83
|
_JOB_STATUS_FETCH_MAX_RETRIES = 3
|
|
69
84
|
_JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
|
|
85
|
+
_JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30
|
|
70
86
|
|
|
71
87
|
_JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
|
|
72
88
|
'Waiting for task to start[/]'
|
|
@@ -82,7 +98,29 @@ _JOB_CANCELLED_MESSAGE = (
|
|
|
82
98
|
# blocking for a long time. This should be significantly longer than the
|
|
83
99
|
# JOB_STATUS_CHECK_GAP_SECONDS to avoid timing out before the controller can
|
|
84
100
|
# update the state.
|
|
85
|
-
_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS =
|
|
101
|
+
_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
|
|
102
|
+
|
|
103
|
+
# After enabling consolidation mode, we need to restart the API server to get
|
|
104
|
+
# the jobs refresh deamon and correct number of executors. We use this file to
|
|
105
|
+
# indicate that the API server has been restarted after enabling consolidation
|
|
106
|
+
# mode.
|
|
107
|
+
_JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE = (
|
|
108
|
+
'~/.sky/.jobs_controller_consolidation_reloaded_signal')
|
|
109
|
+
|
|
110
|
+
# The response fields for managed jobs that require cluster handle
|
|
111
|
+
_CLUSTER_HANDLE_FIELDS = [
|
|
112
|
+
'cluster_resources',
|
|
113
|
+
'cluster_resources_full',
|
|
114
|
+
'cloud',
|
|
115
|
+
'region',
|
|
116
|
+
'zone',
|
|
117
|
+
'infra',
|
|
118
|
+
'accelerators',
|
|
119
|
+
]
|
|
120
|
+
|
|
121
|
+
# The response fields for managed jobs that are not stored in the database
|
|
122
|
+
# These fields will be mapped to the DB fields in the `_update_fields`.
|
|
123
|
+
_NON_DB_FIELDS = _CLUSTER_HANDLE_FIELDS + ['user_yaml', 'user_name', 'details']
|
|
86
124
|
|
|
87
125
|
|
|
88
126
|
class ManagedJobQueueResultType(enum.Enum):
|
|
@@ -99,7 +137,10 @@ class UserSignal(enum.Enum):
|
|
|
99
137
|
|
|
100
138
|
|
|
101
139
|
# ====== internal functions ======
|
|
102
|
-
def terminate_cluster(
|
|
140
|
+
def terminate_cluster(
|
|
141
|
+
cluster_name: str,
|
|
142
|
+
max_retry: int = 6,
|
|
143
|
+
) -> None:
|
|
103
144
|
"""Terminate the cluster."""
|
|
104
145
|
from sky import core # pylint: disable=import-outside-toplevel
|
|
105
146
|
retry_cnt = 0
|
|
@@ -144,32 +185,28 @@ def _validate_consolidation_mode_config(
|
|
|
144
185
|
if current_is_consolidation_mode:
|
|
145
186
|
controller_cn = (
|
|
146
187
|
controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
|
|
147
|
-
if global_user_state.
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
'terminate the controller cluster first.'
|
|
154
|
-
f'{colorama.Style.RESET_ALL}')
|
|
188
|
+
if global_user_state.cluster_with_name_exists(controller_cn):
|
|
189
|
+
logger.warning(
|
|
190
|
+
f'{colorama.Fore.RED}Consolidation mode for jobs is enabled, '
|
|
191
|
+
f'but the controller cluster {controller_cn} is still running. '
|
|
192
|
+
'Please terminate the controller cluster first.'
|
|
193
|
+
f'{colorama.Style.RESET_ALL}')
|
|
155
194
|
else:
|
|
156
|
-
|
|
157
|
-
if
|
|
195
|
+
total_jobs = managed_job_state.get_managed_jobs_total()
|
|
196
|
+
if total_jobs > 0:
|
|
158
197
|
nonterminal_jobs = (
|
|
159
198
|
managed_job_state.get_nonterminal_job_ids_by_name(
|
|
160
|
-
None, all_users=True))
|
|
199
|
+
None, None, all_users=True))
|
|
161
200
|
if nonterminal_jobs:
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
'running. Please terminate those jobs '
|
|
168
|
-
f'first.{colorama.Style.RESET_ALL}')
|
|
201
|
+
logger.warning(
|
|
202
|
+
f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
|
|
203
|
+
f'but there are still {len(nonterminal_jobs)} managed jobs '
|
|
204
|
+
'running. Please terminate those jobs first.'
|
|
205
|
+
f'{colorama.Style.RESET_ALL}')
|
|
169
206
|
else:
|
|
170
207
|
logger.warning(
|
|
171
208
|
f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
|
|
172
|
-
f'but there are {
|
|
209
|
+
f'but there are {total_jobs} jobs from previous '
|
|
173
210
|
'consolidation mode. Reset the `jobs.controller.'
|
|
174
211
|
'consolidation_mode` to `true` and run `sky jobs queue` '
|
|
175
212
|
'to see those jobs. Switching to normal mode will '
|
|
@@ -181,75 +218,127 @@ def _validate_consolidation_mode_config(
|
|
|
181
218
|
# API Server. Under the hood, we submit the job monitoring logic as processes
|
|
182
219
|
# directly in the API Server.
|
|
183
220
|
# Use LRU Cache so that the check is only done once.
|
|
184
|
-
@annotations.lru_cache(scope='request', maxsize=
|
|
185
|
-
def is_consolidation_mode() -> bool:
|
|
186
|
-
|
|
221
|
+
@annotations.lru_cache(scope='request', maxsize=2)
|
|
222
|
+
def is_consolidation_mode(on_api_restart: bool = False) -> bool:
|
|
223
|
+
if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
|
|
224
|
+
return True
|
|
225
|
+
|
|
226
|
+
config_consolidation_mode = skypilot_config.get_nested(
|
|
187
227
|
('jobs', 'controller', 'consolidation_mode'), default_value=False)
|
|
228
|
+
|
|
229
|
+
signal_file = pathlib.Path(
|
|
230
|
+
_JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE).expanduser()
|
|
231
|
+
|
|
232
|
+
if on_api_restart:
|
|
233
|
+
if config_consolidation_mode:
|
|
234
|
+
signal_file.touch()
|
|
235
|
+
else:
|
|
236
|
+
restart_signal_file_exists = signal_file.exists()
|
|
237
|
+
if not restart_signal_file_exists:
|
|
238
|
+
if config_consolidation_mode:
|
|
239
|
+
logger.warning(f'{colorama.Fore.YELLOW}Consolidation mode for '
|
|
240
|
+
'managed jobs is enabled in the server config, '
|
|
241
|
+
'but the API server has not been restarted yet. '
|
|
242
|
+
'Please restart the API server to enable it.'
|
|
243
|
+
f'{colorama.Style.RESET_ALL}')
|
|
244
|
+
return False
|
|
245
|
+
elif not config_consolidation_mode:
|
|
246
|
+
# Cleanup the signal file if the consolidation mode is disabled in
|
|
247
|
+
# the config. This allow the user to disable the consolidation mode
|
|
248
|
+
# without restarting the API server.
|
|
249
|
+
signal_file.unlink()
|
|
250
|
+
|
|
188
251
|
# We should only do this check on API server, as the controller will not
|
|
189
252
|
# have related config and will always seemingly disabled for consolidation
|
|
190
253
|
# mode. Check #6611 for more details.
|
|
191
254
|
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
|
192
|
-
_validate_consolidation_mode_config(
|
|
193
|
-
return
|
|
255
|
+
_validate_consolidation_mode_config(config_consolidation_mode)
|
|
256
|
+
return config_consolidation_mode
|
|
194
257
|
|
|
195
258
|
|
|
196
|
-
def ha_recovery_for_consolidation_mode():
|
|
197
|
-
"""Recovery logic for
|
|
259
|
+
def ha_recovery_for_consolidation_mode() -> None:
|
|
260
|
+
"""Recovery logic for consolidation mode.
|
|
261
|
+
|
|
262
|
+
This should only be called from the managed-job-status-refresh-daemon, due
|
|
263
|
+
so that we have correct ordering recovery -> controller start -> job status
|
|
264
|
+
updates. This also should ensure correct operation during a rolling update.
|
|
265
|
+
"""
|
|
198
266
|
# No setup recovery is needed in consolidation mode, as the API server
|
|
199
267
|
# already has all runtime installed. Directly start jobs recovery here.
|
|
200
268
|
# Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
|
|
201
|
-
|
|
269
|
+
scheduler.maybe_start_controllers()
|
|
202
270
|
with open(constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format('jobs_'),
|
|
203
|
-
'
|
|
271
|
+
'a',
|
|
204
272
|
encoding='utf-8') as f:
|
|
205
273
|
start = time.time()
|
|
206
|
-
f.write(f'Starting HA recovery at {datetime.
|
|
207
|
-
|
|
274
|
+
f.write(f'Starting HA recovery at {datetime.now()}\n')
|
|
275
|
+
jobs, _ = managed_job_state.get_managed_jobs_with_filters(fields=[
|
|
276
|
+
'job_id', 'controller_pid', 'controller_pid_started_at',
|
|
277
|
+
'schedule_state', 'status'
|
|
278
|
+
])
|
|
279
|
+
for job in jobs:
|
|
208
280
|
job_id = job['job_id']
|
|
209
281
|
controller_pid = job['controller_pid']
|
|
282
|
+
controller_pid_started_at = job.get('controller_pid_started_at')
|
|
210
283
|
|
|
211
284
|
# In consolidation mode, it is possible that only the API server
|
|
212
285
|
# process is restarted, and the controller process is not. In such
|
|
213
286
|
# case, we don't need to do anything and the controller process will
|
|
214
|
-
# just keep running.
|
|
287
|
+
# just keep running. However, in most cases, the controller process
|
|
288
|
+
# will also be stopped - either by a pod restart in k8s API server,
|
|
289
|
+
# or by `sky api stop`, which will stop controllers.
|
|
290
|
+
# TODO(cooperc): Make sure we cannot have a controller process
|
|
291
|
+
# running across API server restarts for consistency.
|
|
215
292
|
if controller_pid is not None:
|
|
216
293
|
try:
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
294
|
+
# Note: We provide the legacy job id to the
|
|
295
|
+
# controller_process_alive just in case, but we shouldn't
|
|
296
|
+
# have a running legacy job controller process at this point
|
|
297
|
+
if controller_process_alive(
|
|
298
|
+
managed_job_state.ControllerPidRecord(
|
|
299
|
+
pid=controller_pid,
|
|
300
|
+
started_at=controller_pid_started_at), job_id):
|
|
301
|
+
message = (f'Controller pid {controller_pid} for '
|
|
302
|
+
f'job {job_id} is still running. '
|
|
303
|
+
'Skipping recovery.\n')
|
|
304
|
+
logger.debug(message)
|
|
305
|
+
f.write(message)
|
|
221
306
|
continue
|
|
222
307
|
except Exception: # pylint: disable=broad-except
|
|
223
308
|
# _controller_process_alive may raise if psutil fails; we
|
|
224
309
|
# should not crash the recovery logic because of this.
|
|
225
|
-
|
|
226
|
-
|
|
310
|
+
message = ('Error checking controller pid '
|
|
311
|
+
f'{controller_pid} for job {job_id}\n')
|
|
312
|
+
logger.warning(message, exc_info=True)
|
|
313
|
+
f.write(message)
|
|
227
314
|
|
|
315
|
+
# Controller process is not set or not alive.
|
|
228
316
|
if job['schedule_state'] not in [
|
|
229
317
|
managed_job_state.ManagedJobScheduleState.DONE,
|
|
230
|
-
managed_job_state.ManagedJobScheduleState.WAITING
|
|
318
|
+
managed_job_state.ManagedJobScheduleState.WAITING,
|
|
319
|
+
# INACTIVE job may be mid-submission, don't set to WAITING.
|
|
320
|
+
managed_job_state.ManagedJobScheduleState.INACTIVE,
|
|
231
321
|
]:
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
runner.run(script)
|
|
239
|
-
f.write(f'Job {job_id} completed recovery at '
|
|
240
|
-
f'{datetime.datetime.now()}\n')
|
|
241
|
-
f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
|
|
322
|
+
managed_job_state.reset_job_for_recovery(job_id)
|
|
323
|
+
message = (f'Job {job_id} completed recovery at '
|
|
324
|
+
f'{datetime.now()}\n')
|
|
325
|
+
logger.info(message)
|
|
326
|
+
f.write(message)
|
|
327
|
+
f.write(f'HA recovery completed at {datetime.now()}\n')
|
|
242
328
|
f.write(f'Total recovery time: {time.time() - start} seconds\n')
|
|
243
329
|
|
|
244
330
|
|
|
245
|
-
def get_job_status(
|
|
246
|
-
|
|
331
|
+
async def get_job_status(
|
|
332
|
+
backend: 'backends.CloudVmRayBackend', cluster_name: str,
|
|
333
|
+
job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
|
|
247
334
|
"""Check the status of the job running on a managed job cluster.
|
|
248
335
|
|
|
249
336
|
It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
|
|
250
337
|
FAILED_SETUP or CANCELLED.
|
|
251
338
|
"""
|
|
252
|
-
|
|
339
|
+
# TODO(luca) make this async
|
|
340
|
+
handle = await context_utils.to_thread(
|
|
341
|
+
global_user_state.get_handle_from_cluster_name, cluster_name)
|
|
253
342
|
if handle is None:
|
|
254
343
|
# This can happen if the cluster was preempted and background status
|
|
255
344
|
# refresh already noticed and cleaned it up.
|
|
@@ -260,9 +349,12 @@ def get_job_status(backend: 'backends.CloudVmRayBackend', cluster_name: str,
|
|
|
260
349
|
for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
|
|
261
350
|
try:
|
|
262
351
|
logger.info('=== Checking the job status... ===')
|
|
263
|
-
statuses =
|
|
264
|
-
|
|
265
|
-
|
|
352
|
+
statuses = await asyncio.wait_for(
|
|
353
|
+
context_utils.to_thread(backend.get_job_status,
|
|
354
|
+
handle,
|
|
355
|
+
job_ids=job_ids,
|
|
356
|
+
stream_logs=False),
|
|
357
|
+
timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
|
|
266
358
|
status = list(statuses.values())[0]
|
|
267
359
|
if status is None:
|
|
268
360
|
logger.info('No job found.')
|
|
@@ -270,29 +362,129 @@ def get_job_status(backend: 'backends.CloudVmRayBackend', cluster_name: str,
|
|
|
270
362
|
logger.info(f'Job status: {status}')
|
|
271
363
|
logger.info('=' * 34)
|
|
272
364
|
return status
|
|
273
|
-
except exceptions.CommandError
|
|
365
|
+
except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
|
|
366
|
+
ValueError, TypeError, asyncio.TimeoutError) as e:
|
|
367
|
+
# Note: Each of these exceptions has some additional conditions to
|
|
368
|
+
# limit how we handle it and whether or not we catch it.
|
|
274
369
|
# Retry on k8s transient network errors. This is useful when using
|
|
275
370
|
# coreweave which may have transient network issue sometimes.
|
|
276
|
-
|
|
277
|
-
|
|
371
|
+
is_transient_error = False
|
|
372
|
+
detailed_reason = None
|
|
373
|
+
if isinstance(e, exceptions.CommandError):
|
|
374
|
+
detailed_reason = e.detailed_reason
|
|
375
|
+
if (detailed_reason is not None and
|
|
376
|
+
_JOB_K8S_TRANSIENT_NW_MSG in detailed_reason):
|
|
377
|
+
is_transient_error = True
|
|
378
|
+
elif isinstance(e, grpc.RpcError):
|
|
379
|
+
detailed_reason = e.details()
|
|
380
|
+
if e.code() in [
|
|
381
|
+
grpc.StatusCode.UNAVAILABLE,
|
|
382
|
+
grpc.StatusCode.DEADLINE_EXCEEDED
|
|
383
|
+
]:
|
|
384
|
+
is_transient_error = True
|
|
385
|
+
elif isinstance(e, grpc.FutureTimeoutError):
|
|
386
|
+
detailed_reason = 'Timeout'
|
|
387
|
+
elif isinstance(e, asyncio.TimeoutError):
|
|
388
|
+
detailed_reason = ('Job status check timed out after '
|
|
389
|
+
f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
|
|
390
|
+
# TODO(cooperc): Gracefully handle these exceptions in the backend.
|
|
391
|
+
elif isinstance(e, ValueError):
|
|
392
|
+
# If the cluster yaml is deleted in the middle of getting the
|
|
393
|
+
# SSH credentials, we could see this. See
|
|
394
|
+
# sky/global_user_state.py get_cluster_yaml_dict.
|
|
395
|
+
if re.search(r'Cluster yaml .* not found', str(e)):
|
|
396
|
+
detailed_reason = 'Cluster yaml was deleted'
|
|
397
|
+
else:
|
|
398
|
+
raise
|
|
399
|
+
elif isinstance(e, TypeError):
|
|
400
|
+
# We will grab the SSH credentials from the cluster yaml, but if
|
|
401
|
+
# handle.cluster_yaml is None, we will just return an empty dict
|
|
402
|
+
# for the credentials. See
|
|
403
|
+
# backend_utils.ssh_credential_from_yaml. Then, the credentials
|
|
404
|
+
# are passed as kwargs to SSHCommandRunner.__init__ - see
|
|
405
|
+
# cloud_vm_ray_backend.get_command_runners. So we can hit this
|
|
406
|
+
# TypeError if the cluster yaml is removed from the handle right
|
|
407
|
+
# when we pull it before the cluster is fully deleted.
|
|
408
|
+
error_msg_to_check = (
|
|
409
|
+
'SSHCommandRunner.__init__() missing 2 required positional '
|
|
410
|
+
'arguments: \'ssh_user\' and \'ssh_private_key\'')
|
|
411
|
+
if str(e) == error_msg_to_check:
|
|
412
|
+
detailed_reason = 'SSH credentials were already cleaned up'
|
|
413
|
+
else:
|
|
414
|
+
raise
|
|
415
|
+
if is_transient_error:
|
|
278
416
|
logger.info('Failed to connect to the cluster. Retrying '
|
|
279
417
|
f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
|
|
280
418
|
logger.info('=' * 34)
|
|
281
|
-
|
|
419
|
+
await asyncio.sleep(1)
|
|
282
420
|
else:
|
|
283
|
-
logger.info(f'Failed to get job status: {
|
|
421
|
+
logger.info(f'Failed to get job status: {detailed_reason}')
|
|
284
422
|
logger.info('=' * 34)
|
|
285
423
|
return None
|
|
286
424
|
return None
|
|
287
425
|
|
|
288
426
|
|
|
289
|
-
def
|
|
290
|
-
|
|
427
|
+
def controller_process_alive(record: managed_job_state.ControllerPidRecord,
|
|
428
|
+
legacy_job_id: Optional[int] = None,
|
|
429
|
+
quiet: bool = True) -> bool:
|
|
430
|
+
"""Check if the controller process is alive.
|
|
431
|
+
|
|
432
|
+
If legacy_job_id is provided, this will also return True for a legacy
|
|
433
|
+
single-job controller process with that job id, based on the cmdline. This
|
|
434
|
+
is how the old check worked before #7051.
|
|
435
|
+
"""
|
|
291
436
|
try:
|
|
292
|
-
process = psutil.Process(pid)
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
437
|
+
process = psutil.Process(record.pid)
|
|
438
|
+
|
|
439
|
+
if record.started_at is not None:
|
|
440
|
+
if process.create_time() != record.started_at:
|
|
441
|
+
if not quiet:
|
|
442
|
+
logger.debug(f'Controller process {record.pid} has started '
|
|
443
|
+
f'at {record.started_at} but process has '
|
|
444
|
+
f'started at {process.create_time()}')
|
|
445
|
+
return False
|
|
446
|
+
else:
|
|
447
|
+
# If we can't check the create_time try to check the cmdline instead
|
|
448
|
+
cmd_str = ' '.join(process.cmdline())
|
|
449
|
+
# pylint: disable=line-too-long
|
|
450
|
+
# Pre-#7051 cmdline: /path/to/python -u -m sky.jobs.controller <dag.yaml_path> --job-id <job_id>
|
|
451
|
+
# Post-#7051 cmdline: /path/to/python -u -msky.jobs.controller
|
|
452
|
+
# pylint: enable=line-too-long
|
|
453
|
+
if ('-m sky.jobs.controller' not in cmd_str and
|
|
454
|
+
'-msky.jobs.controller' not in cmd_str):
|
|
455
|
+
if not quiet:
|
|
456
|
+
logger.debug(f'Process {record.pid} is not a controller '
|
|
457
|
+
'process - missing "-m sky.jobs.controller" '
|
|
458
|
+
f'from cmdline: {cmd_str}')
|
|
459
|
+
return False
|
|
460
|
+
if (legacy_job_id is not None and '--job-id' in cmd_str and
|
|
461
|
+
f'--job-id {legacy_job_id}' not in cmd_str):
|
|
462
|
+
if not quiet:
|
|
463
|
+
logger.debug(f'Controller process {record.pid} has the '
|
|
464
|
+
f'wrong --job-id (expected {legacy_job_id}) '
|
|
465
|
+
f'in cmdline: {cmd_str}')
|
|
466
|
+
return False
|
|
467
|
+
|
|
468
|
+
# On linux, psutil.Process(pid) will return a valid process object
|
|
469
|
+
# even if the pid is actually a thread ID within the process. This
|
|
470
|
+
# hugely inflates the number of valid-looking pids, increasing the
|
|
471
|
+
# chance that we will falsely believe a controller is alive. The pid
|
|
472
|
+
# file should never contain thread IDs, just process IDs. We can
|
|
473
|
+
# check this with psutil.pid_exists(pid), which is false for TIDs.
|
|
474
|
+
# See pid_exists in psutil/_pslinux.py
|
|
475
|
+
if not psutil.pid_exists(record.pid):
|
|
476
|
+
if not quiet:
|
|
477
|
+
logger.debug(
|
|
478
|
+
f'Controller process {record.pid} is not a valid '
|
|
479
|
+
'process id.')
|
|
480
|
+
return False
|
|
481
|
+
|
|
482
|
+
return process.is_running()
|
|
483
|
+
|
|
484
|
+
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess,
|
|
485
|
+
OSError) as e:
|
|
486
|
+
if not quiet:
|
|
487
|
+
logger.debug(f'Controller process {record.pid} is not running: {e}')
|
|
296
488
|
return False
|
|
297
489
|
|
|
298
490
|
|
|
@@ -326,9 +518,8 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
|
|
|
326
518
|
This function should not throw any exception. If it fails, it will
|
|
327
519
|
capture the error message, and log/return it.
|
|
328
520
|
"""
|
|
329
|
-
managed_job_state.remove_ha_recovery_script(job_id)
|
|
330
521
|
error_msg = None
|
|
331
|
-
tasks = managed_job_state.
|
|
522
|
+
tasks = managed_job_state.get_managed_job_tasks(job_id)
|
|
332
523
|
for task in tasks:
|
|
333
524
|
pool = task.get('pool', None)
|
|
334
525
|
if pool is None:
|
|
@@ -351,43 +542,6 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
|
|
|
351
542
|
logger.exception(error_msg, exc_info=e)
|
|
352
543
|
return error_msg
|
|
353
544
|
|
|
354
|
-
# For backwards compatible jobs
|
|
355
|
-
# TODO(cooperc): Remove before 0.11.0.
|
|
356
|
-
def _handle_legacy_job(job_id: int):
|
|
357
|
-
controller_status = job_lib.get_status(job_id)
|
|
358
|
-
if controller_status is None or controller_status.is_terminal():
|
|
359
|
-
logger.error(f'Controller process for legacy job {job_id} is '
|
|
360
|
-
'in an unexpected state.')
|
|
361
|
-
|
|
362
|
-
cleanup_error = _cleanup_job_clusters(job_id)
|
|
363
|
-
if cleanup_error:
|
|
364
|
-
# Unconditionally set the job to failed_controller if the
|
|
365
|
-
# cleanup fails.
|
|
366
|
-
managed_job_state.set_failed(
|
|
367
|
-
job_id,
|
|
368
|
-
task_id=None,
|
|
369
|
-
failure_type=managed_job_state.ManagedJobStatus.
|
|
370
|
-
FAILED_CONTROLLER,
|
|
371
|
-
failure_reason=
|
|
372
|
-
'Legacy controller process has exited abnormally, and '
|
|
373
|
-
f'cleanup failed: {cleanup_error}. For more details, run: '
|
|
374
|
-
f'sky jobs logs --controller {job_id}',
|
|
375
|
-
override_terminal=True)
|
|
376
|
-
return
|
|
377
|
-
|
|
378
|
-
# It's possible for the job to have transitioned to
|
|
379
|
-
# another terminal state while between when we checked its
|
|
380
|
-
# state and now. In that case, set_failed won't do
|
|
381
|
-
# anything, which is fine.
|
|
382
|
-
managed_job_state.set_failed(
|
|
383
|
-
job_id,
|
|
384
|
-
task_id=None,
|
|
385
|
-
failure_type=managed_job_state.ManagedJobStatus.
|
|
386
|
-
FAILED_CONTROLLER,
|
|
387
|
-
failure_reason=(
|
|
388
|
-
'Legacy controller process has exited abnormally. For '
|
|
389
|
-
f'more details, run: sky jobs logs --controller {job_id}'))
|
|
390
|
-
|
|
391
545
|
# Get jobs that need checking (non-terminal or not DONE)
|
|
392
546
|
job_ids = managed_job_state.get_jobs_to_check_status(job_id)
|
|
393
547
|
if not job_ids:
|
|
@@ -397,29 +551,22 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
|
|
|
397
551
|
|
|
398
552
|
for job_id in job_ids:
|
|
399
553
|
assert job_id is not None
|
|
400
|
-
tasks = managed_job_state.
|
|
554
|
+
tasks = managed_job_state.get_managed_job_tasks(job_id)
|
|
401
555
|
# Note: controller_pid and schedule_state are in the job_info table
|
|
402
556
|
# which is joined to the spot table, so all tasks with the same job_id
|
|
403
557
|
# will have the same value for these columns. This is what lets us just
|
|
404
558
|
# take tasks[0]['controller_pid'] and tasks[0]['schedule_state'].
|
|
405
559
|
schedule_state = tasks[0]['schedule_state']
|
|
406
560
|
|
|
407
|
-
# Backwards compatibility: this job was submitted when ray was still
|
|
408
|
-
# used for managing the parallelism of job controllers, before #4485.
|
|
409
|
-
# TODO(cooperc): Remove before 0.11.0.
|
|
410
|
-
if (schedule_state is
|
|
411
|
-
managed_job_state.ManagedJobScheduleState.INVALID):
|
|
412
|
-
_handle_legacy_job(job_id)
|
|
413
|
-
continue
|
|
414
|
-
|
|
415
561
|
# Handle jobs with schedule state (non-legacy jobs):
|
|
416
562
|
pid = tasks[0]['controller_pid']
|
|
563
|
+
pid_started_at = tasks[0].get('controller_pid_started_at')
|
|
417
564
|
if schedule_state == managed_job_state.ManagedJobScheduleState.DONE:
|
|
418
565
|
# There are two cases where we could get a job that is DONE.
|
|
419
566
|
# 1. At query time (get_jobs_to_check_status), the job was not yet
|
|
420
|
-
# DONE, but since then (before
|
|
421
|
-
# hit a terminal status, marked itself done, and exited.
|
|
422
|
-
# fine.
|
|
567
|
+
# DONE, but since then (before get_managed_job_tasks is called)
|
|
568
|
+
# it has hit a terminal status, marked itself done, and exited.
|
|
569
|
+
# This is fine.
|
|
423
570
|
# 2. The job is DONE, but in a non-terminal status. This is
|
|
424
571
|
# unexpected. For instance, the task status is RUNNING, but the
|
|
425
572
|
# job schedule_state is DONE.
|
|
@@ -466,7 +613,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
|
|
|
466
613
|
failure_reason = f'No controller pid set for {schedule_state.value}'
|
|
467
614
|
else:
|
|
468
615
|
logger.debug(f'Checking controller pid {pid}')
|
|
469
|
-
if
|
|
616
|
+
if controller_process_alive(
|
|
617
|
+
managed_job_state.ControllerPidRecord(
|
|
618
|
+
pid=pid, started_at=pid_started_at), job_id):
|
|
470
619
|
# The controller is still running, so this job is fine.
|
|
471
620
|
continue
|
|
472
621
|
|
|
@@ -526,9 +675,32 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
|
|
|
526
675
|
def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
|
|
527
676
|
job_id: Optional[int], get_end_time: bool) -> float:
|
|
528
677
|
"""Get the submitted/ended time of the job."""
|
|
529
|
-
code = job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
|
|
530
|
-
job_id=job_id, get_ended_time=get_end_time)
|
|
531
678
|
handle = global_user_state.get_handle_from_cluster_name(cluster_name)
|
|
679
|
+
assert handle is not None, (
|
|
680
|
+
f'handle for cluster {cluster_name!r} should not be None')
|
|
681
|
+
if handle.is_grpc_enabled_with_flag:
|
|
682
|
+
try:
|
|
683
|
+
if get_end_time:
|
|
684
|
+
end_ts_request = jobsv1_pb2.GetJobEndedTimestampRequest(
|
|
685
|
+
job_id=job_id)
|
|
686
|
+
end_ts_response = backend_utils.invoke_skylet_with_retries(
|
|
687
|
+
lambda: cloud_vm_ray_backend.SkyletClient(
|
|
688
|
+
handle.get_grpc_channel()).get_job_ended_timestamp(
|
|
689
|
+
end_ts_request))
|
|
690
|
+
return end_ts_response.timestamp
|
|
691
|
+
else:
|
|
692
|
+
submit_ts_request = jobsv1_pb2.GetJobSubmittedTimestampRequest(
|
|
693
|
+
job_id=job_id)
|
|
694
|
+
submit_ts_response = backend_utils.invoke_skylet_with_retries(
|
|
695
|
+
lambda: cloud_vm_ray_backend.SkyletClient(
|
|
696
|
+
handle.get_grpc_channel()).get_job_submitted_timestamp(
|
|
697
|
+
submit_ts_request))
|
|
698
|
+
return submit_ts_response.timestamp
|
|
699
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
700
|
+
pass
|
|
701
|
+
|
|
702
|
+
code = (job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
|
|
703
|
+
job_id=job_id, get_ended_time=get_end_time))
|
|
532
704
|
returncode, stdout, stderr = backend.run_on_head(handle,
|
|
533
705
|
code,
|
|
534
706
|
stream_logs=False,
|
|
@@ -552,8 +724,13 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
|
|
|
552
724
|
cluster_name,
|
|
553
725
|
job_id=job_id,
|
|
554
726
|
get_end_time=True)
|
|
555
|
-
except exceptions.CommandError
|
|
556
|
-
|
|
727
|
+
except (exceptions.CommandError, grpc.RpcError,
|
|
728
|
+
grpc.FutureTimeoutError) as e:
|
|
729
|
+
if isinstance(e, exceptions.CommandError) and e.returncode == 255 or \
|
|
730
|
+
(isinstance(e, grpc.RpcError) and e.code() in [
|
|
731
|
+
grpc.StatusCode.UNAVAILABLE,
|
|
732
|
+
grpc.StatusCode.DEADLINE_EXCEEDED,
|
|
733
|
+
]) or isinstance(e, grpc.FutureTimeoutError):
|
|
557
734
|
# Failed to connect - probably the instance was preempted since the
|
|
558
735
|
# job completed. We shouldn't crash here, so just log and use the
|
|
559
736
|
# current time.
|
|
@@ -565,7 +742,9 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
|
|
|
565
742
|
raise
|
|
566
743
|
|
|
567
744
|
|
|
568
|
-
def event_callback_func(
|
|
745
|
+
def event_callback_func(
|
|
746
|
+
job_id: int, task_id: Optional[int],
|
|
747
|
+
task: Optional['sky.Task']) -> managed_job_state.AsyncCallbackType:
|
|
569
748
|
"""Run event callback for the task."""
|
|
570
749
|
|
|
571
750
|
def callback_func(status: str):
|
|
@@ -604,7 +783,10 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
|
|
|
604
783
|
f'Bash:{event_callback},log_path:{log_path},result:{result}')
|
|
605
784
|
logger.info(f'=== END: event callback for {status!r} ===')
|
|
606
785
|
|
|
607
|
-
|
|
786
|
+
async def async_callback_func(status: str):
|
|
787
|
+
return await context_utils.to_thread(callback_func, status)
|
|
788
|
+
|
|
789
|
+
return async_callback_func
|
|
608
790
|
|
|
609
791
|
|
|
610
792
|
# ======== user functions ========
|
|
@@ -624,14 +806,15 @@ def generate_managed_job_cluster_name(task_name: str, job_id: int) -> str:
|
|
|
624
806
|
|
|
625
807
|
def cancel_jobs_by_id(job_ids: Optional[List[int]],
|
|
626
808
|
all_users: bool = False,
|
|
627
|
-
current_workspace: Optional[str] = None
|
|
809
|
+
current_workspace: Optional[str] = None,
|
|
810
|
+
user_hash: Optional[str] = None) -> str:
|
|
628
811
|
"""Cancel jobs by id.
|
|
629
812
|
|
|
630
813
|
If job_ids is None, cancel all jobs.
|
|
631
814
|
"""
|
|
632
815
|
if job_ids is None:
|
|
633
816
|
job_ids = managed_job_state.get_nonterminal_job_ids_by_name(
|
|
634
|
-
None, all_users)
|
|
817
|
+
None, user_hash, all_users)
|
|
635
818
|
job_ids = list(set(job_ids))
|
|
636
819
|
if not job_ids:
|
|
637
820
|
return 'No job to cancel.'
|
|
@@ -651,6 +834,12 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
|
|
|
651
834
|
logger.info(f'Job {job_id} is already in terminal state '
|
|
652
835
|
f'{job_status.value}. Skipped.')
|
|
653
836
|
continue
|
|
837
|
+
elif job_status == managed_job_state.ManagedJobStatus.PENDING:
|
|
838
|
+
# the "if PENDING" is a short circuit, this will be atomic.
|
|
839
|
+
cancelled = managed_job_state.set_pending_cancelled(job_id)
|
|
840
|
+
if cancelled:
|
|
841
|
+
cancelled_job_ids.append(job_id)
|
|
842
|
+
continue
|
|
654
843
|
|
|
655
844
|
update_managed_jobs_statuses(job_id)
|
|
656
845
|
|
|
@@ -659,14 +848,30 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
|
|
|
659
848
|
wrong_workspace_job_ids.append(job_id)
|
|
660
849
|
continue
|
|
661
850
|
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
851
|
+
if managed_job_state.is_legacy_controller_process(job_id):
|
|
852
|
+
# The job is running on a legacy single-job controller process.
|
|
853
|
+
# TODO(cooperc): Remove this handling for 0.13.0
|
|
854
|
+
|
|
855
|
+
# Send the signal to the jobs controller.
|
|
856
|
+
signal_file = (pathlib.Path(
|
|
857
|
+
managed_job_constants.SIGNAL_FILE_PREFIX.format(job_id)))
|
|
858
|
+
# Filelock is needed to prevent race condition between signal
|
|
859
|
+
# check/removal and signal writing.
|
|
860
|
+
with filelock.FileLock(str(signal_file) + '.lock'):
|
|
861
|
+
with signal_file.open('w', encoding='utf-8') as f:
|
|
862
|
+
f.write(UserSignal.CANCEL.value)
|
|
863
|
+
f.flush()
|
|
864
|
+
else:
|
|
865
|
+
# New controller process.
|
|
866
|
+
try:
|
|
867
|
+
signal_file = pathlib.Path(
|
|
868
|
+
managed_job_constants.CONSOLIDATED_SIGNAL_PATH, f'{job_id}')
|
|
869
|
+
signal_file.touch()
|
|
870
|
+
except OSError as e:
|
|
871
|
+
logger.error(f'Failed to cancel job {job_id}: {e}')
|
|
872
|
+
# Don't add it to the to be cancelled job ids
|
|
873
|
+
continue
|
|
874
|
+
|
|
670
875
|
cancelled_job_ids.append(job_id)
|
|
671
876
|
|
|
672
877
|
wrong_workspace_job_str = ''
|
|
@@ -714,6 +919,14 @@ def cancel_jobs_by_pool(pool_name: str,
|
|
|
714
919
|
return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
|
|
715
920
|
|
|
716
921
|
|
|
922
|
+
def controller_log_file_for_job(job_id: int,
|
|
923
|
+
create_if_not_exists: bool = False) -> str:
|
|
924
|
+
log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
|
|
925
|
+
if create_if_not_exists:
|
|
926
|
+
os.makedirs(log_dir, exist_ok=True)
|
|
927
|
+
return os.path.join(log_dir, f'{job_id}.log')
|
|
928
|
+
|
|
929
|
+
|
|
717
930
|
def stream_logs_by_id(job_id: int,
|
|
718
931
|
follow: bool = True,
|
|
719
932
|
tail: Optional[int] = None) -> Tuple[str, int]:
|
|
@@ -746,13 +959,20 @@ def stream_logs_by_id(job_id: int,
|
|
|
746
959
|
if managed_job_status.is_failed():
|
|
747
960
|
job_msg = ('\nFailure reason: '
|
|
748
961
|
f'{managed_job_state.get_failure_reason(job_id)}')
|
|
749
|
-
|
|
962
|
+
log_file_ever_existed = False
|
|
750
963
|
task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
|
|
751
964
|
job_id)
|
|
752
965
|
num_tasks = len(task_info)
|
|
753
|
-
for task_id, task_name, task_status, log_file
|
|
966
|
+
for (task_id, task_name, task_status, log_file,
|
|
967
|
+
logs_cleaned_at) in task_info:
|
|
754
968
|
if log_file:
|
|
755
|
-
|
|
969
|
+
log_file_ever_existed = True
|
|
970
|
+
if logs_cleaned_at is not None:
|
|
971
|
+
ts_str = datetime.fromtimestamp(
|
|
972
|
+
logs_cleaned_at).strftime('%Y-%m-%d %H:%M:%S')
|
|
973
|
+
print(f'Task {task_name}({task_id}) log has been '
|
|
974
|
+
f'cleaned at {ts_str}.')
|
|
975
|
+
continue
|
|
756
976
|
task_str = (f'Task {task_name}({task_id})'
|
|
757
977
|
if task_name else f'Task {task_id}')
|
|
758
978
|
if num_tasks > 1:
|
|
@@ -787,7 +1007,7 @@ def stream_logs_by_id(job_id: int,
|
|
|
787
1007
|
f'{task_str} finished '
|
|
788
1008
|
f'(status: {task_status.value}).'),
|
|
789
1009
|
flush=True)
|
|
790
|
-
if
|
|
1010
|
+
if log_file_ever_existed:
|
|
791
1011
|
# Add the "Job finished" message for terminal states
|
|
792
1012
|
if managed_job_status.is_terminal():
|
|
793
1013
|
print(ux_utils.finishing_message(
|
|
@@ -1015,7 +1235,8 @@ def stream_logs(job_id: Optional[int],
|
|
|
1015
1235
|
if controller:
|
|
1016
1236
|
if job_id is None:
|
|
1017
1237
|
assert job_name is not None
|
|
1018
|
-
managed_jobs = managed_job_state.
|
|
1238
|
+
managed_jobs, _ = managed_job_state.get_managed_jobs_with_filters(
|
|
1239
|
+
name_match=job_name, fields=['job_id', 'job_name', 'status'])
|
|
1019
1240
|
# We manually filter the jobs by name, instead of using
|
|
1020
1241
|
# get_nonterminal_job_ids_by_name, as with `controller=True`, we
|
|
1021
1242
|
# should be able to show the logs for jobs in terminal states.
|
|
@@ -1038,9 +1259,7 @@ def stream_logs(job_id: Optional[int],
         job_id = managed_job_ids.pop()
         assert job_id is not None, (job_id, job_name)
 
-        controller_log_path =
-            os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
-            f'{job_id}.log')
+        controller_log_path = controller_log_file_for_job(job_id)
         job_status = None
 
         # Wait for the log file to be written
@@ -1141,144 +1360,254 @@ def dump_managed_job_queue(
     limit: Optional[int] = None,
     user_hashes: Optional[List[Optional[str]]] = None,
     statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
 ) -> str:
-
-
-
+    return message_utils.encode_payload(
+        get_managed_job_queue(skip_finished, accessible_workspaces, job_ids,
+                              workspace_match, name_match, pool_match, page,
+                              limit, user_hashes, statuses, fields))
 
-    # Figure out what the highest priority blocking job is. We need to know in
-    # order to determine if other jobs are blocked by a higher priority job, or
-    # just by the limited controller resources.
-    highest_blocking_priority = constants.MIN_PRIORITY
-    for job in jobs:
-        if job['schedule_state'] not in (
-                # LAUNCHING and ALIVE_BACKOFF jobs will block other jobs with
-                # lower priority.
-                managed_job_state.ManagedJobScheduleState.LAUNCHING,
-                managed_job_state.ManagedJobScheduleState.ALIVE_BACKOFF,
-                # It's possible for a WAITING/ALIVE_WAITING job to be ready to
-                # launch, but the scheduler just hasn't run yet.
-                managed_job_state.ManagedJobScheduleState.WAITING,
-                managed_job_state.ManagedJobScheduleState.ALIVE_WAITING,
-        ):
-            # This job will not block others.
-            continue
 
-
-
-        highest_blocking_priority = priority
+def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
+    """Update the fields list to include the necessary fields.
 
-
+    Args:
+        fields: The fields to update.
+
+    It will:
+    - Add the necessary dependent fields to the list.
+    - Remove the fields that are not in the DB.
+    - Determine if cluster handle is required.
+
+    Returns:
+        A tuple containing the updated fields and a boolean indicating if
+        cluster handle is required.
+    """
+    cluster_handle_required = True
+    if _cluster_handle_not_required(fields):
+        cluster_handle_required = False
+    # Copy the list to avoid modifying the original list
+    new_fields = fields.copy()
+    # status and job_id are always included
+    if 'status' not in new_fields:
+        new_fields.append('status')
+    if 'job_id' not in new_fields:
+        new_fields.append('job_id')
+    # user_hash is required if user_name is present
+    if 'user_name' in new_fields and 'user_hash' not in new_fields:
+        new_fields.append('user_hash')
+    if 'job_duration' in new_fields:
+        if 'last_recovered_at' not in new_fields:
+            new_fields.append('last_recovered_at')
+        if 'end_at' not in new_fields:
+            new_fields.append('end_at')
+    if 'job_name' in new_fields and 'task_name' not in new_fields:
+        new_fields.append('task_name')
+    if 'details' in new_fields:
+        if 'schedule_state' not in new_fields:
+            new_fields.append('schedule_state')
+        if 'priority' not in new_fields:
+            new_fields.append('priority')
+        if 'failure_reason' not in new_fields:
+            new_fields.append('failure_reason')
+    if 'user_yaml' in new_fields:
+        if 'original_user_yaml_path' not in new_fields:
+            new_fields.append('original_user_yaml_path')
+        if 'original_user_yaml_content' not in new_fields:
+            new_fields.append('original_user_yaml_content')
+    if cluster_handle_required:
+        if 'task_name' not in new_fields:
+            new_fields.append('task_name')
+        if 'current_cluster_name' not in new_fields:
+            new_fields.append('current_cluster_name')
+    # Remove _NON_DB_FIELDS
+    # These fields have been mapped to the DB fields in the above code, so we
+    # don't need to include them in the updated fields.
+    for field in _NON_DB_FIELDS:
+        if field in new_fields:
+            new_fields.remove(field)
+    return new_fields, cluster_handle_required
+
+
+def _cluster_handle_not_required(fields: List[str]) -> bool:
+    """Determine if cluster handle is not required.
+
+    Args:
+        fields: The fields to check if they contain any of the cluster handle
+            fields.
+
+    Returns:
+        True if the fields do not contain any of the cluster handle fields,
+        False otherwise.
+    """
+    return not any(field in fields for field in _CLUSTER_HANDLE_FIELDS)
+
+
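`_update_fields` expands a caller's field projection with the dependent columns needed to compute derived fields, then strips the display-only names before querying the DB. A self-contained sketch of the idea; the constant values below are hypothetical, since `_NON_DB_FIELDS` and `_CLUSTER_HANDLE_FIELDS` are defined outside this diff:

```python
from typing import List, Tuple

# Hypothetical values; the real constants live elsewhere in this module.
_NON_DB_FIELDS = ['user_name', 'job_duration', 'details', 'user_yaml']
_CLUSTER_HANDLE_FIELDS = ['cluster_resources', 'cloud', 'region', 'zone']


def update_fields(fields: List[str]) -> Tuple[List[str], bool]:
    cluster_handle_required = any(f in fields for f in _CLUSTER_HANDLE_FIELDS)
    new_fields = fields.copy()
    for always in ('status', 'job_id'):  # always fetched
        if always not in new_fields:
            new_fields.append(always)
    if 'user_name' in new_fields and 'user_hash' not in new_fields:
        new_fields.append('user_hash')  # user_name is derived from user_hash
    # Display-only fields are computed later, not selected from the DB.
    return ([f for f in new_fields if f not in _NON_DB_FIELDS],
            cluster_handle_required)


print(update_fields(['user_name']))  # (['status', 'job_id', 'user_hash'], False)
```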
+def get_managed_job_queue(
+    skip_finished: bool = False,
+    accessible_workspaces: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
+) -> Dict[str, Any]:
+    """Get the managed job queue.
+
+    Args:
+        skip_finished: Whether to skip finished jobs.
+        accessible_workspaces: The accessible workspaces.
+        job_ids: The job ids.
+        workspace_match: The workspace name to match.
+        name_match: The job name to match.
+        pool_match: The pool name to match.
+        page: The page number.
+        limit: The limit number.
+        user_hashes: The user hashes.
+        statuses: The statuses.
+        fields: The fields to include in the response.
+
+    Returns:
+        A dictionary containing the managed job queue.
+    """
+    cluster_handle_required = True
+    updated_fields = None
+    # The caller only need to specify the fields in the
+    # `class ManagedJobRecord` in `response.py`, and the `_update_fields`
+    # function will add the necessary dependent fields to the list, for
+    # example, if the caller specifies `['user_name']`, the `_update_fields`
+    # function will add `['user_hash']` to the list.
+    if fields:
+        updated_fields, cluster_handle_required = _update_fields(fields)
+
+    total_no_filter = managed_job_state.get_managed_jobs_total()
+
+    status_counts = managed_job_state.get_status_count_with_filters(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        skip_finished=skip_finished,
+    )
+
+    jobs, total = managed_job_state.get_managed_jobs_with_filters(
+        fields=updated_fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        statuses=statuses,
+        skip_finished=skip_finished,
+        page=page,
+        limit=limit,
+    )
+
+    if cluster_handle_required:
+        # Fetch the cluster name to handle map for managed clusters only.
+        cluster_name_to_handle = (
+            global_user_state.get_cluster_name_to_handle_map(is_managed=True))
+
+    highest_blocking_priority = constants.MIN_PRIORITY
+    if not fields or 'details' in fields:
+        # Figure out what the highest priority blocking job is. We need to know
+        # in order to determine if other jobs are blocked by a higher priority
+        # job, or just by the limited controller resources.
+        highest_blocking_priority = (
+            managed_job_state.get_managed_jobs_highest_priority())
 
-    if user_hashes:
-        jobs = [
-            job for job in jobs if job.get('user_hash', None) in user_hashes
-        ]
-    if accessible_workspaces:
-        jobs = [
-            job for job in jobs
-            if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
-            accessible_workspaces
-        ]
-    if skip_finished:
-        # Filter out the finished jobs. If a multi-task job is partially
-        # finished, we will include all its tasks.
-        non_finished_tasks = list(
-            filter(
-                lambda job: not managed_job_state.ManagedJobStatus(job[
-                    'status']).is_terminal(), jobs))
-        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
-        jobs = list(
-            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
-    if job_ids:
-        jobs = [job for job in jobs if job['job_id'] in job_ids]
-
-    jobs, total, status_counts = filter_jobs(jobs,
-                                             workspace_match,
-                                             name_match,
-                                             pool_match,
-                                             page,
-                                             limit,
-                                             statuses=statuses)
     for job in jobs:
-
-
-        end_at
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not fields or 'job_duration' in fields:
+            end_at = job['end_at']
+            if end_at is None:
+                end_at = time.time()
+
+            job_submitted_at = job['last_recovered_at'] - job['job_duration']
+            if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
+                # When job is recovering, the duration is exact
+                # job['job_duration']
+                job_duration = job['job_duration']
+            elif job_submitted_at > 0:
+                job_duration = end_at - job_submitted_at
+            else:
+                # When job_start_at <= 0, that means the last_recovered_at
+                # is not set yet, i.e. the job is not started.
+                job_duration = 0
+            job['job_duration'] = job_duration
         job['status'] = job['status'].value
-
-
-        pool = managed_job_state.get_pool_from_job_id(job['job_id'])
-        if pool is not None:
-            cluster_name, _ = managed_job_state.get_pool_submit_info(
-                job['job_id'])
-        else:
-            cluster_name = generate_managed_job_cluster_name(
-                job['task_name'], job['job_id'])
-        handle = global_user_state.get_handle_from_cluster_name(
-            cluster_name) if cluster_name is not None else None
-        if isinstance(handle, backends.CloudVmRayResourceHandle):
-            resources_str = resources_utils.get_readable_resources_repr(
-                handle, simplify=True)
-            resources_str_full = resources_utils.get_readable_resources_repr(
-                handle, simplify=False)
-            job['cluster_resources'] = resources_str
-            job['cluster_resources_full'] = resources_str_full
-            job['cloud'] = str(handle.launched_resources.cloud)
-            job['region'] = handle.launched_resources.region
-            job['zone'] = handle.launched_resources.zone
-            job['infra'] = infra_utils.InfraInfo(
-                str(handle.launched_resources.cloud),
-                handle.launched_resources.region,
-                handle.launched_resources.zone).formatted_str()
-            job['accelerators'] = handle.launched_resources.accelerators
+        if not fields or 'schedule_state' in fields:
+            job['schedule_state'] = job['schedule_state'].value
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            job['schedule_state'] = None
+
+        if cluster_handle_required:
+            cluster_name = job.get('current_cluster_name', None)
+            if cluster_name is None:
+                cluster_name = generate_managed_job_cluster_name(
+                    job['task_name'], job['job_id'])
+            handle = cluster_name_to_handle.get(
+                cluster_name, None) if cluster_name is not None else None
+            if isinstance(handle, backends.CloudVmRayResourceHandle):
+                resources_str_simple, resources_str_full = (
+                    resources_utils.get_readable_resources_repr(
+                        handle, simplified_only=False))
+                assert resources_str_full is not None
+                job['cluster_resources'] = resources_str_simple
+                job['cluster_resources_full'] = resources_str_full
+                job['cloud'] = str(handle.launched_resources.cloud)
+                job['region'] = handle.launched_resources.region
+                job['zone'] = handle.launched_resources.zone
+                job['infra'] = infra_utils.InfraInfo(
+                    str(handle.launched_resources.cloud),
+                    handle.launched_resources.region,
+                    handle.launched_resources.zone).formatted_str()
+                job['accelerators'] = handle.launched_resources.accelerators
             else:
-
-
-
-
-
-
-
-
-
-
+                # FIXME(zongheng): display the last cached values for these.
+                job['cluster_resources'] = '-'
+                job['cluster_resources_full'] = '-'
+                job['cloud'] = '-'
+                job['region'] = '-'
+                job['zone'] = '-'
+                job['infra'] = '-'
+
+        if not fields or 'details' in fields:
+            # Add details about schedule state / backoff.
+            state_details = None
+            if job['schedule_state'] == 'ALIVE_BACKOFF':
+                state_details = 'In backoff, waiting for resources'
+            elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
+                priority = job.get('priority')
+                if (priority is not None and
+                        priority < highest_blocking_priority):
+                    # Job is lower priority than some other blocking job.
+                    state_details = 'Waiting for higher priority jobs to launch'
+                else:
+                    state_details = 'Waiting for other jobs to launch'
+
+            if state_details and job['failure_reason']:
+                job['details'] = f'{state_details} - {job["failure_reason"]}'
+            elif state_details:
+                job['details'] = state_details
+            elif job['failure_reason']:
+                job['details'] = f'Failure: {job["failure_reason"]}'
+            else:
+                job['details'] = None
 
-    return
+    return {
         'jobs': jobs,
         'total': total,
         'total_no_filter': total_no_filter,
         'status_counts': status_counts
-    }
+    }
 
 
 def filter_jobs(
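The `job_duration` branch above reconstructs total run time from `last_recovered_at` (which bakes in previously accumulated duration) and `end_at`. A worked example with made-up numbers:

```python
# Hypothetical values, in epoch seconds.
last_recovered_at = 1_000.0  # when the job last (re)started running
job_duration = 250.0         # run time accumulated before the last recovery
end_at = 1_400.0             # would fall back to time.time() while running

job_submitted_at = last_recovered_at - job_duration  # 750.0
if job_submitted_at > 0:  # last_recovered_at has been set
    job_duration = end_at - job_submitted_at
print(job_duration)  # 650.0: accumulated time plus the current run
```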
@@ -1370,30 +1699,31 @@ def load_managed_job_queue(
     """Load job queue from json string."""
     result = message_utils.decode_payload(payload)
     result_type = ManagedJobQueueResultType.DICT
-    status_counts = {}
+    status_counts: Dict[str, int] = {}
     if isinstance(result, dict):
-        jobs = result['jobs']
-        total = result['total']
+        jobs: List[Dict[str, Any]] = result['jobs']
+        total: int = result['total']
         status_counts = result.get('status_counts', {})
-        total_no_filter = result.get('total_no_filter', total)
+        total_no_filter: int = result.get('total_no_filter', total)
     else:
         jobs = result
         total = len(jobs)
         total_no_filter = total
         result_type = ManagedJobQueueResultType.LIST
 
+    all_users = global_user_state.get_all_users()
+    all_users_map = {user.id: user.name for user in all_users}
     for job in jobs:
         job['status'] = managed_job_state.ManagedJobStatus(job['status'])
         if 'user_hash' in job and job['user_hash'] is not None:
             # Skip jobs that do not have user_hash info.
             # TODO(cooperc): Remove check before 0.12.0.
-
-            job['user_name'] = user.name if user is not None else None
+            job['user_name'] = all_users_map.get(job['user_hash'])
     return jobs, total, result_type, total_no_filter, status_counts
 
 
 def _get_job_status_from_tasks(
-        job_tasks: List[Dict[str, Any]]
+        job_tasks: Union[List[responses.ManagedJobRecord], List[Dict[str, Any]]]
 ) -> Tuple[managed_job_state.ManagedJobStatus, int]:
     """Get the current task status and the current task id for a job."""
     managed_task_status = managed_job_state.ManagedJobStatus.SUCCEEDED
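The change above swaps a per-job user lookup for one bulk `get_all_users()` query plus a dict lookup. A minimal sketch of the pattern, with a stand-in `User` type:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class User:  # stand-in for the real user record type
    id: str
    name: Optional[str]


all_users = [User('abc123', 'alice'), User('def456', 'bob')]  # one query
all_users_map = {user.id: user.name for user in all_users}

jobs = [{'user_hash': 'abc123'}, {'user_hash': 'zzz999'}]
for job in jobs:
    # O(1) lookup per job; None when the hash is unknown.
    job['user_name'] = all_users_map.get(job['user_hash'])
print(jobs)
```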
@@ -1413,29 +1743,40 @@ def _get_job_status_from_tasks(
 
 
 @typing.overload
-def format_job_table(
-
-
-
-
+def format_job_table(
+    tasks: List[Dict[str, Any]],
+    show_all: bool,
+    show_user: bool,
+    return_rows: Literal[False] = False,
+    pool_status: Optional[List[Dict[str, Any]]] = None,
+    max_jobs: Optional[int] = None,
+    job_status_counts: Optional[Dict[str, int]] = None,
+) -> str:
     ...
 
 
 @typing.overload
-def format_job_table(
-
-
-
-
+def format_job_table(
+    tasks: List[Dict[str, Any]],
+    show_all: bool,
+    show_user: bool,
+    return_rows: Literal[True],
+    pool_status: Optional[List[Dict[str, Any]]] = None,
+    max_jobs: Optional[int] = None,
+    job_status_counts: Optional[Dict[str, int]] = None,
+) -> List[List[str]]:
     ...
 
 
 def format_job_table(
-
-
-
-
-
+    tasks: List[Dict[str, Any]],
+    show_all: bool,
+    show_user: bool,
+    return_rows: bool = False,
+    pool_status: Optional[List[Dict[str, Any]]] = None,
+    max_jobs: Optional[int] = None,
+    job_status_counts: Optional[Dict[str, int]] = None,
+) -> Union[str, List[List[str]]]:
     """Returns managed jobs as a formatted string.
 
     Args:
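The rewritten overloads use `typing.Literal` so type checkers can tie the return type to the `return_rows` flag. A minimal sketch of the same pattern (illustrative only, not the real `format_job_table`):

```python
import typing
from typing import List, Literal, Union


@typing.overload
def render(rows: List[List[str]], return_rows: Literal[False] = False) -> str:
    ...


@typing.overload
def render(rows: List[List[str]],
           return_rows: Literal[True]) -> List[List[str]]:
    ...


def render(rows: List[List[str]],
           return_rows: bool = False) -> Union[str, List[List[str]]]:
    # One runtime implementation behind both typed signatures.
    return rows if return_rows else '\n'.join(' '.join(r) for r in rows)


print(render([['1', 'RUNNING']]))        # checker infers str
print(render([['1', 'RUNNING']], True))  # checker infers List[List[str]]
```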
@@ -1444,13 +1785,15 @@ def format_job_table(
         max_jobs: The maximum number of jobs to show in the table.
         return_rows: If True, return the rows as a list of strings instead of
             all rows concatenated into a single string.
+        pool_status: List of pool status dictionaries with replica_info.
+        job_status_counts: The counts of each job status.
 
     Returns: A formatted string of managed jobs, if not `return_rows`; otherwise
         a list of "rows" (each of which is a list of str).
     """
     jobs = collections.defaultdict(list)
     # Check if the tasks have user information from kubernetes.
-    # This is only used for sky status
+    # This is only used for sky status-kubernetes.
     tasks_have_k8s_user = any([task.get('user') for task in tasks])
     if max_jobs and tasks_have_k8s_user:
         raise ValueError('max_jobs is not supported when tasks have user info.')
@@ -1460,17 +1803,37 @@ def format_job_table(
             return (task['user'], task['job_id'])
         return task['job_id']
 
+    def _get_job_id_to_worker_map(
+            pool_status: Optional[List[Dict[str, Any]]]) -> Dict[int, int]:
+        """Create a mapping from job_id to worker replica_id.
+
+        Args:
+            pool_status: List of pool status dictionaries with replica_info.
+
+        Returns:
+            Dictionary mapping job_id to replica_id (worker ID).
+        """
+        job_to_worker: Dict[int, int] = {}
+        if pool_status is None:
+            return job_to_worker
+        for pool in pool_status:
+            replica_info = pool.get('replica_info', [])
+            for replica in replica_info:
+                used_by = replica.get('used_by')
+                if used_by is not None:
+                    job_to_worker[used_by] = replica.get('replica_id')
+        return job_to_worker
+
+    # Create mapping from job_id to worker replica_id
+    job_to_worker = _get_job_id_to_worker_map(pool_status)
+
     for task in tasks:
         # The tasks within the same job_id are already sorted
         # by the task_id.
         jobs[get_hash(task)].append(task)
 
-    status_counts: Dict[str, int] = collections.defaultdict(int)
     workspaces = set()
     for job_tasks in jobs.values():
-        managed_job_status = _get_job_status_from_tasks(job_tasks)[0]
-        if not managed_job_status.is_terminal():
-            status_counts[managed_job_status.value] += 1
         workspaces.add(job_tasks[0].get('workspace',
                                         constants.SKYPILOT_DEFAULT_WORKSPACE))
 
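`_get_job_id_to_worker_map` inverts pool `replica_info` into a job-id to worker lookup used to render `POOL (worker=N)`. A standalone sketch fed with a hypothetical `pool_status` payload shaped like the real one:

```python
from typing import Any, Dict, List, Optional


def get_job_id_to_worker_map(
        pool_status: Optional[List[Dict[str, Any]]]) -> Dict[int, int]:
    job_to_worker: Dict[int, int] = {}
    if pool_status is None:
        return job_to_worker
    for pool in pool_status:
        for replica in pool.get('replica_info', []):
            used_by = replica.get('used_by')  # job id running on this replica
            if used_by is not None:
                job_to_worker[used_by] = replica.get('replica_id')
    return job_to_worker


# Hypothetical payload: replica 1 runs job 7, replica 2 is idle.
pool_status = [{'replica_info': [{'replica_id': 1, 'used_by': 7},
                                 {'replica_id': 2, 'used_by': None}]}]
print(get_job_id_to_worker_map(pool_status))  # {7: 1} -> "mypool (worker=1)"
```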
@@ -1513,9 +1876,15 @@ def format_job_table(
     job_table = log_utils.create_table(columns)
 
     status_counts: Dict[str, int] = collections.defaultdict(int)
-
-
-
+    if job_status_counts:
+        for status_value, count in job_status_counts.items():
+            status = managed_job_state.ManagedJobStatus(status_value)
+            if not status.is_terminal():
+                status_counts[status_value] = count
+    else:
+        for task in tasks:
+            if not task['status'].is_terminal():
+                status_counts[task['status'].value] += 1
 
     all_tasks = tasks
     if max_jobs is not None:
@@ -1601,7 +1970,12 @@ def format_job_table(
         if pool is None:
             pool = '-'
 
+        # Add worker information if job is assigned to a worker
         job_id = job_hash[1] if tasks_have_k8s_user else job_hash
+        # job_id is now always an integer, use it to look up worker
+        if job_id in job_to_worker and pool != '-':
+            pool = f'{pool} (worker={job_to_worker[job_id]})'
+
         job_values = [
             job_id,
             '',
@@ -1644,6 +2018,12 @@ def format_job_table(
             pool = task.get('pool')
             if pool is None:
                 pool = '-'
+
+            # Add worker information if task is assigned to a worker
+            task_job_id = task['job_id']
+            if task_job_id in job_to_worker and pool != '-':
+                pool = f'{pool} (worker={job_to_worker[task_job_id]})'
+
             values = [
                 task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
                 task['task_id'] if len(job_tasks) > 1 else '-',
@@ -1726,6 +2106,59 @@ def format_job_table(
     return output
 
 
+def decode_managed_job_protos(
+    job_protos: Iterable['managed_jobsv1_pb2.ManagedJobInfo']
+) -> List[Dict[str, Any]]:
+    """Decode job protos to dicts. Similar to load_managed_job_queue."""
+    user_hash_to_user = global_user_state.get_users(
+        set(job.user_hash for job in job_protos if job.user_hash))
+
+    jobs = []
+    for job_proto in job_protos:
+        job_dict = _job_proto_to_dict(job_proto)
+        user_hash = job_dict.get('user_hash', None)
+        if user_hash is not None:
+            # Skip jobs that do not have user_hash info.
+            # TODO(cooperc): Remove check before 0.12.0.
+            user = user_hash_to_user.get(user_hash, None)
+            job_dict['user_name'] = user.name if user is not None else None
+        jobs.append(job_dict)
+    return jobs
+
+
+def _job_proto_to_dict(
+        job_proto: 'managed_jobsv1_pb2.ManagedJobInfo') -> Dict[str, Any]:
+    job_dict = json_format.MessageToDict(
+        job_proto,
+        always_print_fields_with_no_presence=True,
+        # Our API returns fields in snake_case.
+        preserving_proto_field_name=True,
+        use_integers_for_enums=True)
+    for field in job_proto.DESCRIPTOR.fields:
+        # Ensure optional fields are present with None values for
+        # backwards compatibility with older clients.
+        if field.has_presence and field.name not in job_dict:
+            job_dict[field.name] = None
+        # json_format.MessageToDict is meant for encoding to JSON,
+        # and Protobuf encodes int64 as decimal strings in JSON,
+        # so we need to convert them back to ints.
+        # https://protobuf.dev/programming-guides/json/#field-representation
+        if (field.type == descriptor.FieldDescriptor.TYPE_INT64 and
+                job_dict.get(field.name) is not None):
+            job_dict[field.name] = int(job_dict[field.name])
+    job_dict['status'] = managed_job_state.ManagedJobStatus.from_protobuf(
+        job_dict['status'])
+    # For backwards compatibility, convert schedule_state to a string,
+    # as we don't have the logic to handle it in our request
+    # encoder/decoder, unlike status.
+    schedule_state_enum = (
+        managed_job_state.ManagedJobScheduleState.from_protobuf(
+            job_dict['schedule_state']))
+    job_dict['schedule_state'] = (schedule_state_enum.value
+                                  if schedule_state_enum is not None else None)
+    return job_dict
+
+
 class ManagedJobCodeGen:
     """Code generator for managed job utility functions.
 
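The int64 fix-up in `_job_proto_to_dict` exists because proto3's JSON mapping renders (u)int64 values as decimal strings. A quick demonstration using the `Int64Value` well-known type; any message with an int64 field shows the same behavior:

```python
# Requires the protobuf package; Int64Value is just a convenient message
# with an int64 field to show the string encoding.
from google.protobuf import json_format
from google.protobuf import wrappers_pb2

msg = wrappers_pb2.Int64Value(value=2**53 + 1)
encoded = json_format.MessageToDict(msg)
print(repr(encoded))  # '9007199254740993' -- a string, not an int
print(int(encoded))   # converted back, as _job_proto_to_dict does
```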
@@ -1755,6 +2188,7 @@ class ManagedJobCodeGen:
             limit: Optional[int] = None,
             user_hashes: Optional[List[Optional[str]]] = None,
             statuses: Optional[List[str]] = None,
+            fields: Optional[List[str]] = None,
     ) -> str:
         code = textwrap.dedent(f"""\
             if managed_job_version < 9:
@@ -1773,7 +2207,7 @@ class ManagedJobCodeGen:
                     page={page!r},
                     limit={limit!r},
                     user_hashes={user_hashes!r})
-            else:
+            elif managed_job_version < 12:
                 job_table = utils.dump_managed_job_queue(
                     skip_finished={skip_finished},
                     accessible_workspaces={accessible_workspaces!r},
@@ -1785,6 +2219,19 @@ class ManagedJobCodeGen:
                     limit={limit!r},
                     user_hashes={user_hashes!r},
                     statuses={statuses!r})
+            else:
+                job_table = utils.dump_managed_job_queue(
+                    skip_finished={skip_finished},
+                    accessible_workspaces={accessible_workspaces!r},
+                    job_ids={job_ids!r},
+                    workspace_match={workspace_match!r},
+                    name_match={name_match!r},
+                    pool_match={pool_match!r},
+                    page={page!r},
+                    limit={limit!r},
+                    user_hashes={user_hashes!r},
+                    statuses={statuses!r},
+                    fields={fields!r})
             print(job_table, flush=True)
             """)
         return cls._build(code)
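`ManagedJobCodeGen` version-gates the code it ships to the controller, so newer client arguments (here `fields=`) are only passed to controllers that understand them. A toy sketch of that pattern, reduced to two arguments; the version cutoff and names below are illustrative, not the real generated code:

```python
import textwrap


def build_queue_code(skip_finished: bool, fields) -> str:
    # The emitted source branches on the remote controller's version at
    # runtime, so an old controller never sees the new keyword argument.
    return textwrap.dedent(f"""\
        if managed_job_version < 12:
            job_table = utils.dump_managed_job_queue(
                skip_finished={skip_finished})
        else:
            job_table = utils.dump_managed_job_queue(
                skip_finished={skip_finished},
                fields={fields!r})
        print(job_table, flush=True)
        """)


print(build_queue_code(True, ['job_id', 'status']))
```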
@@ -1852,6 +2299,18 @@ class ManagedJobCodeGen:
             """)
         return cls._build(code)
 
+    @classmethod
+    def get_version(cls) -> str:
+        """Generate code to get controller version."""
+        code = textwrap.dedent("""\
+            from sky.skylet import constants as controller_constants
+
+            # Get controller version
+            controller_version = controller_constants.SKYLET_VERSION
+            print(f"controller_version:{controller_version}", flush=True)
+            """)
+        return cls._build(code)
+
     @classmethod
     def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
         code = textwrap.dedent(f"""\
@@ -1889,8 +2348,12 @@ class ManagedJobCodeGen:
         return cls._build(code)
 
     @classmethod
-    def set_pending(cls,
-
+    def set_pending(cls,
+                    job_id: int,
+                    managed_job_dag: 'dag_lib.Dag',
+                    workspace: str,
+                    entrypoint: str,
+                    user_hash: Optional[str] = None) -> str:
         dag_name = managed_job_dag.name
         pool = managed_job_dag.pool
         # Add the managed job to queue table.
@@ -1907,6 +2370,8 @@ class ManagedJobCodeGen:
             pool_hash = serve_state.get_service_hash({pool!r})
             set_job_info_kwargs['pool'] = {pool!r}
             set_job_info_kwargs['pool_hash'] = pool_hash
+            if managed_job_version >= 11:
+                set_job_info_kwargs['user_hash'] = {user_hash!r}
             managed_job_state.set_job_info(
                 {job_id}, {dag_name!r}, **set_job_info_kwargs)
             """)