skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff covers the contents of publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
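The per-file `+added -removed` summary below can be reproduced locally. The sketch that follows is illustrative only: it assumes the two wheels named in the title have already been downloaded (for example with `pip download --no-deps skypilot-nightly==<version>`), and the hard-coded paths plus the `wheel_members` helper are hypothetical, not part of SkyPilot or the registry tooling.

```python
import difflib
import zipfile

# Hypothetical local paths -- substitute the wheels you actually downloaded.
OLD_WHEEL = 'skypilot_nightly-1.0.0.dev20250502-py3-none-any.whl'
NEW_WHEEL = 'skypilot_nightly-1.0.0.dev20251203-py3-none-any.whl'


def wheel_members(path: str) -> dict:
    """Map each file inside a wheel (a zip archive) to its decoded content."""
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode('utf-8', errors='replace')
            for name in zf.namelist()
            if not name.endswith('/')
        }


old, new = wheel_members(OLD_WHEEL), wheel_members(NEW_WHEEL)
for name in sorted(set(old) | set(new)):
    before = old.get(name, '').splitlines(keepends=True)
    after = new.get(name, '').splitlines(keepends=True)
    if before == after:
        continue
    diff = list(difflib.unified_diff(before, after, fromfile=name, tofile=name))
    added = sum(1 for line in diff
                if line.startswith('+') and not line.startswith('+++'))
    removed = sum(1 for line in diff
                  if line.startswith('-') and not line.startswith('---'))
    print(f'- {name} +{added} -{removed}')
```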
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/utils.py
CHANGED
|
@@ -4,60 +4,86 @@ NOTE: whenever an API change is made in this file, we need to bump the
|
|
|
4
4
|
jobs.constants.MANAGED_JOBS_VERSION and handle the API change in the
|
|
5
5
|
ManagedJobCodeGen.
|
|
6
6
|
"""
|
|
7
|
+
import asyncio
|
|
7
8
|
import collections
|
|
9
|
+
from datetime import datetime
|
|
8
10
|
import enum
|
|
9
11
|
import os
|
|
10
12
|
import pathlib
|
|
13
|
+
import re
|
|
11
14
|
import shlex
|
|
12
15
|
import textwrap
|
|
13
16
|
import time
|
|
14
17
|
import traceback
|
|
15
18
|
import typing
|
|
16
|
-
from typing import Any, Dict, List, Optional, Set,
|
|
19
|
+
from typing import (Any, Deque, Dict, Iterable, List, Literal, Optional, Set,
|
|
20
|
+
TextIO, Tuple, Union)
|
|
17
21
|
|
|
18
22
|
import colorama
|
|
19
23
|
import filelock
|
|
20
|
-
from typing_extensions import Literal
|
|
21
24
|
|
|
22
25
|
from sky import backends
|
|
23
26
|
from sky import exceptions
|
|
24
27
|
from sky import global_user_state
|
|
25
28
|
from sky import sky_logging
|
|
29
|
+
from sky import skypilot_config
|
|
26
30
|
from sky.adaptors import common as adaptors_common
|
|
27
31
|
from sky.backends import backend_utils
|
|
32
|
+
from sky.backends import cloud_vm_ray_backend
|
|
28
33
|
from sky.jobs import constants as managed_job_constants
|
|
29
34
|
from sky.jobs import scheduler
|
|
30
35
|
from sky.jobs import state as managed_job_state
|
|
36
|
+
from sky.schemas.api import responses
|
|
31
37
|
from sky.skylet import constants
|
|
32
38
|
from sky.skylet import job_lib
|
|
33
39
|
from sky.skylet import log_lib
|
|
34
40
|
from sky.usage import usage_lib
|
|
41
|
+
from sky.utils import annotations
|
|
35
42
|
from sky.utils import common_utils
|
|
43
|
+
from sky.utils import context_utils
|
|
44
|
+
from sky.utils import controller_utils
|
|
45
|
+
from sky.utils import infra_utils
|
|
36
46
|
from sky.utils import log_utils
|
|
37
47
|
from sky.utils import message_utils
|
|
48
|
+
from sky.utils import resources_utils
|
|
38
49
|
from sky.utils import rich_utils
|
|
39
50
|
from sky.utils import subprocess_utils
|
|
40
51
|
from sky.utils import ux_utils
|
|
41
52
|
|
|
42
53
|
if typing.TYPE_CHECKING:
|
|
54
|
+
from google.protobuf import descriptor
|
|
55
|
+
from google.protobuf import json_format
|
|
56
|
+
import grpc
|
|
43
57
|
import psutil
|
|
44
58
|
|
|
45
59
|
import sky
|
|
46
60
|
from sky import dag as dag_lib
|
|
61
|
+
from sky.schemas.generated import jobsv1_pb2
|
|
62
|
+
from sky.schemas.generated import managed_jobsv1_pb2
|
|
47
63
|
else:
|
|
64
|
+
json_format = adaptors_common.LazyImport('google.protobuf.json_format')
|
|
65
|
+
descriptor = adaptors_common.LazyImport('google.protobuf.descriptor')
|
|
48
66
|
psutil = adaptors_common.LazyImport('psutil')
|
|
67
|
+
grpc = adaptors_common.LazyImport('grpc')
|
|
68
|
+
jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
|
|
69
|
+
managed_jobsv1_pb2 = adaptors_common.LazyImport(
|
|
70
|
+
'sky.schemas.generated.managed_jobsv1_pb2')
|
|
49
71
|
|
|
50
72
|
logger = sky_logging.init_logger(__name__)
|
|
51
73
|
|
|
52
|
-
SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
|
|
53
74
|
# Controller checks its job's status every this many seconds.
|
|
54
|
-
|
|
75
|
+
# This is a tradeoff between the latency and the resource usage.
|
|
76
|
+
JOB_STATUS_CHECK_GAP_SECONDS = 15
|
|
55
77
|
|
|
56
78
|
# Controller checks if its job has started every this many seconds.
|
|
57
79
|
JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
|
|
58
80
|
|
|
59
81
|
_LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
|
|
60
82
|
|
|
83
|
+
_JOB_STATUS_FETCH_MAX_RETRIES = 3
|
|
84
|
+
_JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
|
|
85
|
+
_JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30
|
|
86
|
+
|
|
61
87
|
_JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
|
|
62
88
|
'Waiting for task to start[/]'
|
|
63
89
|
'{status_str}. It may take a few minutes.\n'
|
|
@@ -72,7 +98,35 @@ _JOB_CANCELLED_MESSAGE = (
|
|
|
72
98
|
# blocking for a long time. This should be significantly longer than the
|
|
73
99
|
# JOB_STATUS_CHECK_GAP_SECONDS to avoid timing out before the controller can
|
|
74
100
|
# update the state.
|
|
75
|
-
_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS =
|
|
101
|
+
_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
|
|
102
|
+
|
|
103
|
+
# After enabling consolidation mode, we need to restart the API server to get
|
|
104
|
+
# the jobs refresh deamon and correct number of executors. We use this file to
|
|
105
|
+
# indicate that the API server has been restarted after enabling consolidation
|
|
106
|
+
# mode.
|
|
107
|
+
_JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE = (
|
|
108
|
+
'~/.sky/.jobs_controller_consolidation_reloaded_signal')
|
|
109
|
+
|
|
110
|
+
# The response fields for managed jobs that require cluster handle
|
|
111
|
+
_CLUSTER_HANDLE_FIELDS = [
|
|
112
|
+
'cluster_resources',
|
|
113
|
+
'cluster_resources_full',
|
|
114
|
+
'cloud',
|
|
115
|
+
'region',
|
|
116
|
+
'zone',
|
|
117
|
+
'infra',
|
|
118
|
+
'accelerators',
|
|
119
|
+
]
|
|
120
|
+
|
|
121
|
+
# The response fields for managed jobs that are not stored in the database
|
|
122
|
+
# These fields will be mapped to the DB fields in the `_update_fields`.
|
|
123
|
+
_NON_DB_FIELDS = _CLUSTER_HANDLE_FIELDS + ['user_yaml', 'user_name', 'details']
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class ManagedJobQueueResultType(enum.Enum):
|
|
127
|
+
"""The type of the managed job queue result."""
|
|
128
|
+
DICT = 'DICT'
|
|
129
|
+
LIST = 'LIST'
|
|
76
130
|
|
|
77
131
|
|
|
78
132
|
class UserSignal(enum.Enum):
|
|
@@ -83,7 +137,10 @@ class UserSignal(enum.Enum):
|
|
|
83
137
|
|
|
84
138
|
|
|
85
139
|
# ====== internal functions ======
|
|
86
|
-
def terminate_cluster(
|
|
140
|
+
def terminate_cluster(
|
|
141
|
+
cluster_name: str,
|
|
142
|
+
max_retry: int = 6,
|
|
143
|
+
) -> None:
|
|
87
144
|
"""Terminate the cluster."""
|
|
88
145
|
from sky import core # pylint: disable=import-outside-toplevel
|
|
89
146
|
retry_cnt = 0
|
|
@@ -121,43 +178,313 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
|
|
|
121
178
|
time.sleep(backoff.current_backoff())
|
|
122
179
|
|
|
123
180
|
|
|
124
|
-
def
|
|
125
|
-
|
|
181
|
+
def _validate_consolidation_mode_config(
|
|
182
|
+
current_is_consolidation_mode: bool) -> None:
|
|
183
|
+
"""Validate the consolidation mode config."""
|
|
184
|
+
# Check whether the consolidation mode config is changed.
|
|
185
|
+
if current_is_consolidation_mode:
|
|
186
|
+
controller_cn = (
|
|
187
|
+
controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
|
|
188
|
+
if global_user_state.cluster_with_name_exists(controller_cn):
|
|
189
|
+
logger.warning(
|
|
190
|
+
f'{colorama.Fore.RED}Consolidation mode for jobs is enabled, '
|
|
191
|
+
f'but the controller cluster {controller_cn} is still running. '
|
|
192
|
+
'Please terminate the controller cluster first.'
|
|
193
|
+
f'{colorama.Style.RESET_ALL}')
|
|
194
|
+
else:
|
|
195
|
+
total_jobs = managed_job_state.get_managed_jobs_total()
|
|
196
|
+
if total_jobs > 0:
|
|
197
|
+
nonterminal_jobs = (
|
|
198
|
+
managed_job_state.get_nonterminal_job_ids_by_name(
|
|
199
|
+
None, None, all_users=True))
|
|
200
|
+
if nonterminal_jobs:
|
|
201
|
+
logger.warning(
|
|
202
|
+
f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
|
|
203
|
+
f'but there are still {len(nonterminal_jobs)} managed jobs '
|
|
204
|
+
'running. Please terminate those jobs first.'
|
|
205
|
+
f'{colorama.Style.RESET_ALL}')
|
|
206
|
+
else:
|
|
207
|
+
logger.warning(
|
|
208
|
+
f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
|
|
209
|
+
f'but there are {total_jobs} jobs from previous '
|
|
210
|
+
'consolidation mode. Reset the `jobs.controller.'
|
|
211
|
+
'consolidation_mode` to `true` and run `sky jobs queue` '
|
|
212
|
+
'to see those jobs. Switching to normal mode will '
|
|
213
|
+
f'lose the job history.{colorama.Style.RESET_ALL}')
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# Whether to use consolidation mode or not. When this is enabled, the managed
|
|
217
|
+
# jobs controller will not be running on a separate cluster, but locally on the
|
|
218
|
+
# API Server. Under the hood, we submit the job monitoring logic as processes
|
|
219
|
+
# directly in the API Server.
|
|
220
|
+
# Use LRU Cache so that the check is only done once.
|
|
221
|
+
@annotations.lru_cache(scope='request', maxsize=2)
|
|
222
|
+
def is_consolidation_mode(on_api_restart: bool = False) -> bool:
|
|
223
|
+
if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
|
|
224
|
+
return True
|
|
225
|
+
|
|
226
|
+
config_consolidation_mode = skypilot_config.get_nested(
|
|
227
|
+
('jobs', 'controller', 'consolidation_mode'), default_value=False)
|
|
228
|
+
|
|
229
|
+
signal_file = pathlib.Path(
|
|
230
|
+
_JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE).expanduser()
|
|
231
|
+
|
|
232
|
+
if on_api_restart:
|
|
233
|
+
if config_consolidation_mode:
|
|
234
|
+
signal_file.touch()
|
|
235
|
+
else:
|
|
236
|
+
restart_signal_file_exists = signal_file.exists()
|
|
237
|
+
if not restart_signal_file_exists:
|
|
238
|
+
if config_consolidation_mode:
|
|
239
|
+
logger.warning(f'{colorama.Fore.YELLOW}Consolidation mode for '
|
|
240
|
+
'managed jobs is enabled in the server config, '
|
|
241
|
+
'but the API server has not been restarted yet. '
|
|
242
|
+
'Please restart the API server to enable it.'
|
|
243
|
+
f'{colorama.Style.RESET_ALL}')
|
|
244
|
+
return False
|
|
245
|
+
elif not config_consolidation_mode:
|
|
246
|
+
# Cleanup the signal file if the consolidation mode is disabled in
|
|
247
|
+
# the config. This allow the user to disable the consolidation mode
|
|
248
|
+
# without restarting the API server.
|
|
249
|
+
signal_file.unlink()
|
|
250
|
+
|
|
251
|
+
# We should only do this check on API server, as the controller will not
|
|
252
|
+
# have related config and will always seemingly disabled for consolidation
|
|
253
|
+
# mode. Check #6611 for more details.
|
|
254
|
+
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
|
255
|
+
_validate_consolidation_mode_config(config_consolidation_mode)
|
|
256
|
+
return config_consolidation_mode
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def ha_recovery_for_consolidation_mode() -> None:
|
|
260
|
+
"""Recovery logic for consolidation mode.
|
|
261
|
+
|
|
262
|
+
This should only be called from the managed-job-status-refresh-daemon, due
|
|
263
|
+
so that we have correct ordering recovery -> controller start -> job status
|
|
264
|
+
updates. This also should ensure correct operation during a rolling update.
|
|
265
|
+
"""
|
|
266
|
+
# No setup recovery is needed in consolidation mode, as the API server
|
|
267
|
+
# already has all runtime installed. Directly start jobs recovery here.
|
|
268
|
+
# Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
|
|
269
|
+
scheduler.maybe_start_controllers()
|
|
270
|
+
with open(constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format('jobs_'),
|
|
271
|
+
'a',
|
|
272
|
+
encoding='utf-8') as f:
|
|
273
|
+
start = time.time()
|
|
274
|
+
f.write(f'Starting HA recovery at {datetime.now()}\n')
|
|
275
|
+
jobs, _ = managed_job_state.get_managed_jobs_with_filters(fields=[
|
|
276
|
+
'job_id', 'controller_pid', 'controller_pid_started_at',
|
|
277
|
+
'schedule_state', 'status'
|
|
278
|
+
])
|
|
279
|
+
for job in jobs:
|
|
280
|
+
job_id = job['job_id']
|
|
281
|
+
controller_pid = job['controller_pid']
|
|
282
|
+
controller_pid_started_at = job.get('controller_pid_started_at')
|
|
283
|
+
|
|
284
|
+
# In consolidation mode, it is possible that only the API server
|
|
285
|
+
# process is restarted, and the controller process is not. In such
|
|
286
|
+
# case, we don't need to do anything and the controller process will
|
|
287
|
+
# just keep running. However, in most cases, the controller process
|
|
288
|
+
# will also be stopped - either by a pod restart in k8s API server,
|
|
289
|
+
# or by `sky api stop`, which will stop controllers.
|
|
290
|
+
# TODO(cooperc): Make sure we cannot have a controller process
|
|
291
|
+
# running across API server restarts for consistency.
|
|
292
|
+
if controller_pid is not None:
|
|
293
|
+
try:
|
|
294
|
+
# Note: We provide the legacy job id to the
|
|
295
|
+
# controller_process_alive just in case, but we shouldn't
|
|
296
|
+
# have a running legacy job controller process at this point
|
|
297
|
+
if controller_process_alive(
|
|
298
|
+
managed_job_state.ControllerPidRecord(
|
|
299
|
+
pid=controller_pid,
|
|
300
|
+
started_at=controller_pid_started_at), job_id):
|
|
301
|
+
message = (f'Controller pid {controller_pid} for '
|
|
302
|
+
f'job {job_id} is still running. '
|
|
303
|
+
'Skipping recovery.\n')
|
|
304
|
+
logger.debug(message)
|
|
305
|
+
f.write(message)
|
|
306
|
+
continue
|
|
307
|
+
except Exception: # pylint: disable=broad-except
|
|
308
|
+
# _controller_process_alive may raise if psutil fails; we
|
|
309
|
+
# should not crash the recovery logic because of this.
|
|
310
|
+
message = ('Error checking controller pid '
|
|
311
|
+
f'{controller_pid} for job {job_id}\n')
|
|
312
|
+
logger.warning(message, exc_info=True)
|
|
313
|
+
f.write(message)
|
|
314
|
+
|
|
315
|
+
# Controller process is not set or not alive.
|
|
316
|
+
if job['schedule_state'] not in [
|
|
317
|
+
managed_job_state.ManagedJobScheduleState.DONE,
|
|
318
|
+
managed_job_state.ManagedJobScheduleState.WAITING,
|
|
319
|
+
# INACTIVE job may be mid-submission, don't set to WAITING.
|
|
320
|
+
managed_job_state.ManagedJobScheduleState.INACTIVE,
|
|
321
|
+
]:
|
|
322
|
+
managed_job_state.reset_job_for_recovery(job_id)
|
|
323
|
+
message = (f'Job {job_id} completed recovery at '
|
|
324
|
+
f'{datetime.now()}\n')
|
|
325
|
+
logger.info(message)
|
|
326
|
+
f.write(message)
|
|
327
|
+
f.write(f'HA recovery completed at {datetime.now()}\n')
|
|
328
|
+
f.write(f'Total recovery time: {time.time() - start} seconds\n')
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
async def get_job_status(
|
|
332
|
+
backend: 'backends.CloudVmRayBackend', cluster_name: str,
|
|
333
|
+
job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
|
|
126
334
|
"""Check the status of the job running on a managed job cluster.
|
|
127
335
|
|
|
128
336
|
It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
|
|
129
337
|
FAILED_SETUP or CANCELLED.
|
|
130
338
|
"""
|
|
131
|
-
|
|
339
|
+
# TODO(luca) make this async
|
|
340
|
+
handle = await context_utils.to_thread(
|
|
341
|
+
global_user_state.get_handle_from_cluster_name, cluster_name)
|
|
132
342
|
if handle is None:
|
|
133
343
|
# This can happen if the cluster was preempted and background status
|
|
134
344
|
# refresh already noticed and cleaned it up.
|
|
135
345
|
logger.info(f'Cluster {cluster_name} not found.')
|
|
136
346
|
return None
|
|
137
347
|
assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
|
|
138
|
-
|
|
348
|
+
job_ids = None if job_id is None else [job_id]
|
|
349
|
+
for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
|
|
350
|
+
try:
|
|
351
|
+
logger.info('=== Checking the job status... ===')
|
|
352
|
+
statuses = await asyncio.wait_for(
|
|
353
|
+
context_utils.to_thread(backend.get_job_status,
|
|
354
|
+
handle,
|
|
355
|
+
job_ids=job_ids,
|
|
356
|
+
stream_logs=False),
|
|
357
|
+
timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
|
|
358
|
+
status = list(statuses.values())[0]
|
|
359
|
+
if status is None:
|
|
360
|
+
logger.info('No job found.')
|
|
361
|
+
else:
|
|
362
|
+
logger.info(f'Job status: {status}')
|
|
363
|
+
logger.info('=' * 34)
|
|
364
|
+
return status
|
|
365
|
+
except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
|
|
366
|
+
ValueError, TypeError, asyncio.TimeoutError) as e:
|
|
367
|
+
# Note: Each of these exceptions has some additional conditions to
|
|
368
|
+
# limit how we handle it and whether or not we catch it.
|
|
369
|
+
# Retry on k8s transient network errors. This is useful when using
|
|
370
|
+
# coreweave which may have transient network issue sometimes.
|
|
371
|
+
is_transient_error = False
|
|
372
|
+
detailed_reason = None
|
|
373
|
+
if isinstance(e, exceptions.CommandError):
|
|
374
|
+
detailed_reason = e.detailed_reason
|
|
375
|
+
if (detailed_reason is not None and
|
|
376
|
+
_JOB_K8S_TRANSIENT_NW_MSG in detailed_reason):
|
|
377
|
+
is_transient_error = True
|
|
378
|
+
elif isinstance(e, grpc.RpcError):
|
|
379
|
+
detailed_reason = e.details()
|
|
380
|
+
if e.code() in [
|
|
381
|
+
grpc.StatusCode.UNAVAILABLE,
|
|
382
|
+
grpc.StatusCode.DEADLINE_EXCEEDED
|
|
383
|
+
]:
|
|
384
|
+
is_transient_error = True
|
|
385
|
+
elif isinstance(e, grpc.FutureTimeoutError):
|
|
386
|
+
detailed_reason = 'Timeout'
|
|
387
|
+
elif isinstance(e, asyncio.TimeoutError):
|
|
388
|
+
detailed_reason = ('Job status check timed out after '
|
|
389
|
+
f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
|
|
390
|
+
# TODO(cooperc): Gracefully handle these exceptions in the backend.
|
|
391
|
+
elif isinstance(e, ValueError):
|
|
392
|
+
# If the cluster yaml is deleted in the middle of getting the
|
|
393
|
+
# SSH credentials, we could see this. See
|
|
394
|
+
# sky/global_user_state.py get_cluster_yaml_dict.
|
|
395
|
+
if re.search(r'Cluster yaml .* not found', str(e)):
|
|
396
|
+
detailed_reason = 'Cluster yaml was deleted'
|
|
397
|
+
else:
|
|
398
|
+
raise
|
|
399
|
+
elif isinstance(e, TypeError):
|
|
400
|
+
# We will grab the SSH credentials from the cluster yaml, but if
|
|
401
|
+
# handle.cluster_yaml is None, we will just return an empty dict
|
|
402
|
+
# for the credentials. See
|
|
403
|
+
# backend_utils.ssh_credential_from_yaml. Then, the credentials
|
|
404
|
+
# are passed as kwargs to SSHCommandRunner.__init__ - see
|
|
405
|
+
# cloud_vm_ray_backend.get_command_runners. So we can hit this
|
|
406
|
+
# TypeError if the cluster yaml is removed from the handle right
|
|
407
|
+
# when we pull it before the cluster is fully deleted.
|
|
408
|
+
error_msg_to_check = (
|
|
409
|
+
'SSHCommandRunner.__init__() missing 2 required positional '
|
|
410
|
+
'arguments: \'ssh_user\' and \'ssh_private_key\'')
|
|
411
|
+
if str(e) == error_msg_to_check:
|
|
412
|
+
detailed_reason = 'SSH credentials were already cleaned up'
|
|
413
|
+
else:
|
|
414
|
+
raise
|
|
415
|
+
if is_transient_error:
|
|
416
|
+
logger.info('Failed to connect to the cluster. Retrying '
|
|
417
|
+
f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
|
|
418
|
+
logger.info('=' * 34)
|
|
419
|
+
await asyncio.sleep(1)
|
|
420
|
+
else:
|
|
421
|
+
logger.info(f'Failed to get job status: {detailed_reason}')
|
|
422
|
+
logger.info('=' * 34)
|
|
423
|
+
return None
|
|
424
|
+
return None
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def controller_process_alive(record: managed_job_state.ControllerPidRecord,
|
|
428
|
+
legacy_job_id: Optional[int] = None,
|
|
429
|
+
quiet: bool = True) -> bool:
|
|
430
|
+
"""Check if the controller process is alive.
|
|
431
|
+
|
|
432
|
+
If legacy_job_id is provided, this will also return True for a legacy
|
|
433
|
+
single-job controller process with that job id, based on the cmdline. This
|
|
434
|
+
is how the old check worked before #7051.
|
|
435
|
+
"""
|
|
139
436
|
try:
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
437
|
+
process = psutil.Process(record.pid)
|
|
438
|
+
|
|
439
|
+
if record.started_at is not None:
|
|
440
|
+
if process.create_time() != record.started_at:
|
|
441
|
+
if not quiet:
|
|
442
|
+
logger.debug(f'Controller process {record.pid} has started '
|
|
443
|
+
f'at {record.started_at} but process has '
|
|
444
|
+
f'started at {process.create_time()}')
|
|
445
|
+
return False
|
|
145
446
|
else:
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
447
|
+
# If we can't check the create_time try to check the cmdline instead
|
|
448
|
+
cmd_str = ' '.join(process.cmdline())
|
|
449
|
+
# pylint: disable=line-too-long
|
|
450
|
+
# Pre-#7051 cmdline: /path/to/python -u -m sky.jobs.controller <dag.yaml_path> --job-id <job_id>
|
|
451
|
+
# Post-#7051 cmdline: /path/to/python -u -msky.jobs.controller
|
|
452
|
+
# pylint: enable=line-too-long
|
|
453
|
+
if ('-m sky.jobs.controller' not in cmd_str and
|
|
454
|
+
'-msky.jobs.controller' not in cmd_str):
|
|
455
|
+
if not quiet:
|
|
456
|
+
logger.debug(f'Process {record.pid} is not a controller '
|
|
457
|
+
'process - missing "-m sky.jobs.controller" '
|
|
458
|
+
f'from cmdline: {cmd_str}')
|
|
459
|
+
return False
|
|
460
|
+
if (legacy_job_id is not None and '--job-id' in cmd_str and
|
|
461
|
+
f'--job-id {legacy_job_id}' not in cmd_str):
|
|
462
|
+
if not quiet:
|
|
463
|
+
logger.debug(f'Controller process {record.pid} has the '
|
|
464
|
+
f'wrong --job-id (expected {legacy_job_id}) '
|
|
465
|
+
f'in cmdline: {cmd_str}')
|
|
466
|
+
return False
|
|
467
|
+
|
|
468
|
+
# On linux, psutil.Process(pid) will return a valid process object
|
|
469
|
+
# even if the pid is actually a thread ID within the process. This
|
|
470
|
+
# hugely inflates the number of valid-looking pids, increasing the
|
|
471
|
+
# chance that we will falsely believe a controller is alive. The pid
|
|
472
|
+
# file should never contain thread IDs, just process IDs. We can
|
|
473
|
+
# check this with psutil.pid_exists(pid), which is false for TIDs.
|
|
474
|
+
# See pid_exists in psutil/_pslinux.py
|
|
475
|
+
if not psutil.pid_exists(record.pid):
|
|
476
|
+
if not quiet:
|
|
477
|
+
logger.debug(
|
|
478
|
+
f'Controller process {record.pid} is not a valid '
|
|
479
|
+
'process id.')
|
|
480
|
+
return False
|
|
151
481
|
|
|
482
|
+
return process.is_running()
|
|
152
483
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
# The last two args of the command line should be --job-id <id>
|
|
158
|
-
job_args = process.cmdline()[-2:]
|
|
159
|
-
return process.is_running() and job_args == ['--job-id', str(job_id)]
|
|
160
|
-
except psutil.NoSuchProcess:
|
|
484
|
+
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess,
|
|
485
|
+
OSError) as e:
|
|
486
|
+
if not quiet:
|
|
487
|
+
logger.debug(f'Controller process {record.pid} is not running: {e}')
|
|
161
488
|
return False
|
|
162
489
|
|
|
163
490
|
|
|
@@ -173,6 +500,17 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
|
|
|
173
500
|
Note: we expect that job_id, if provided, refers to a nonterminal job or a
|
|
174
501
|
job that has not completed its cleanup (schedule state not DONE).
|
|
175
502
|
"""
|
|
503
|
+
# This signal file suggests that the controller is recovering from a
|
|
504
|
+
# failure. See sky/templates/kubernetes-ray.yml.j2 for more details.
|
|
505
|
+
# When restarting the controller processes, we don't want this event to
|
|
506
|
+
# set the job status to FAILED_CONTROLLER.
|
|
507
|
+
# TODO(tian): Change this to restart the controller process. For now we
|
|
508
|
+
# disabled it when recovering because we want to avoid caveats of infinite
|
|
509
|
+
# restart of last controller process that fully occupied the controller VM.
|
|
510
|
+
if os.path.exists(
|
|
511
|
+
os.path.expanduser(
|
|
512
|
+
constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
|
|
513
|
+
return
|
|
176
514
|
|
|
177
515
|
def _cleanup_job_clusters(job_id: int) -> Optional[str]:
|
|
178
516
|
"""Clean up clusters for a job. Returns error message if any.
|
|
@@ -181,15 +519,22 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
|
|
|
181
519
|
capture the error message, and log/return it.
|
|
182
520
|
"""
|
|
183
521
|
error_msg = None
|
|
184
|
-
tasks = managed_job_state.
|
|
522
|
+
tasks = managed_job_state.get_managed_job_tasks(job_id)
|
|
185
523
|
for task in tasks:
|
|
186
|
-
|
|
187
|
-
|
|
524
|
+
pool = task.get('pool', None)
|
|
525
|
+
if pool is None:
|
|
526
|
+
task_name = task['job_name']
|
|
527
|
+
cluster_name = generate_managed_job_cluster_name(
|
|
528
|
+
task_name, job_id)
|
|
529
|
+
else:
|
|
530
|
+
cluster_name, _ = (
|
|
531
|
+
managed_job_state.get_pool_submit_info(job_id))
|
|
188
532
|
handle = global_user_state.get_handle_from_cluster_name(
|
|
189
533
|
cluster_name)
|
|
190
534
|
if handle is not None:
|
|
191
535
|
try:
|
|
192
|
-
|
|
536
|
+
if pool is None:
|
|
537
|
+
terminate_cluster(cluster_name)
|
|
193
538
|
except Exception as e: # pylint: disable=broad-except
|
|
194
539
|
error_msg = (
|
|
195
540
|
f'Failed to terminate cluster {cluster_name}: '
|
|
@@ -197,43 +542,6 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
|
|
|
197
542
|
logger.exception(error_msg, exc_info=e)
|
|
198
543
|
return error_msg
|
|
199
544
|
|
|
200
|
-
# For backwards compatible jobs
|
|
201
|
-
# TODO(cooperc): Remove before 0.11.0.
|
|
202
|
-
def _handle_legacy_job(job_id: int):
|
|
203
|
-
controller_status = job_lib.get_status(job_id)
|
|
204
|
-
if controller_status is None or controller_status.is_terminal():
|
|
205
|
-
logger.error(f'Controller process for legacy job {job_id} is '
|
|
206
|
-
'in an unexpected state.')
|
|
207
|
-
|
|
208
|
-
cleanup_error = _cleanup_job_clusters(job_id)
|
|
209
|
-
if cleanup_error:
|
|
210
|
-
# Unconditionally set the job to failed_controller if the
|
|
211
|
-
# cleanup fails.
|
|
212
|
-
managed_job_state.set_failed(
|
|
213
|
-
job_id,
|
|
214
|
-
task_id=None,
|
|
215
|
-
failure_type=managed_job_state.ManagedJobStatus.
|
|
216
|
-
FAILED_CONTROLLER,
|
|
217
|
-
failure_reason=
|
|
218
|
-
'Legacy controller process has exited abnormally, and '
|
|
219
|
-
f'cleanup failed: {cleanup_error}. For more details, run: '
|
|
220
|
-
f'sky jobs logs --controller {job_id}',
|
|
221
|
-
override_terminal=True)
|
|
222
|
-
return
|
|
223
|
-
|
|
224
|
-
# It's possible for the job to have transitioned to
|
|
225
|
-
# another terminal state while between when we checked its
|
|
226
|
-
# state and now. In that case, set_failed won't do
|
|
227
|
-
# anything, which is fine.
|
|
228
|
-
managed_job_state.set_failed(
|
|
229
|
-
job_id,
|
|
230
|
-
task_id=None,
|
|
231
|
-
failure_type=managed_job_state.ManagedJobStatus.
|
|
232
|
-
FAILED_CONTROLLER,
|
|
233
|
-
failure_reason=(
|
|
234
|
-
'Legacy controller process has exited abnormally. For '
|
|
235
|
-
f'more details, run: sky jobs logs --controller {job_id}'))
|
|
236
|
-
|
|
237
545
|
# Get jobs that need checking (non-terminal or not DONE)
|
|
238
546
|
job_ids = managed_job_state.get_jobs_to_check_status(job_id)
|
|
239
547
|
if not job_ids:
|
|
@@ -242,29 +550,23 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
|
|
|
242
550
|
return
|
|
243
551
|
|
|
244
552
|
for job_id in job_ids:
|
|
245
|
-
|
|
553
|
+
assert job_id is not None
|
|
554
|
+
tasks = managed_job_state.get_managed_job_tasks(job_id)
|
|
246
555
|
 # Note: controller_pid and schedule_state are in the job_info table
 # which is joined to the spot table, so all tasks with the same job_id
 # will have the same value for these columns. This is what lets us just
 # take tasks[0]['controller_pid'] and tasks[0]['schedule_state'].
 schedule_state = tasks[0]['schedule_state']

-# Backwards compatibility: this job was submitted when ray was still
-# used for managing the parallelism of job controllers, before #4485.
-# TODO(cooperc): Remove before 0.11.0.
-if (schedule_state is
-        managed_job_state.ManagedJobScheduleState.INVALID):
-    _handle_legacy_job(job_id)
-    continue
-
 # Handle jobs with schedule state (non-legacy jobs):
 pid = tasks[0]['controller_pid']
+pid_started_at = tasks[0].get('controller_pid_started_at')
 if schedule_state == managed_job_state.ManagedJobScheduleState.DONE:
     # There are two cases where we could get a job that is DONE.
     # 1. At query time (get_jobs_to_check_status), the job was not yet
-    # DONE, but since then (before
-    # hit a terminal status, marked itself done, and exited.
-    # fine.
+    # DONE, but since then (before get_managed_job_tasks is called)
+    # it has hit a terminal status, marked itself done, and exited.
+    # This is fine.
     # 2. The job is DONE, but in a non-terminal status. This is
     # unexpected. For instance, the task status is RUNNING, but the
     # job schedule_state is DONE.
@@ -311,7 +613,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
     failure_reason = f'No controller pid set for {schedule_state.value}'
 else:
     logger.debug(f'Checking controller pid {pid}')
-    if
+    if controller_process_alive(
+            managed_job_state.ControllerPidRecord(
+                pid=pid, started_at=pid_started_at), job_id):
         # The controller is still running, so this job is fine.
         continue

@@ -369,11 +673,34 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):


 def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
-                      get_end_time: bool) -> float:
+                      job_id: Optional[int], get_end_time: bool) -> float:
     """Get the submitted/ended time of the job."""
-    code = job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
-        job_id=None, get_ended_time=get_end_time)
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    assert handle is not None, (
+        f'handle for cluster {cluster_name!r} should not be None')
+    if handle.is_grpc_enabled_with_flag:
+        try:
+            if get_end_time:
+                end_ts_request = jobsv1_pb2.GetJobEndedTimestampRequest(
+                    job_id=job_id)
+                end_ts_response = backend_utils.invoke_skylet_with_retries(
+                    lambda: cloud_vm_ray_backend.SkyletClient(
+                        handle.get_grpc_channel()).get_job_ended_timestamp(
+                            end_ts_request))
+                return end_ts_response.timestamp
+            else:
+                submit_ts_request = jobsv1_pb2.GetJobSubmittedTimestampRequest(
+                    job_id=job_id)
+                submit_ts_response = backend_utils.invoke_skylet_with_retries(
+                    lambda: cloud_vm_ray_backend.SkyletClient(
+                        handle.get_grpc_channel()).get_job_submitted_timestamp(
+                            submit_ts_request))
+                return submit_ts_response.timestamp
+        except exceptions.SkyletMethodNotImplementedError:
+            pass
+
+    code = (job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
+        job_id=job_id, get_ended_time=get_end_time))
     returncode, stdout, stderr = backend.run_on_head(handle,
                                                      code,
                                                      stream_logs=False,
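The hunk above makes get_job_timestamp try a Skylet gRPC call first and only fall back to the legacy generated-code path when the remote skylet does not implement the new RPC. A minimal sketch of that fallback pattern; the helper names and the exception class here are hypothetical stand-ins, not SkyPilot APIs:

# Sketch only: callers inject the two strategies, so this runs standalone.
from typing import Callable, Optional

class MethodNotImplemented(Exception):
    """Raised when the remote controller is too old for the new RPC."""

def get_timestamp(job_id: Optional[int],
                  via_grpc: Callable[[Optional[int]], float],
                  via_codegen: Callable[[Optional[int]], float]) -> float:
    # Prefer the structured RPC; if the remote side is older and does not
    # implement it, fall back to the legacy code-generation path.
    try:
        return via_grpc(job_id)
    except MethodNotImplemented:
        return via_codegen(job_id)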
@@ -386,16 +713,24 @@ def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,


 def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
-                            cluster_name: str) -> float:
+                            cluster_name: str, job_id: Optional[int]) -> float:
     """Try to get the end time of the job.

     If the job is preempted or we can't connect to the instance for whatever
     reason, fall back to the current time.
     """
     try:
-        return get_job_timestamp(backend,
-
-
+        return get_job_timestamp(backend,
+                                 cluster_name,
+                                 job_id=job_id,
+                                 get_end_time=True)
+    except (exceptions.CommandError, grpc.RpcError,
+            grpc.FutureTimeoutError) as e:
+        if isinstance(e, exceptions.CommandError) and e.returncode == 255 or \
+            (isinstance(e, grpc.RpcError) and e.code() in [
+                grpc.StatusCode.UNAVAILABLE,
+                grpc.StatusCode.DEADLINE_EXCEEDED,
+            ]) or isinstance(e, grpc.FutureTimeoutError):
             # Failed to connect - probably the instance was preempted since the
             # job completed. We shouldn't crash here, so just log and use the
             # current time.
@@ -407,7 +742,9 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
         raise


-def event_callback_func(
+def event_callback_func(
+        job_id: int, task_id: Optional[int],
+        task: Optional['sky.Task']) -> managed_job_state.AsyncCallbackType:
     """Run event callback for the task."""

     def callback_func(status: str):
@@ -415,8 +752,12 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
         if event_callback is None or task is None:
             return
         event_callback = event_callback.strip()
-
-
+        pool = managed_job_state.get_pool_from_job_id(job_id)
+        if pool is not None:
+            cluster_name, _ = (managed_job_state.get_pool_submit_info(job_id))
+        else:
+            cluster_name = generate_managed_job_cluster_name(
+                task.name, job_id) if task.name else None
         logger.info(f'=== START: event callback for {status!r} ===')
         log_path = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                 'managed_job_event',
@@ -442,7 +783,10 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
             f'Bash:{event_callback},log_path:{log_path},result:{result}')
         logger.info(f'=== END: event callback for {status!r} ===')

-
+    async def async_callback_func(status: str):
+        return await context_utils.to_thread(callback_func, status)
+
+    return async_callback_func


 # ======== user functions ========
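The new async_callback_func simply runs the existing blocking callback in a worker thread so it can be awaited from the controller's event loop. Assuming context_utils.to_thread behaves like the standard asyncio.to_thread, the pattern reduces to this sketch:

import asyncio

def make_async(callback):
    """Wrap a blocking callback so awaiting it does not block the event loop."""
    async def async_callback(status: str):
        # asyncio.to_thread runs the blocking function in the default executor.
        return await asyncio.to_thread(callback, status)
    return async_callback

# Usage sketch:
#   cb = make_async(lambda status: print(f'event: {status}'))
#   asyncio.run(cb('RUNNING'))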
@@ -461,20 +805,24 @@ def generate_managed_job_cluster_name(task_name: str, job_id: int) -> str:


 def cancel_jobs_by_id(job_ids: Optional[List[int]],
-                      all_users: bool = False
+                      all_users: bool = False,
+                      current_workspace: Optional[str] = None,
+                      user_hash: Optional[str] = None) -> str:
     """Cancel jobs by id.

     If job_ids is None, cancel all jobs.
     """
     if job_ids is None:
         job_ids = managed_job_state.get_nonterminal_job_ids_by_name(
-            None, all_users)
+            None, user_hash, all_users)
     job_ids = list(set(job_ids))
     if not job_ids:
         return 'No job to cancel.'
-
-
+    if current_workspace is None:
+        current_workspace = constants.SKYPILOT_DEFAULT_WORKSPACE
+
     cancelled_job_ids: List[int] = []
+    wrong_workspace_job_ids: List[int] = []
     for job_id in job_ids:
         # Check the status of the managed job status. If it is in
         # terminal state, we can safely skip it.
@@ -486,30 +834,70 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
             logger.info(f'Job {job_id} is already in terminal state '
                         f'{job_status.value}. Skipped.')
             continue
+        elif job_status == managed_job_state.ManagedJobStatus.PENDING:
+            # the "if PENDING" is a short circuit, this will be atomic.
+            cancelled = managed_job_state.set_pending_cancelled(job_id)
+            if cancelled:
+                cancelled_job_ids.append(job_id)
+            continue

         update_managed_jobs_statuses(job_id)

-
-
-
-
-
-
-
-
+        job_workspace = managed_job_state.get_workspace(job_id)
+        if current_workspace is not None and job_workspace != current_workspace:
+            wrong_workspace_job_ids.append(job_id)
+            continue
+
+        if managed_job_state.is_legacy_controller_process(job_id):
+            # The job is running on a legacy single-job controller process.
+            # TODO(cooperc): Remove this handling for 0.13.0
+
+            # Send the signal to the jobs controller.
+            signal_file = (pathlib.Path(
+                managed_job_constants.SIGNAL_FILE_PREFIX.format(job_id)))
+            # Filelock is needed to prevent race condition between signal
+            # check/removal and signal writing.
+            with filelock.FileLock(str(signal_file) + '.lock'):
+                with signal_file.open('w', encoding='utf-8') as f:
+                    f.write(UserSignal.CANCEL.value)
+                    f.flush()
+        else:
+            # New controller process.
+            try:
+                signal_file = pathlib.Path(
+                    managed_job_constants.CONSOLIDATED_SIGNAL_PATH, f'{job_id}')
+                signal_file.touch()
+            except OSError as e:
+                logger.error(f'Failed to cancel job {job_id}: {e}')
+                # Don't add it to the to be cancelled job ids
+                continue
+
         cancelled_job_ids.append(job_id)

+    wrong_workspace_job_str = ''
+    if wrong_workspace_job_ids:
+        plural = 's' if len(wrong_workspace_job_ids) > 1 else ''
+        plural_verb = 'are' if len(wrong_workspace_job_ids) > 1 else 'is'
+        wrong_workspace_job_str = (
+            f' Job{plural} with ID{plural}'
+            f' {", ".join(map(str, wrong_workspace_job_ids))} '
+            f'{plural_verb} skipped as they are not in the active workspace '
+            f'{current_workspace!r}. Check the workspace of the job with: '
+            f'sky jobs queue')
+
     if not cancelled_job_ids:
-        return 'No job to cancel.'
+        return f'No job to cancel.{wrong_workspace_job_str}'
     identity_str = f'Job with ID {cancelled_job_ids[0]} is'
     if len(cancelled_job_ids) > 1:
         cancelled_job_ids_str = ', '.join(map(str, cancelled_job_ids))
         identity_str = f'Jobs with IDs {cancelled_job_ids_str} are'

-
+    msg = f'{identity_str} scheduled to be cancelled.{wrong_workspace_job_str}'
+    return msg


-def cancel_job_by_name(job_name: str
+def cancel_job_by_name(job_name: str,
+                       current_workspace: Optional[str] = None) -> str:
     """Cancel a job by name."""
     job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name)
     if not job_ids:
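For legacy single-job controllers, cancellation is requested by writing a signal file, guarded by a file lock so the controller cannot check and remove the file while a writer is mid-write. A small self-contained sketch of that handshake; the directory path here is illustrative, not the constant SkyPilot uses:

import pathlib
import filelock  # third-party: pip install filelock

SIGNAL_DIR = pathlib.Path('/tmp/example_job_signals')  # illustrative path

def request_cancel(job_id: int) -> None:
    SIGNAL_DIR.mkdir(parents=True, exist_ok=True)
    signal_file = SIGNAL_DIR / str(job_id)
    # The lock serializes writers against the controller's check-and-remove.
    with filelock.FileLock(str(signal_file) + '.lock'):
        signal_file.write_text('CANCEL', encoding='utf-8')

def poll_cancel(job_id: int) -> bool:
    signal_file = SIGNAL_DIR / str(job_id)
    with filelock.FileLock(str(signal_file) + '.lock'):
        if signal_file.exists():
            signal_file.unlink()  # consume the signal exactly once
            return True
        return False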
@@ -518,11 +906,30 @@ def cancel_job_by_name(job_name: str) -> str:
         return (f'{colorama.Fore.RED}Multiple running jobs found '
                 f'with name {job_name!r}.\n'
                 f'Job IDs: {job_ids}{colorama.Style.RESET_ALL}')
-    cancel_jobs_by_id(job_ids)
-    return f'
+    msg = cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
+    return f'{job_name!r} {msg}'


-def
+def cancel_jobs_by_pool(pool_name: str,
+                        current_workspace: Optional[str] = None) -> str:
+    """Cancel all jobs in a pool."""
+    job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(pool_name)
+    if not job_ids:
+        return f'No running job found in pool {pool_name!r}.'
+    return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
+
+
+def controller_log_file_for_job(job_id: int,
+                                create_if_not_exists: bool = False) -> str:
+    log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+    if create_if_not_exists:
+        os.makedirs(log_dir, exist_ok=True)
+    return os.path.join(log_dir, f'{job_id}.log')
+
+
+def stream_logs_by_id(job_id: int,
+                      follow: bool = True,
+                      tail: Optional[int] = None) -> Tuple[str, int]:
     """Stream logs by job id.

     Returns:
@@ -552,18 +959,60 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
         if managed_job_status.is_failed():
             job_msg = ('\nFailure reason: '
                        f'{managed_job_state.get_failure_reason(job_id)}')
-
-
-
-
-
-
-
-
-
+        log_file_ever_existed = False
+        task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
+            job_id)
+        num_tasks = len(task_info)
+        for (task_id, task_name, task_status, log_file,
+             logs_cleaned_at) in task_info:
+            if log_file:
+                log_file_ever_existed = True
+                if logs_cleaned_at is not None:
+                    ts_str = datetime.fromtimestamp(
+                        logs_cleaned_at).strftime('%Y-%m-%d %H:%M:%S')
+                    print(f'Task {task_name}({task_id}) log has been '
+                          f'cleaned at {ts_str}.')
+                    continue
+                task_str = (f'Task {task_name}({task_id})'
+                            if task_name else f'Task {task_id}')
+                if num_tasks > 1:
+                    print(f'=== {task_str} ===')
+                with open(os.path.expanduser(log_file),
+                          'r',
+                          encoding='utf-8') as f:
+                    # Stream the logs to the console without reading the
+                    # whole file into memory.
+                    start_streaming = False
+                    read_from: Union[TextIO, Deque[str]] = f
+                    if tail is not None:
+                        assert tail > 0
+                        # Read only the last 'tail' lines using deque
+                        read_from = collections.deque(f, maxlen=tail)
+                        # We set start_streaming to True here in case
+                        # truncating the log file removes the line that
+                        # contains LOG_FILE_START_STREAMING_AT. This does
+                        # not cause issues for log files shorter than tail
+                        # because tail_logs in sky/skylet/log_lib.py also
+                        # handles LOG_FILE_START_STREAMING_AT.
                         start_streaming = True
-
-
+                    for line in read_from:
+                        if log_lib.LOG_FILE_START_STREAMING_AT in line:
+                            start_streaming = True
+                        if start_streaming:
+                            print(line, end='', flush=True)
+                if num_tasks > 1:
+                    # Add the "Task finished" message for terminal states
+                    if task_status.is_terminal():
+                        print(ux_utils.finishing_message(
+                            f'{task_str} finished '
+                            f'(status: {task_status.value}).'),
+                              flush=True)
+        if log_file_ever_existed:
+            # Add the "Job finished" message for terminal states
+            if managed_job_status.is_terminal():
+                print(ux_utils.finishing_message(
+                    f'Job finished (status: {managed_job_status.value}).'),
+                      flush=True)
         return '', exceptions.JobExitCode.from_managed_job_status(
             managed_job_status)
     return (f'{colorama.Fore.YELLOW}'
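The tail support added above relies on collections.deque(maxlen=n): iterating the open file feeds lines through a bounded deque, so only the last n lines ever stay in memory. A standalone sketch of that technique:

import collections
from typing import List

def tail_lines(path: str, n: int) -> List[str]:
    """Return the last n lines of a text file without loading it fully."""
    with open(path, 'r', encoding='utf-8') as f:
        # deque drops lines from the left once maxlen is exceeded,
        # so memory use is bounded by n regardless of file size.
        return list(collections.deque(f, maxlen=n))

# Usage sketch: print the last 50 lines of a log file.
# for line in tail_lines('controller.log', 50):
#     print(line, end='')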
@@ -585,12 +1034,19 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:

     while should_keep_logging(managed_job_status):
         handle = None
+        job_id_to_tail = None
         if task_id is not None:
-
-
-
-
-
+            pool = managed_job_state.get_pool_from_job_id(job_id)
+            if pool is not None:
+                cluster_name, job_id_to_tail = (
+                    managed_job_state.get_pool_submit_info(job_id))
+            else:
+                task_name = managed_job_state.get_task_name(job_id, task_id)
+                cluster_name = generate_managed_job_cluster_name(
+                    task_name, job_id)
+            if cluster_name is not None:
+                handle = global_user_state.get_handle_from_cluster_name(
+                    cluster_name)

         # Check the handle: The cluster can be preempted and removed from
         # the table before the managed job state is updated by the
@@ -620,10 +1076,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
                 managed_job_state.ManagedJobStatus.RUNNING)
         assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
         status_display.stop()
+        tail_param = tail if tail is not None else 0
         returncode = backend.tail_logs(handle,
-                                       job_id=
+                                       job_id=job_id_to_tail,
                                        managed_job_id=job_id,
-                                       follow=follow
+                                       follow=follow,
+                                       tail=tail_param)
         if returncode in [rc.value for rc in exceptions.JobExitCode]:
             # If the log tailing exits with a known exit code we can safely
             # break the loop because it indicates the tailing process
@@ -760,7 +1218,8 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
 def stream_logs(job_id: Optional[int],
                 job_name: Optional[str],
                 controller: bool = False,
-                follow: bool = True
+                follow: bool = True,
+                tail: Optional[int] = None) -> Tuple[str, int]:
     """Stream logs by job id or job name.

     Returns:
@@ -776,7 +1235,8 @@ def stream_logs(job_id: Optional[int],
     if controller:
         if job_id is None:
             assert job_name is not None
-            managed_jobs = managed_job_state.
+            managed_jobs, _ = managed_job_state.get_managed_jobs_with_filters(
+                name_match=job_name, fields=['job_id', 'job_name', 'status'])
             # We manually filter the jobs by name, instead of using
             # get_nonterminal_job_ids_by_name, as with `controller=True`, we
             # should be able to show the logs for jobs in terminal states.
@@ -799,9 +1259,7 @@ def stream_logs(job_id: Optional[int],
             job_id = managed_job_ids.pop()
         assert job_id is not None, (job_id, job_name)

-        controller_log_path =
-            os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
-            f'{job_id}.log')
+        controller_log_path = controller_log_file_for_job(job_id)
         job_status = None

         # Wait for the log file to be written
@@ -831,7 +1289,12 @@ def stream_logs(job_id: Optional[int],
         with open(controller_log_path, 'r', newline='', encoding='utf-8') as f:
             # Note: we do not need to care about start_stream_at here, since
             # that should be in the job log printed above.
-
+            read_from: Union[TextIO, Deque[str]] = f
+            if tail is not None:
+                assert tail > 0
+                # Read only the last 'tail' lines efficiently using deque
+                read_from = collections.deque(f, maxlen=tail)
+            for line in read_from:
                 print(line, end='')
                 # Flush.
                 print(end='', flush=True)
@@ -883,61 +1346,384 @@ def stream_logs(job_id: Optional[int],
             f'Multiple running jobs found with name {job_name!r}.')
     job_id = job_ids[0]

-    return stream_logs_by_id(job_id, follow)
+    return stream_logs_by_id(job_id, follow, tail)
+
+
+def dump_managed_job_queue(
+    skip_finished: bool = False,
+    accessible_workspaces: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
+) -> str:
+    return message_utils.encode_payload(
+        get_managed_job_queue(skip_finished, accessible_workspaces, job_ids,
+                              workspace_match, name_match, pool_match, page,
+                              limit, user_hashes, statuses, fields))


-def
-
+def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
+    """Update the fields list to include the necessary fields.
+
+    Args:
+        fields: The fields to update.
+
+    It will:
+    - Add the necessary dependent fields to the list.
+    - Remove the fields that are not in the DB.
+    - Determine if cluster handle is required.
+
+    Returns:
+        A tuple containing the updated fields and a boolean indicating if
+        cluster handle is required.
+    """
+    cluster_handle_required = True
+    if _cluster_handle_not_required(fields):
+        cluster_handle_required = False
+    # Copy the list to avoid modifying the original list
+    new_fields = fields.copy()
+    # status and job_id are always included
+    if 'status' not in new_fields:
+        new_fields.append('status')
+    if 'job_id' not in new_fields:
+        new_fields.append('job_id')
+    # user_hash is required if user_name is present
+    if 'user_name' in new_fields and 'user_hash' not in new_fields:
+        new_fields.append('user_hash')
+    if 'job_duration' in new_fields:
+        if 'last_recovered_at' not in new_fields:
+            new_fields.append('last_recovered_at')
+        if 'end_at' not in new_fields:
+            new_fields.append('end_at')
+    if 'job_name' in new_fields and 'task_name' not in new_fields:
+        new_fields.append('task_name')
+    if 'details' in new_fields:
+        if 'schedule_state' not in new_fields:
+            new_fields.append('schedule_state')
+        if 'priority' not in new_fields:
+            new_fields.append('priority')
+        if 'failure_reason' not in new_fields:
+            new_fields.append('failure_reason')
+    if 'user_yaml' in new_fields:
+        if 'original_user_yaml_path' not in new_fields:
+            new_fields.append('original_user_yaml_path')
+        if 'original_user_yaml_content' not in new_fields:
+            new_fields.append('original_user_yaml_content')
+    if cluster_handle_required:
+        if 'task_name' not in new_fields:
+            new_fields.append('task_name')
+        if 'current_cluster_name' not in new_fields:
+            new_fields.append('current_cluster_name')
+    # Remove _NON_DB_FIELDS
+    # These fields have been mapped to the DB fields in the above code, so we
+    # don't need to include them in the updated fields.
+    for field in _NON_DB_FIELDS:
+        if field in new_fields:
+            new_fields.remove(field)
+    return new_fields, cluster_handle_required
+
+
+def _cluster_handle_not_required(fields: List[str]) -> bool:
+    """Determine if cluster handle is not required.
+
+    Args:
+        fields: The fields to check if they contain any of the cluster handle
+            fields.
+
+    Returns:
+        True if the fields do not contain any of the cluster handle fields,
+        False otherwise.
+    """
+    return not any(field in fields for field in _CLUSTER_HANDLE_FIELDS)
+
+
+def get_managed_job_queue(
+    skip_finished: bool = False,
+    accessible_workspaces: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
+) -> Dict[str, Any]:
+    """Get the managed job queue.
+
+    Args:
+        skip_finished: Whether to skip finished jobs.
+        accessible_workspaces: The accessible workspaces.
+        job_ids: The job ids.
+        workspace_match: The workspace name to match.
+        name_match: The job name to match.
+        pool_match: The pool name to match.
+        page: The page number.
+        limit: The limit number.
+        user_hashes: The user hashes.
+        statuses: The statuses.
+        fields: The fields to include in the response.
+
+    Returns:
+        A dictionary containing the managed job queue.
+    """
+    cluster_handle_required = True
+    updated_fields = None
+    # The caller only need to specify the fields in the
+    # `class ManagedJobRecord` in `response.py`, and the `_update_fields`
+    # function will add the necessary dependent fields to the list, for
+    # example, if the caller specifies `['user_name']`, the `_update_fields`
+    # function will add `['user_hash']` to the list.
+    if fields:
+        updated_fields, cluster_handle_required = _update_fields(fields)
+
+    total_no_filter = managed_job_state.get_managed_jobs_total()
+
+    status_counts = managed_job_state.get_status_count_with_filters(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        skip_finished=skip_finished,
+    )
+
+    jobs, total = managed_job_state.get_managed_jobs_with_filters(
+        fields=updated_fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        statuses=statuses,
+        skip_finished=skip_finished,
+        page=page,
+        limit=limit,
+    )
+
+    if cluster_handle_required:
+        # Fetch the cluster name to handle map for managed clusters only.
+        cluster_name_to_handle = (
+            global_user_state.get_cluster_name_to_handle_map(is_managed=True))
+
+    highest_blocking_priority = constants.MIN_PRIORITY
+    if not fields or 'details' in fields:
+        # Figure out what the highest priority blocking job is. We need to know
+        # in order to determine if other jobs are blocked by a higher priority
+        # job, or just by the limited controller resources.
+        highest_blocking_priority = (
+            managed_job_state.get_managed_jobs_highest_priority())

     for job in jobs:
-
-
-        end_at
-
-
-
-
-
-
-
-
-
-
-
-
+        if not fields or 'job_duration' in fields:
+            end_at = job['end_at']
+            if end_at is None:
+                end_at = time.time()
+
+            job_submitted_at = job['last_recovered_at'] - job['job_duration']
+            if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
+                # When job is recovering, the duration is exact
+                # job['job_duration']
+                job_duration = job['job_duration']
+            elif job_submitted_at > 0:
+                job_duration = end_at - job_submitted_at
+            else:
+                # When job_start_at <= 0, that means the last_recovered_at
+                # is not set yet, i.e. the job is not started.
+                job_duration = 0
+            job['job_duration'] = job_duration
         job['status'] = job['status'].value
-
-
-        cluster_name = generate_managed_job_cluster_name(
-            job['task_name'], job['job_id'])
-        handle = global_user_state.get_handle_from_cluster_name(cluster_name)
-        if handle is not None:
-            assert isinstance(handle, backends.CloudVmRayResourceHandle)
-            job['cluster_resources'] = (
-                f'{handle.launched_nodes}x {handle.launched_resources}')
-            job['region'] = handle.launched_resources.region
+        if not fields or 'schedule_state' in fields:
+            job['schedule_state'] = job['schedule_state'].value
         else:
-
-            job['cluster_resources'] = '-'
-            job['region'] = '-'
+            job['schedule_state'] = None

-
+        if cluster_handle_required:
+            cluster_name = job.get('current_cluster_name', None)
+            if cluster_name is None:
+                cluster_name = generate_managed_job_cluster_name(
+                    job['task_name'], job['job_id'])
+            handle = cluster_name_to_handle.get(
+                cluster_name, None) if cluster_name is not None else None
+            if isinstance(handle, backends.CloudVmRayResourceHandle):
+                resources_str_simple, resources_str_full = (
+                    resources_utils.get_readable_resources_repr(
+                        handle, simplified_only=False))
+                assert resources_str_full is not None
+                job['cluster_resources'] = resources_str_simple
+                job['cluster_resources_full'] = resources_str_full
+                job['cloud'] = str(handle.launched_resources.cloud)
+                job['region'] = handle.launched_resources.region
+                job['zone'] = handle.launched_resources.zone
+                job['infra'] = infra_utils.InfraInfo(
+                    str(handle.launched_resources.cloud),
+                    handle.launched_resources.region,
+                    handle.launched_resources.zone).formatted_str()
+                job['accelerators'] = handle.launched_resources.accelerators
+            else:
+                # FIXME(zongheng): display the last cached values for these.
+                job['cluster_resources'] = '-'
+                job['cluster_resources_full'] = '-'
+                job['cloud'] = '-'
+                job['region'] = '-'
+                job['zone'] = '-'
+                job['infra'] = '-'
+
+        if not fields or 'details' in fields:
+            # Add details about schedule state / backoff.
+            state_details = None
+            if job['schedule_state'] == 'ALIVE_BACKOFF':
+                state_details = 'In backoff, waiting for resources'
+            elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
+                priority = job.get('priority')
+                if (priority is not None and
+                        priority < highest_blocking_priority):
+                    # Job is lower priority than some other blocking job.
+                    state_details = 'Waiting for higher priority jobs to launch'
+                else:
+                    state_details = 'Waiting for other jobs to launch'
+
+            if state_details and job['failure_reason']:
+                job['details'] = f'{state_details} - {job["failure_reason"]}'
+            elif state_details:
+                job['details'] = state_details
+            elif job['failure_reason']:
+                job['details'] = f'Failure: {job["failure_reason"]}'
+            else:
+                job['details'] = None
+
+    return {
+        'jobs': jobs,
+        'total': total,
+        'total_no_filter': total_no_filter,
+        'status_counts': status_counts
+    }
+
+
+def filter_jobs(
+    jobs: List[Dict[str, Any]],
+    workspace_match: Optional[str],
+    name_match: Optional[str],
+    pool_match: Optional[str],
+    page: Optional[int],
+    limit: Optional[int],
+    user_match: Optional[str] = None,
+    enable_user_match: bool = False,
+    statuses: Optional[List[str]] = None,
+) -> Tuple[List[Dict[str, Any]], int, Dict[str, int]]:
+    """Filter jobs based on the given criteria.
+
+    Args:
+        jobs: List of jobs to filter.
+        workspace_match: Workspace name to filter.
+        name_match: Job name to filter.
+        pool_match: Pool name to filter.
+        page: Page to filter.
+        limit: Limit to filter.
+        user_match: User name to filter.
+        enable_user_match: Whether to enable user match.
+        statuses: Statuses to filter.
+
+    Returns:
+        List of filtered jobs
+        Total number of jobs
+        Dictionary of status counts
+    """
+
+    # TODO(hailong): refactor the whole function including the
+    # `dump_managed_job_queue()` to use DB filtering.
+
+    def _pattern_matches(job: Dict[str, Any], key: str,
+                         pattern: Optional[str]) -> bool:
+        if pattern is None:
+            return True
+        if key not in job:
+            return False
+        value = job[key]
+        if not value:
+            return False
+        return pattern in str(value)
+
+    def _handle_page_and_limit(
+        result: List[Dict[str, Any]],
+        page: Optional[int],
+        limit: Optional[int],
+    ) -> List[Dict[str, Any]]:
+        if page is None and limit is None:
+            return result
+        assert page is not None and limit is not None, (page, limit)
+        # page starts from 1
+        start = (page - 1) * limit
+        end = min(start + limit, len(result))
+        return result[start:end]

+    status_counts: Dict[str, int] = collections.defaultdict(int)
+    result = []
+    checks = [
+        ('workspace', workspace_match),
+        ('job_name', name_match),
+        ('pool', pool_match),
+    ]
+    if enable_user_match:
+        checks.append(('user_name', user_match))
+
+    for job in jobs:
+        if not all(
+                _pattern_matches(job, key, pattern) for key, pattern in checks):
+            continue
+        status_counts[job['status'].value] += 1
+        if statuses:
+            if job['status'].value not in statuses:
+                continue
+        result.append(job)
+
+    total = len(result)
+
+    return _handle_page_and_limit(result, page, limit), total, status_counts

-
+
+def load_managed_job_queue(
+    payload: str
+) -> Tuple[List[Dict[str, Any]], int, ManagedJobQueueResultType, int, Dict[
+        str, int]]:
     """Load job queue from json string."""
-
+    result = message_utils.decode_payload(payload)
+    result_type = ManagedJobQueueResultType.DICT
+    status_counts: Dict[str, int] = {}
+    if isinstance(result, dict):
+        jobs: List[Dict[str, Any]] = result['jobs']
+        total: int = result['total']
+        status_counts = result.get('status_counts', {})
+        total_no_filter: int = result.get('total_no_filter', total)
+    else:
+        jobs = result
+        total = len(jobs)
+        total_no_filter = total
+        result_type = ManagedJobQueueResultType.LIST
+
+    all_users = global_user_state.get_all_users()
+    all_users_map = {user.id: user.name for user in all_users}
     for job in jobs:
         job['status'] = managed_job_state.ManagedJobStatus(job['status'])
         if 'user_hash' in job and job['user_hash'] is not None:
             # Skip jobs that do not have user_hash info.
             # TODO(cooperc): Remove check before 0.12.0.
-            job['user_name'] =
-    return jobs
+            job['user_name'] = all_users_map.get(job['user_hash'])
+    return jobs, total, result_type, total_no_filter, status_counts


 def _get_job_status_from_tasks(
-    job_tasks: List[Dict[str, Any]]
+    job_tasks: Union[List[responses.ManagedJobRecord], List[Dict[str, Any]]]
 ) -> Tuple[managed_job_state.ManagedJobStatus, int]:
     """Get the current task status and the current task id for a job."""
     managed_task_status = managed_job_state.ManagedJobStatus.SUCCEEDED
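Several of the new queue functions share the 1-based page/limit convention seen in _handle_page_and_limit above; the slice arithmetic is simply the following sketch:

from typing import List, Optional, TypeVar

T = TypeVar('T')

def paginate(items: List[T], page: Optional[int],
             limit: Optional[int]) -> List[T]:
    """1-based pagination: page=1, limit=10 returns items[0:10]."""
    if page is None and limit is None:
        return items
    assert page is not None and limit is not None, (page, limit)
    start = (page - 1) * limit
    end = min(start + limit, len(items))
    return items[start:end]

# Example: paginate(list(range(25)), page=3, limit=10) -> [20, 21, 22, 23, 24]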
@@ -949,7 +1735,7 @@ def _get_job_status_from_tasks(
         # Use the first non-succeeded status.
         if managed_task_status != managed_job_state.ManagedJobStatus.SUCCEEDED:
             # TODO(zhwu): we should not blindly use the first non-
-            # succeeded as the status could be changed to
+            # succeeded as the status could be changed to PENDING
             # when going from one task to the next one, which can be
             # confusing.
             break
@@ -957,29 +1743,40 @@ def _get_job_status_from_tasks(


 @typing.overload
-def format_job_table(
-
-
-
-
+def format_job_table(
+    tasks: List[Dict[str, Any]],
+    show_all: bool,
+    show_user: bool,
+    return_rows: Literal[False] = False,
+    pool_status: Optional[List[Dict[str, Any]]] = None,
+    max_jobs: Optional[int] = None,
+    job_status_counts: Optional[Dict[str, int]] = None,
+) -> str:
     ...


 @typing.overload
-def format_job_table(
-
-
-
-
+def format_job_table(
+    tasks: List[Dict[str, Any]],
+    show_all: bool,
+    show_user: bool,
+    return_rows: Literal[True],
+    pool_status: Optional[List[Dict[str, Any]]] = None,
+    max_jobs: Optional[int] = None,
+    job_status_counts: Optional[Dict[str, int]] = None,
+) -> List[List[str]]:
     ...


 def format_job_table(
-
-
-
-
-
+    tasks: List[Dict[str, Any]],
+    show_all: bool,
+    show_user: bool,
+    return_rows: bool = False,
+    pool_status: Optional[List[Dict[str, Any]]] = None,
+    max_jobs: Optional[int] = None,
+    job_status_counts: Optional[Dict[str, int]] = None,
+) -> Union[str, List[List[str]]]:
     """Returns managed jobs as a formatted string.

     Args:
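The format_job_table overloads use typing.Literal on return_rows so type checkers can narrow the return type: a str when return_rows=False and a list of rows when return_rows=True. The general shape of that technique, as a hedged standalone sketch:

from typing import List, Literal, Union, overload

@overload
def render(rows: List[List[str]], return_rows: Literal[False] = False) -> str:
    ...

@overload
def render(rows: List[List[str]], return_rows: Literal[True]) -> List[List[str]]:
    ...

def render(rows: List[List[str]],
           return_rows: bool = False) -> Union[str, List[List[str]]]:
    # Only the overload signatures are visible to callers and type checkers;
    # the implementation carries the wide Union type.
    if return_rows:
        return rows
    return '\n'.join(' | '.join(row) for row in rows)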
@@ -988,13 +1785,15 @@ def format_job_table(
         max_jobs: The maximum number of jobs to show in the table.
         return_rows: If True, return the rows as a list of strings instead of
             all rows concatenated into a single string.
+        pool_status: List of pool status dictionaries with replica_info.
+        job_status_counts: The counts of each job status.

     Returns: A formatted string of managed jobs, if not `return_rows`; otherwise
         a list of "rows" (each of which is a list of str).
     """
     jobs = collections.defaultdict(list)
     # Check if the tasks have user information from kubernetes.
-    # This is only used for sky status
+    # This is only used for sky status-kubernetes.
     tasks_have_k8s_user = any([task.get('user') for task in tasks])
     if max_jobs and tasks_have_k8s_user:
         raise ValueError('max_jobs is not supported when tasks have user info.')
@@ -1004,16 +1803,41 @@ def format_job_table(
             return (task['user'], task['job_id'])
         return task['job_id']

+    def _get_job_id_to_worker_map(
+            pool_status: Optional[List[Dict[str, Any]]]) -> Dict[int, int]:
+        """Create a mapping from job_id to worker replica_id.
+
+        Args:
+            pool_status: List of pool status dictionaries with replica_info.
+
+        Returns:
+            Dictionary mapping job_id to replica_id (worker ID).
+        """
+        job_to_worker: Dict[int, int] = {}
+        if pool_status is None:
+            return job_to_worker
+        for pool in pool_status:
+            replica_info = pool.get('replica_info', [])
+            for replica in replica_info:
+                used_by = replica.get('used_by')
+                if used_by is not None:
+                    job_to_worker[used_by] = replica.get('replica_id')
+        return job_to_worker
+
+    # Create mapping from job_id to worker replica_id
+    job_to_worker = _get_job_id_to_worker_map(pool_status)
+
     for task in tasks:
         # The tasks within the same job_id are already sorted
         # by the task_id.
         jobs[get_hash(task)].append(task)

-
+    workspaces = set()
     for job_tasks in jobs.values():
-
-
-
+        workspaces.add(job_tasks[0].get('workspace',
+                                        constants.SKYPILOT_DEFAULT_WORKSPACE))
+
+    show_workspace = len(workspaces) > 1 or show_all

     user_cols: List[str] = []
     if show_user:
@@ -1024,26 +1848,43 @@ def format_job_table(
     columns = [
         'ID',
         'TASK',
+        *(['WORKSPACE'] if show_workspace else []),
         'NAME',
         *user_cols,
-        '
+        'REQUESTED',
         'SUBMITTED',
         'TOT. DURATION',
         'JOB DURATION',
         '#RECOVERIES',
         'STATUS',
+        'POOL',
     ]
     if show_all:
         # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
-        columns += [
+        columns += [
+            'WORKER_CLUSTER',
+            'WORKER_JOB_ID',
+            'STARTED',
+            'INFRA',
+            'RESOURCES',
+            'SCHED. STATE',
+            'DETAILS',
+            'GIT_COMMIT',
+        ]
     if tasks_have_k8s_user:
         columns.insert(0, 'USER')
     job_table = log_utils.create_table(columns)

     status_counts: Dict[str, int] = collections.defaultdict(int)
-
-
-
+    if job_status_counts:
+        for status_value, count in job_status_counts.items():
+            status = managed_job_state.ManagedJobStatus(status_value)
+            if not status.is_terminal():
+                status_counts[status_value] = count
+    else:
+        for task in tasks:
+            if not task['status'].is_terminal():
+                status_counts[task['status'].value] += 1

     all_tasks = tasks
     if max_jobs is not None:
@@ -1054,7 +1895,10 @@ def format_job_table(
         # by the task_id.
         jobs[get_hash(task)].append(task)

-    def generate_details(
+    def generate_details(details: Optional[str],
+                         failure_reason: Optional[str]) -> str:
+        if details is not None:
+            return details
         if failure_reason is not None:
             return f'Failure: {failure_reason}'
         return '-'
@@ -1083,6 +1927,8 @@ def format_job_table(
     for job_hash, job_tasks in jobs.items():
         if show_all:
             schedule_state = job_tasks[0]['schedule_state']
+            workspace = job_tasks[0].get('workspace',
+                                         constants.SKYPILOT_DEFAULT_WORKSPACE)

         if len(job_tasks) > 1:
             # Aggregate the tasks into a new row in the table.
@@ -1120,10 +1966,20 @@ def format_job_table(

             user_values = get_user_column_values(job_tasks[0])

+            pool = job_tasks[0].get('pool')
+            if pool is None:
+                pool = '-'
+
+            # Add worker information if job is assigned to a worker
             job_id = job_hash[1] if tasks_have_k8s_user else job_hash
+            # job_id is now always an integer, use it to look up worker
+            if job_id in job_to_worker and pool != '-':
+                pool = f'{pool} (worker={job_to_worker[job_id]})'
+
             job_values = [
                 job_id,
                 '',
+                *([''] if show_workspace else []),
                 job_name,
                 *user_values,
                 '-',
@@ -1132,15 +1988,20 @@ def format_job_table(
                 job_duration,
                 recovery_cnt,
                 status_str,
+                pool,
             ]
             if show_all:
+                details = job_tasks[current_task_id].get('details')
                 failure_reason = job_tasks[current_task_id]['failure_reason']
                 job_values.extend([
+                    '-',
+                    '-',
                     '-',
                     '-',
                     '-',
                     job_tasks[0]['schedule_state'],
-                    generate_details(failure_reason),
+                    generate_details(details, failure_reason),
+                    job_tasks[0].get('metadata', {}).get('git_commit', '-'),
                 ])
             if tasks_have_k8s_user:
                 job_values.insert(0, job_tasks[0].get('user', '-'))
@@ -1153,9 +2014,20 @@ def format_job_table(
                 0, task['job_duration'], absolute=True)
             submitted = log_utils.readable_time_duration(task['submitted_at'])
             user_values = get_user_column_values(task)
+            task_workspace = '-' if len(job_tasks) > 1 else workspace
+            pool = task.get('pool')
+            if pool is None:
+                pool = '-'
+
+            # Add worker information if task is assigned to a worker
+            task_job_id = task['job_id']
+            if task_job_id in job_to_worker and pool != '-':
+                pool = f'{pool} (worker={job_to_worker[task_job_id]})'
+
             values = [
                 task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
                 task['task_id'] if len(job_tasks) > 1 else '-',
+                *([task_workspace] if show_workspace else []),
                 task['task_name'],
                 *user_values,
                 task['resources'],
@@ -1168,20 +2040,50 @@ def format_job_table(
                 job_duration,
                 task['recovery_count'],
                 task['status'].colored_str(),
+                pool,
             ]
             if show_all:
                 # schedule_state is only set at the job level, so if we have
                 # more than one task, only display on the aggregated row.
                 schedule_state = (task['schedule_state']
                                   if len(job_tasks) == 1 else '-')
+                infra_str = task.get('infra')
+                if infra_str is None:
+                    cloud = task.get('cloud')
+                    if cloud is None:
+                        # Backward compatibility for old jobs controller without
+                        # cloud info returned, we parse it from the cluster
+                        # resources
+                        # TODO(zhwu): remove this after 0.12.0
+                        cloud = task['cluster_resources'].split('(')[0].split(
+                            'x')[-1]
+                        task['cluster_resources'] = task[
+                            'cluster_resources'].replace(f'{cloud}(',
+                                                         '(').replace(
+                                                             'x ', 'x')
+                    region = task['region']
+                    zone = task.get('zone')
+                    if cloud == '-':
+                        cloud = None
+                    if region == '-':
+                        region = None
+                    if zone == '-':
+                        zone = None
+                    infra_str = infra_utils.InfraInfo(cloud, region,
+                                                      zone).formatted_str()
                 values.extend([
+                    task.get('current_cluster_name', '-'),
+                    task.get('job_id_on_pool_cluster', '-'),
                     # STARTED
                     log_utils.readable_time_duration(task['start_at']),
+                    infra_str,
                     task['cluster_resources'],
-                    task['region'],
                     schedule_state,
-                    generate_details(task
+                    generate_details(task.get('details'),
+                                     task['failure_reason']),
                 ])
+
+                values.append(task.get('metadata', {}).get('git_commit', '-'))
             if tasks_have_k8s_user:
                 values.insert(0, task.get('user', '-'))
             job_table.add_row(values)
@@ -1204,6 +2106,59 @@ def format_job_table(
     return output


+def decode_managed_job_protos(
+    job_protos: Iterable['managed_jobsv1_pb2.ManagedJobInfo']
+) -> List[Dict[str, Any]]:
+    """Decode job protos to dicts. Similar to load_managed_job_queue."""
+    user_hash_to_user = global_user_state.get_users(
+        set(job.user_hash for job in job_protos if job.user_hash))
+
+    jobs = []
+    for job_proto in job_protos:
+        job_dict = _job_proto_to_dict(job_proto)
+        user_hash = job_dict.get('user_hash', None)
+        if user_hash is not None:
+            # Skip jobs that do not have user_hash info.
+            # TODO(cooperc): Remove check before 0.12.0.
+            user = user_hash_to_user.get(user_hash, None)
+            job_dict['user_name'] = user.name if user is not None else None
+        jobs.append(job_dict)
+    return jobs
+
+
+def _job_proto_to_dict(
+        job_proto: 'managed_jobsv1_pb2.ManagedJobInfo') -> Dict[str, Any]:
+    job_dict = json_format.MessageToDict(
+        job_proto,
+        always_print_fields_with_no_presence=True,
+        # Our API returns fields in snake_case.
+        preserving_proto_field_name=True,
+        use_integers_for_enums=True)
+    for field in job_proto.DESCRIPTOR.fields:
+        # Ensure optional fields are present with None values for
+        # backwards compatibility with older clients.
+        if field.has_presence and field.name not in job_dict:
+            job_dict[field.name] = None
+        # json_format.MessageToDict is meant for encoding to JSON,
+        # and Protobuf encodes int64 as decimal strings in JSON,
+        # so we need to convert them back to ints.
+        # https://protobuf.dev/programming-guides/json/#field-representation
+        if (field.type == descriptor.FieldDescriptor.TYPE_INT64 and
+                job_dict.get(field.name) is not None):
+            job_dict[field.name] = int(job_dict[field.name])
+    job_dict['status'] = managed_job_state.ManagedJobStatus.from_protobuf(
+        job_dict['status'])
+    # For backwards compatibility, convert schedule_state to a string,
+    # as we don't have the logic to handle it in our request
+    # encoder/decoder, unlike status.
+    schedule_state_enum = (
+        managed_job_state.ManagedJobScheduleState.from_protobuf(
+            job_dict['schedule_state']))
+    job_dict['schedule_state'] = (schedule_state_enum.value
+                                  if schedule_state_enum is not None else None)
+    return job_dict
+
+
 class ManagedJobCodeGen:
     """Code generator for managed job utility functions.

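_job_proto_to_dict works around a detail of the proto3 JSON mapping: int64 fields are rendered as decimal strings by json_format.MessageToDict, so they must be converted back to Python ints. A minimal reproduction of that conversion over an arbitrary message (field handling here is generic and slightly simplified):

from typing import Any, Dict

from google.protobuf import descriptor, json_format
from google.protobuf.message import Message

def proto_to_dict(msg: Message) -> Dict[str, Any]:
    d = json_format.MessageToDict(msg,
                                  preserving_proto_field_name=True,
                                  use_integers_for_enums=True)
    for field in msg.DESCRIPTOR.fields:
        # Per the proto3 JSON spec, int64 values come back as strings.
        if (field.type == descriptor.FieldDescriptor.TYPE_INT64 and
                d.get(field.name) is not None):
            d[field.name] = int(d[field.name])
    return d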
@@ -1221,9 +2176,62 @@ class ManagedJobCodeGen:
     """)

     @classmethod
-    def get_job_table(
-
-
+    def get_job_table(
+        cls,
+        skip_finished: bool = False,
+        accessible_workspaces: Optional[List[str]] = None,
+        job_ids: Optional[List[int]] = None,
+        workspace_match: Optional[str] = None,
+        name_match: Optional[str] = None,
+        pool_match: Optional[str] = None,
+        page: Optional[int] = None,
+        limit: Optional[int] = None,
+        user_hashes: Optional[List[Optional[str]]] = None,
+        statuses: Optional[List[str]] = None,
+        fields: Optional[List[str]] = None,
+    ) -> str:
+        code = textwrap.dedent(f"""\
+        if managed_job_version < 9:
+            # For backward compatibility, since filtering is not supported
+            # before #6652.
+            # TODO(hailong): Remove compatibility before 0.12.0
+            job_table = utils.dump_managed_job_queue()
+        elif managed_job_version < 10:
+            job_table = utils.dump_managed_job_queue(
+                skip_finished={skip_finished},
+                accessible_workspaces={accessible_workspaces!r},
+                job_ids={job_ids!r},
+                workspace_match={workspace_match!r},
+                name_match={name_match!r},
+                pool_match={pool_match!r},
+                page={page!r},
+                limit={limit!r},
+                user_hashes={user_hashes!r})
+        elif managed_job_version < 12:
+            job_table = utils.dump_managed_job_queue(
+                skip_finished={skip_finished},
+                accessible_workspaces={accessible_workspaces!r},
+                job_ids={job_ids!r},
+                workspace_match={workspace_match!r},
+                name_match={name_match!r},
+                pool_match={pool_match!r},
+                page={page!r},
+                limit={limit!r},
+                user_hashes={user_hashes!r},
+                statuses={statuses!r})
+        else:
+            job_table = utils.dump_managed_job_queue(
+                skip_finished={skip_finished},
+                accessible_workspaces={accessible_workspaces!r},
+                job_ids={job_ids!r},
+                workspace_match={workspace_match!r},
+                name_match={name_match!r},
+                pool_match={pool_match!r},
+                page={page!r},
+                limit={limit!r},
+                user_hashes={user_hashes!r},
+                statuses={statuses!r},
+                fields={fields!r})
         print(job_table, flush=True)
         """)
         return cls._build(code)
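get_job_table and the other codegen helpers branch on managed_job_version inside the generated script, so one client can talk to controllers that predate newer keyword arguments. The scheme is: emit version checks around progressively richer calls. A hedged, simplified sketch; the version variable and utils module are stand-ins defined by the controller-side preamble, not reproduced here:

import textwrap

def build_queue_snippet(skip_finished: bool, name_match: str) -> str:
    """Generate remote code that degrades gracefully on older controllers."""
    return textwrap.dedent(f"""\
        if managed_job_version < 9:
            # Old controllers only support the zero-argument form.
            job_table = utils.dump_managed_job_queue()
        else:
            job_table = utils.dump_managed_job_queue(
                skip_finished={skip_finished},
                name_match={name_match!r})
        print(job_table, flush=True)
        """)

# The returned string is executed remotely, where `managed_job_version`
# and `utils` are provided by the generated script's preamble.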
@@ -1232,26 +2240,77 @@ class ManagedJobCodeGen:
     def cancel_jobs_by_id(cls,
                           job_ids: Optional[List[int]],
                           all_users: bool = False) -> str:
+        active_workspace = skypilot_config.get_active_workspace()
         code = textwrap.dedent(f"""\
         if managed_job_version < 2:
             # For backward compatibility, since all_users is not supported
-            # before #4787.
+            # before #4787.
             # TODO(cooperc): Remove compatibility before 0.12.0
             msg = utils.cancel_jobs_by_id({job_ids})
-
+        elif managed_job_version < 4:
+            # For backward compatibility, since current_workspace is not
+            # supported before #5660. Don't check the workspace.
+            # TODO(zhwu): Remove compatibility before 0.12.0
             msg = utils.cancel_jobs_by_id({job_ids}, all_users={all_users})
+        else:
+            msg = utils.cancel_jobs_by_id({job_ids}, all_users={all_users},
+                current_workspace={active_workspace!r})
         print(msg, end="", flush=True)
         """)
         return cls._build(code)

     @classmethod
     def cancel_job_by_name(cls, job_name: str) -> str:
+        active_workspace = skypilot_config.get_active_workspace()
         code = textwrap.dedent(f"""\
-
+        if managed_job_version < 4:
+            # For backward compatibility, since current_workspace is not
+            # supported before #5660. Don't check the workspace.
+            # TODO(zhwu): Remove compatibility before 0.12.0
+            msg = utils.cancel_job_by_name({job_name!r})
+        else:
+            msg = utils.cancel_job_by_name({job_name!r}, {active_workspace!r})
         print(msg, end="", flush=True)
         """)
         return cls._build(code)

+    @classmethod
+    def cancel_jobs_by_pool(cls, pool_name: str) -> str:
+        active_workspace = skypilot_config.get_active_workspace()
+        code = textwrap.dedent(f"""\
+        msg = utils.cancel_jobs_by_pool({pool_name!r}, {active_workspace!r})
+        print(msg, end="", flush=True)
+        """)
+        return cls._build(code)
+
+    @classmethod
+    def get_version_and_job_table(cls) -> str:
+        """Generate code to get controller version and raw job table."""
+        code = textwrap.dedent("""\
+        from sky.skylet import constants as controller_constants
+
+        # Get controller version
+        controller_version = controller_constants.SKYLET_VERSION
+        print(f"controller_version:{controller_version}", flush=True)
+
+        # Get and print raw job table (load_managed_job_queue can parse this directly)
+        job_table = utils.dump_managed_job_queue()
+        print(job_table, flush=True)
+        """)
+        return cls._build(code)
+
+    @classmethod
+    def get_version(cls) -> str:
+        """Generate code to get controller version."""
+        code = textwrap.dedent("""\
+        from sky.skylet import constants as controller_constants
+
+        # Get controller version
+        controller_version = controller_constants.SKYLET_VERSION
+        print(f"controller_version:{controller_version}", flush=True)
+        """)
+        return cls._build(code)
+
     @classmethod
     def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
         code = textwrap.dedent(f"""\
@@ -1266,10 +2325,16 @@ class ManagedJobCodeGen:
                     job_name: Optional[str],
                     job_id: Optional[int],
                     follow: bool = True,
-                    controller: bool = False
+                    controller: bool = False,
+                    tail: Optional[int] = None) -> str:
         code = textwrap.dedent(f"""\
-
-
+        if managed_job_version < 6:
+            # Versions before 5 did not support tail parameter
+            result = utils.stream_logs(job_id={job_id!r}, job_name={job_name!r},
+                follow={follow}, controller={controller})
+        else:
+            result = utils.stream_logs(job_id={job_id!r}, job_name={job_name!r},
+                follow={follow}, controller={controller}, tail={tail!r})
         if managed_job_version < 3:
             # Versions 2 and older did not return a retcode, so we just print
             # the result.
@@ -1283,18 +2348,44 @@ class ManagedJobCodeGen:
         return cls._build(code)

     @classmethod
-    def set_pending(cls,
+    def set_pending(cls,
+                    job_id: int,
+                    managed_job_dag: 'dag_lib.Dag',
+                    workspace: str,
+                    entrypoint: str,
+                    user_hash: Optional[str] = None) -> str:
         dag_name = managed_job_dag.name
+        pool = managed_job_dag.pool
         # Add the managed job to queue table.
         code = textwrap.dedent(f"""\
-
+        set_job_info_kwargs = {{'workspace': {workspace!r}}}
+        if managed_job_version < 4:
+            set_job_info_kwargs = {{}}
+        if managed_job_version >= 5:
+            set_job_info_kwargs['entrypoint'] = {entrypoint!r}
+        if managed_job_version >= 8:
+            from sky.serve import serve_state
+            pool_hash = None
+            if {pool!r} != None:
+                pool_hash = serve_state.get_service_hash({pool!r})
+            set_job_info_kwargs['pool'] = {pool!r}
+            set_job_info_kwargs['pool_hash'] = pool_hash
+        if managed_job_version >= 11:
+            set_job_info_kwargs['user_hash'] = {user_hash!r}
+        managed_job_state.set_job_info(
+            {job_id}, {dag_name!r}, **set_job_info_kwargs)
         """)
         for task_id, task in enumerate(managed_job_dag.tasks):
             resources_str = backend_utils.get_task_resources_str(
                 task, is_managed_job=True)
             code += textwrap.dedent(f"""\
-
-
+            if managed_job_version < 7:
+                managed_job_state.set_pending({job_id}, {task_id},
+                    {task.name!r}, {resources_str!r})
+            else:
+                managed_job_state.set_pending({job_id}, {task_id},
+                    {task.name!r}, {resources_str!r},
+                    {task.metadata_json!r})
             """)
         return cls._build(code)
