skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/controller.py
CHANGED
|
@@ -1,24 +1,32 @@
|
|
|
1
|
-
"""Controller: handles the life cycle of a managed job.
|
|
2
|
-
|
|
3
|
-
TODO(cooperc): Document lifecycle, and multiprocess layout.
|
|
1
|
+
"""Controller: handles scheduling and the life cycle of a managed job.
|
|
4
2
|
"""
|
|
5
|
-
import
|
|
6
|
-
import
|
|
3
|
+
import asyncio
|
|
4
|
+
import io
|
|
7
5
|
import os
|
|
8
6
|
import pathlib
|
|
7
|
+
import resource
|
|
9
8
|
import shutil
|
|
9
|
+
import sys
|
|
10
|
+
import threading
|
|
10
11
|
import time
|
|
11
12
|
import traceback
|
|
12
13
|
import typing
|
|
13
|
-
from typing import Optional,
|
|
14
|
+
from typing import Dict, Optional, Set
|
|
14
15
|
|
|
15
|
-
import
|
|
16
|
+
import dotenv
|
|
16
17
|
|
|
18
|
+
import sky
|
|
19
|
+
from sky import core
|
|
17
20
|
from sky import exceptions
|
|
18
21
|
from sky import sky_logging
|
|
22
|
+
from sky import skypilot_config
|
|
23
|
+
from sky.adaptors import common as adaptors_common
|
|
19
24
|
from sky.backends import backend_utils
|
|
20
25
|
from sky.backends import cloud_vm_ray_backend
|
|
21
26
|
from sky.data import data_utils
|
|
27
|
+
from sky.jobs import constants as jobs_constants
|
|
28
|
+
from sky.jobs import file_content_utils
|
|
29
|
+
from sky.jobs import log_gc
|
|
22
30
|
from sky.jobs import recovery_strategy
|
|
23
31
|
from sky.jobs import scheduler
|
|
24
32
|
from sky.jobs import state as managed_job_state
|
|
@@ -26,39 +34,130 @@ from sky.jobs import utils as managed_job_utils
|
|
|
26
34
|
from sky.skylet import constants
|
|
27
35
|
from sky.skylet import job_lib
|
|
28
36
|
from sky.usage import usage_lib
|
|
37
|
+
from sky.utils import annotations
|
|
29
38
|
from sky.utils import common
|
|
30
39
|
from sky.utils import common_utils
|
|
40
|
+
from sky.utils import context
|
|
41
|
+
from sky.utils import context_utils
|
|
31
42
|
from sky.utils import controller_utils
|
|
32
43
|
from sky.utils import dag_utils
|
|
33
44
|
from sky.utils import status_lib
|
|
34
|
-
from sky.utils import subprocess_utils
|
|
35
45
|
from sky.utils import ux_utils
|
|
36
46
|
|
|
37
47
|
if typing.TYPE_CHECKING:
|
|
38
|
-
import
|
|
48
|
+
import psutil
|
|
49
|
+
else:
|
|
50
|
+
psutil = adaptors_common.LazyImport('psutil')
|
|
39
51
|
|
|
40
|
-
# Use the explicit logger name so that the logger is under the
|
|
41
|
-
# `sky.jobs.controller` namespace when executed directly, so as
|
|
42
|
-
# to inherit the setup from the `sky` logger.
|
|
43
52
|
logger = sky_logging.init_logger('sky.jobs.controller')
|
|
44
53
|
|
|
54
|
+
_background_tasks: Set[asyncio.Task] = set()
|
|
55
|
+
_background_tasks_lock: asyncio.Lock = asyncio.Lock()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
async def create_background_task(coro: typing.Coroutine) -> None:
|
|
59
|
+
"""Create a background task and add it to the set of background tasks.
|
|
60
|
+
|
|
61
|
+
The event loop only keeps a weak reference to each task, so we hold a
|
|
62
|
+
strong reference here to prevent the task from being garbage collected
|
|
63
|
+
before it finishes.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
coro: The coroutine to create a task for.
|
|
67
|
+
"""
|
|
68
|
+
async with _background_tasks_lock:
|
|
69
|
+
task = asyncio.create_task(coro)
|
|
70
|
+
_background_tasks.add(task)
|
|
71
|
+
# TODO(cooperc): Discard needs a lock?
|
|
72
|
+
task.add_done_callback(_background_tasks.discard)
|
|
45
73
|
|
|
46
|
-
def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
|
|
47
|
-
dag = dag_utils.load_chain_dag_from_yaml(dag_yaml)
|
|
48
|
-
dag_name = dag.name
|
|
49
|
-
assert dag_name is not None, dag
|
|
50
|
-
return dag, dag_name
|
|
51
74
|
|
|
75
|
+
# Make sure to limit the size as we don't want to cache too many DAGs in memory.
|
|
76
|
+
@annotations.lru_cache(scope='global', maxsize=50)
|
|
77
|
+
def _get_dag(job_id: int) -> 'sky.Dag':
|
|
78
|
+
dag_content = file_content_utils.get_job_dag_content(job_id)
|
|
79
|
+
if dag_content is None:
|
|
80
|
+
raise RuntimeError('Managed job DAG YAML content is unavailable for '
|
|
81
|
+
f'job {job_id}. This can happen if the job was '
|
|
82
|
+
'submitted before file migration completed or if '
|
|
83
|
+
'the submission failed to persist the DAG. Please '
|
|
84
|
+
're-submit the job.')
|
|
52
85
|
|
|
53
|
-
|
|
54
|
-
|
|
86
|
+
dag = dag_utils.load_chain_dag_from_yaml_str(dag_content)
|
|
87
|
+
assert dag.name is not None, dag
|
|
88
|
+
return dag
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class JobController:
|
|
92
|
+
"""Controls the lifecycle of a single managed job.
|
|
93
|
+
|
|
94
|
+
This controller executes the chain DAG recorded for the job by:
|
|
95
|
+
- Loading the DAG and preparing per-task environment variables so each task
|
|
96
|
+
has a stable global job identifier across recoveries.
|
|
97
|
+
- Launching the task on the configured backend (``CloudVmRayBackend``),
|
|
98
|
+
optionally via a pool.
|
|
99
|
+
- Persisting state transitions to the managed jobs state store
|
|
100
|
+
(e.g., STARTING → RUNNING → SUCCEEDED/FAILED/CANCELLED).
|
|
101
|
+
- Monitoring execution, downloading/streaming logs, detecting failures or
|
|
102
|
+
preemptions, and invoking recovery through
|
|
103
|
+
``recovery_strategy.StrategyExecutor``.
|
|
104
|
+
- Cleaning up clusters and ephemeral resources when tasks finish.
|
|
105
|
+
|
|
106
|
+
Concurrency and coordination:
|
|
107
|
+
- Runs inside an ``asyncio`` event loop.
|
|
108
|
+
- Shares a ``starting`` set, guarded by ``starting_lock`` and signaled via
|
|
109
|
+
``starting_signal``, to throttle concurrent launches across jobs that the
|
|
110
|
+
top-level ``Controller`` manages.
|
|
111
|
+
|
|
112
|
+
Key attributes:
|
|
113
|
+
- ``_job_id``: Integer identifier of this managed job.
|
|
114
|
+
- ``_dag`` / ``_dag_name``: The job definition and metadata loaded from the
|
|
115
|
+
database-backed job YAML.
|
|
116
|
+
- ``_backend``: Backend used to launch and manage clusters.
|
|
117
|
+
- ``_pool``: Optional pool name if using a pool.
|
|
118
|
+
- ``starting`` / ``starting_lock`` / ``starting_signal``: Shared scheduler
|
|
119
|
+
coordination primitives. ``starting_lock`` must be used for accessing
|
|
120
|
+
``starting_signal`` and ``starting``.
|
|
121
|
+
- ``_strategy_executor``: Recovery/launch strategy executor (created per
|
|
122
|
+
task).
|
|
123
|
+
"""
|
|
124
|
+
|
|
125
|
+
def __init__(
|
|
126
|
+
self,
|
|
127
|
+
job_id: int,
|
|
128
|
+
starting: Set[int],
|
|
129
|
+
starting_lock: asyncio.Lock,
|
|
130
|
+
starting_signal: asyncio.Condition,
|
|
131
|
+
pool: Optional[str] = None,
|
|
132
|
+
) -> None:
|
|
133
|
+
"""Initialize a ``JobController``.
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
job_id: Integer ID of the managed job.
|
|
137
|
+
starting: Shared set of job IDs currently in the STARTING phase,
|
|
138
|
+
used to limit concurrent launches.
|
|
139
|
+
starting_lock: ``asyncio.Lock`` guarding access to the shared
|
|
140
|
+
scheduler state (e.g., the ``starting`` set).
|
|
141
|
+
starting_signal: ``asyncio.Condition`` used to notify when a job
|
|
142
|
+
exits STARTING so more jobs can be admitted.
|
|
143
|
+
pool: Optional pool name. When provided, the job is
|
|
144
|
+
submitted to the pool rather than launching a dedicated
|
|
145
|
+
cluster.
|
|
146
|
+
"""
|
|
147
|
+
|
|
148
|
+
self.starting = starting
|
|
149
|
+
self.starting_lock = starting_lock
|
|
150
|
+
self.starting_signal = starting_signal
|
|
151
|
+
|
|
152
|
+
logger.info('Initializing JobsController for job_id=%s', job_id)
|
|
55
153
|
|
|
56
|
-
def __init__(self, job_id: int, dag_yaml: str) -> None:
|
|
57
154
|
self._job_id = job_id
|
|
58
|
-
self._dag
|
|
59
|
-
|
|
60
|
-
|
|
155
|
+
self._dag = _get_dag(job_id)
|
|
156
|
+
self._dag_name = self._dag.name
|
|
157
|
+
logger.info(f'Loaded DAG: {self._dag}')
|
|
158
|
+
|
|
61
159
|
self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
|
|
160
|
+
self._pool = pool
|
|
62
161
|
|
|
63
162
|
# pylint: disable=line-too-long
|
|
64
163
|
# Add a unique identifier to the task environment variables, so that
|
|
@@ -76,6 +175,7 @@ class JobsController:
|
|
|
76
175
|
# dag_utils.maybe_infer_and_fill_dag_and_task_names.
|
|
77
176
|
assert task_name is not None, self._dag
|
|
78
177
|
task_name = f'{self._dag_name}_{task_name}'
|
|
178
|
+
|
|
79
179
|
job_id_env_var = common_utils.get_global_job_id(
|
|
80
180
|
self._backend.run_timestamp,
|
|
81
181
|
f'{task_name}',
|
|
@@ -92,8 +192,10 @@ class JobsController:
|
|
|
92
192
|
task.update_envs(task_envs)
|
|
93
193
|
|
|
94
194
|
def _download_log_and_stream(
|
|
95
|
-
self,
|
|
96
|
-
|
|
195
|
+
self,
|
|
196
|
+
task_id: Optional[int],
|
|
197
|
+
handle: Optional['cloud_vm_ray_backend.CloudVmRayResourceHandle'],
|
|
198
|
+
job_id_on_pool_cluster: Optional[int],
|
|
97
199
|
) -> None:
|
|
98
200
|
"""Downloads and streams the logs of the current job with given task ID.
|
|
99
201
|
|
|
@@ -105,18 +207,36 @@ class JobsController:
|
|
|
105
207
|
logger.info(f'Cluster for job {self._job_id} is not found. '
|
|
106
208
|
'Skipping downloading and streaming the logs.')
|
|
107
209
|
return
|
|
210
|
+
|
|
108
211
|
managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
|
|
109
|
-
'managed_jobs'
|
|
110
|
-
|
|
111
|
-
|
|
212
|
+
'managed_jobs',
|
|
213
|
+
f'job-id-{self._job_id}')
|
|
214
|
+
log_file = controller_utils.download_and_stream_job_log(
|
|
215
|
+
self._backend,
|
|
216
|
+
handle,
|
|
217
|
+
managed_job_logs_dir,
|
|
218
|
+
job_ids=[str(job_id_on_pool_cluster)]
|
|
219
|
+
if job_id_on_pool_cluster is not None else None)
|
|
112
220
|
if log_file is not None:
|
|
113
|
-
# Set the path of the log file for the current task, so it can
|
|
114
|
-
# accessed even after the job is finished
|
|
221
|
+
# Set the path of the log file for the current task, so it can
|
|
222
|
+
# be accessed even after the job is finished
|
|
115
223
|
managed_job_state.set_local_log_file(self._job_id, task_id,
|
|
116
224
|
log_file)
|
|
225
|
+
else:
|
|
226
|
+
logger.warning(
|
|
227
|
+
f'No log file was downloaded for job {self._job_id}, '
|
|
228
|
+
f'task {task_id}')
|
|
229
|
+
|
|
117
230
|
logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
|
|
118
231
|
|
|
119
|
-
def
|
|
232
|
+
async def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
|
|
233
|
+
if cluster_name is None:
|
|
234
|
+
return
|
|
235
|
+
if self._pool is None:
|
|
236
|
+
await context_utils.to_thread(managed_job_utils.terminate_cluster,
|
|
237
|
+
cluster_name)
|
|
238
|
+
|
|
239
|
+
async def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
|
|
120
240
|
"""Busy loop monitoring cluster status and handling recovery.
|
|
121
241
|
|
|
122
242
|
When the task is successfully completed, this function returns True,
|
|
@@ -151,70 +271,185 @@ class JobsController:
|
|
|
151
271
|
3. Any unexpected error happens during the `sky.launch`.
|
|
152
272
|
Other exceptions may be raised depending on the backend.
|
|
153
273
|
"""
|
|
274
|
+
task_start_time = time.time()
|
|
275
|
+
logger.info(
|
|
276
|
+
f'Starting task {task_id} ({task.name}) for job {self._job_id}')
|
|
277
|
+
|
|
278
|
+
latest_task_id, last_task_prev_status = (
|
|
279
|
+
await
|
|
280
|
+
managed_job_state.get_latest_task_id_status_async(self._job_id))
|
|
281
|
+
|
|
282
|
+
is_resume = False
|
|
283
|
+
if (latest_task_id is not None and last_task_prev_status !=
|
|
284
|
+
managed_job_state.ManagedJobStatus.PENDING):
|
|
285
|
+
assert latest_task_id >= task_id, (latest_task_id, task_id)
|
|
286
|
+
if latest_task_id > task_id:
|
|
287
|
+
logger.info(f'Task {task_id} ({task.name}) has already '
|
|
288
|
+
'been executed. Skipping...')
|
|
289
|
+
return True
|
|
290
|
+
if latest_task_id == task_id:
|
|
291
|
+
# Start recovery.
|
|
292
|
+
is_resume = True
|
|
293
|
+
logger.info(f'Resuming task {task_id} from previous execution')
|
|
154
294
|
|
|
155
295
|
callback_func = managed_job_utils.event_callback_func(
|
|
156
296
|
job_id=self._job_id, task_id=task_id, task=task)
|
|
297
|
+
|
|
157
298
|
if task.run is None:
|
|
158
299
|
logger.info(f'Skip running task {task_id} ({task.name}) due to its '
|
|
159
300
|
'run commands being empty.')
|
|
160
301
|
# Call set_started first to initialize columns in the state table,
|
|
161
302
|
# including start_at and last_recovery_at to avoid issues for
|
|
162
303
|
# uninitialized columns.
|
|
163
|
-
managed_job_state.
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
304
|
+
await managed_job_state.set_started_async(
|
|
305
|
+
job_id=self._job_id,
|
|
306
|
+
task_id=task_id,
|
|
307
|
+
start_time=time.time(),
|
|
308
|
+
callback_func=callback_func)
|
|
309
|
+
await managed_job_state.set_succeeded_async(
|
|
310
|
+
job_id=self._job_id,
|
|
311
|
+
task_id=task_id,
|
|
312
|
+
end_time=time.time(),
|
|
313
|
+
callback_func=callback_func)
|
|
314
|
+
logger.info(f'Empty task {task_id} marked as succeeded immediately')
|
|
171
315
|
return True
|
|
316
|
+
|
|
172
317
|
usage_lib.messages.usage.update_task_id(task_id)
|
|
173
318
|
task_id_env_var = task.envs[constants.TASK_ID_ENV_VAR]
|
|
174
|
-
submitted_at = time.time()
|
|
175
|
-
if task_id == 0:
|
|
176
|
-
submitted_at = backend_utils.get_timestamp_from_run_timestamp(
|
|
177
|
-
self._backend.run_timestamp)
|
|
178
319
|
assert task.name is not None, task
|
|
320
|
+
# Set the cluster name to None if the job is submitted
|
|
321
|
+
# to a pool. This will be updated when we later call the `launch`
|
|
322
|
+
# or `recover` function from the strategy executor.
|
|
179
323
|
cluster_name = managed_job_utils.generate_managed_job_cluster_name(
|
|
180
|
-
task.name, self._job_id)
|
|
324
|
+
task.name, self._job_id) if self._pool is None else None
|
|
181
325
|
self._strategy_executor = recovery_strategy.StrategyExecutor.make(
|
|
182
|
-
cluster_name, self._backend, task, self._job_id
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
326
|
+
cluster_name, self._backend, task, self._job_id, task_id,
|
|
327
|
+
self._pool, self.starting, self.starting_lock, self.starting_signal)
|
|
328
|
+
if not is_resume:
|
|
329
|
+
submitted_at = time.time()
|
|
330
|
+
if task_id == 0:
|
|
331
|
+
submitted_at = backend_utils.get_timestamp_from_run_timestamp(
|
|
332
|
+
self._backend.run_timestamp)
|
|
333
|
+
|
|
334
|
+
resources_str = backend_utils.get_task_resources_str(
|
|
335
|
+
task, is_managed_job=True)
|
|
336
|
+
|
|
337
|
+
await managed_job_state.set_starting_async(
|
|
338
|
+
self._job_id,
|
|
339
|
+
task_id,
|
|
340
|
+
self._backend.run_timestamp,
|
|
341
|
+
submitted_at,
|
|
342
|
+
resources_str=resources_str,
|
|
343
|
+
specs={
|
|
344
|
+
'max_restarts_on_errors':
|
|
345
|
+
self._strategy_executor.max_restarts_on_errors
|
|
346
|
+
},
|
|
347
|
+
callback_func=callback_func)
|
|
348
|
+
logger.info(f'Submitted managed job {self._job_id} '
|
|
349
|
+
f'(task: {task_id}, name: {task.name!r}); '
|
|
350
|
+
f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
|
|
198
351
|
|
|
199
352
|
logger.info('Started monitoring.')
|
|
200
|
-
managed_job_state.set_starting(job_id=self._job_id,
|
|
201
|
-
task_id=task_id,
|
|
202
|
-
callback_func=callback_func)
|
|
203
|
-
remote_job_submitted_at = self._strategy_executor.launch()
|
|
204
|
-
assert remote_job_submitted_at is not None, remote_job_submitted_at
|
|
205
353
|
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
354
|
+
# Only do the initial cluster launch if not resuming from a controller
|
|
355
|
+
# failure. Otherwise, we will transit to recovering immediately.
|
|
356
|
+
remote_job_submitted_at = time.time()
|
|
357
|
+
if not is_resume:
|
|
358
|
+
launch_start = time.time()
|
|
359
|
+
|
|
360
|
+
# Run the launch in a separate thread to avoid blocking the event
|
|
361
|
+
# loop. The scheduler functions used internally already have their
|
|
362
|
+
# own file locks.
|
|
363
|
+
remote_job_submitted_at = await self._strategy_executor.launch()
|
|
364
|
+
|
|
365
|
+
launch_time = time.time() - launch_start
|
|
366
|
+
logger.info(f'Cluster launch completed in {launch_time:.2f}s')
|
|
367
|
+
assert remote_job_submitted_at is not None, remote_job_submitted_at
|
|
368
|
+
if self._pool is None:
|
|
369
|
+
job_id_on_pool_cluster = None
|
|
370
|
+
else:
|
|
371
|
+
# Update the cluster name when using pool.
|
|
372
|
+
cluster_name, job_id_on_pool_cluster = (
|
|
373
|
+
await
|
|
374
|
+
managed_job_state.get_pool_submit_info_async(self._job_id))
|
|
375
|
+
if cluster_name is None:
|
|
376
|
+
# Check if we have been cancelled here, in the case where a user
|
|
377
|
+
# quickly cancels the job we want to gracefully handle it here,
|
|
378
|
+
# otherwise we will end up in the FAILED_CONTROLLER state.
|
|
379
|
+
logger.info(f'Cluster name is None for job {self._job_id}, '
|
|
380
|
+
f'task {task_id}. Checking if we have been '
|
|
381
|
+
'cancelled.')
|
|
382
|
+
status = await (managed_job_state.get_job_status_with_task_id_async(
|
|
383
|
+
job_id=self._job_id, task_id=task_id))
|
|
384
|
+
logger.debug(f'Status for job {self._job_id}, task {task_id}:'
|
|
385
|
+
f'{status}')
|
|
386
|
+
if status == managed_job_state.ManagedJobStatus.CANCELLED:
|
|
387
|
+
logger.info(f'Job {self._job_id}, task {task_id} has '
|
|
388
|
+
'been quickly cancelled.')
|
|
389
|
+
raise asyncio.CancelledError()
|
|
390
|
+
assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)
|
|
391
|
+
|
|
392
|
+
if not is_resume:
|
|
393
|
+
await managed_job_state.set_started_async(
|
|
394
|
+
job_id=self._job_id,
|
|
395
|
+
task_id=task_id,
|
|
396
|
+
start_time=remote_job_submitted_at,
|
|
397
|
+
callback_func=callback_func)
|
|
398
|
+
|
|
399
|
+
monitoring_start_time = time.time()
|
|
400
|
+
status_check_count = 0
|
|
401
|
+
|
|
402
|
+
async with self.starting_lock:
|
|
403
|
+
try:
|
|
404
|
+
self.starting.remove(self._job_id)
|
|
405
|
+
# it's fine if we notify again, better to wake someone up
|
|
406
|
+
# and have them go to sleep again, than have some stuck
|
|
407
|
+
# sleeping.
|
|
408
|
+
# ps. this shouldn't actually happen because if it's been
|
|
409
|
+
# removed from the set then we would get a key error.
|
|
410
|
+
self.starting_signal.notify()
|
|
411
|
+
except KeyError:
|
|
412
|
+
pass
|
|
210
413
|
|
|
211
414
|
while True:
|
|
212
|
-
|
|
415
|
+
status_check_count += 1
|
|
416
|
+
|
|
417
|
+
# NOTE: if we are resuming from a controller failure, we only keep
|
|
418
|
+
# monitoring if the job is in RUNNING state. For all other cases,
|
|
419
|
+
# we will directly transit to recovering since we have no idea what
|
|
420
|
+
# the cluster status is.
|
|
421
|
+
force_transit_to_recovering = False
|
|
422
|
+
if is_resume:
|
|
423
|
+
prev_status = await (
|
|
424
|
+
managed_job_state.get_job_status_with_task_id_async(
|
|
425
|
+
job_id=self._job_id, task_id=task_id))
|
|
426
|
+
|
|
427
|
+
if prev_status is not None:
|
|
428
|
+
if prev_status.is_terminal():
|
|
429
|
+
logger.info(
|
|
430
|
+
f'Task {task_id} already in terminal state: '
|
|
431
|
+
f'{prev_status}')
|
|
432
|
+
return (prev_status ==
|
|
433
|
+
managed_job_state.ManagedJobStatus.SUCCEEDED)
|
|
434
|
+
if (prev_status ==
|
|
435
|
+
managed_job_state.ManagedJobStatus.CANCELLING):
|
|
436
|
+
# If the controller is down when cancelling the job,
|
|
437
|
+
# we re-raise the error to run the `_cleanup` function
|
|
438
|
+
# again to clean up any remaining resources.
|
|
439
|
+
logger.info(f'Task {task_id} was being cancelled, '
|
|
440
|
+
're-raising cancellation')
|
|
441
|
+
raise asyncio.CancelledError()
|
|
442
|
+
if prev_status != managed_job_state.ManagedJobStatus.RUNNING:
|
|
443
|
+
force_transit_to_recovering = True
|
|
444
|
+
# This resume logic should only be triggered once.
|
|
445
|
+
is_resume = False
|
|
446
|
+
|
|
447
|
+
await asyncio.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)
|
|
213
448
|
|
|
214
449
|
# Check the network connection to avoid false alarm for job failure.
|
|
215
450
|
# Network glitch was observed even in the VM.
|
|
216
451
|
try:
|
|
217
|
-
backend_utils.
|
|
452
|
+
await backend_utils.async_check_network_connection()
|
|
218
453
|
except exceptions.NetworkError:
|
|
219
454
|
logger.info('Network is not available. Retrying again in '
|
|
220
455
|
f'{managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS} '
|
|
@@ -223,31 +458,63 @@ class JobsController:
|
|
|
223
458
|
|
|
224
459
|
# NOTE: we do not check cluster status first because race condition
|
|
225
460
|
# can occur, i.e. cluster can be down during the job status check.
|
|
226
|
-
|
|
227
|
-
|
|
461
|
+
# NOTE: If fetching the job status fails or we force to transit to
|
|
462
|
+
# recovering, we will set the job status to None, which will force
|
|
463
|
+
# enter the recovering logic.
|
|
464
|
+
job_status = None
|
|
465
|
+
if not force_transit_to_recovering:
|
|
466
|
+
try:
|
|
467
|
+
job_status = await managed_job_utils.get_job_status(
|
|
468
|
+
self._backend,
|
|
469
|
+
cluster_name,
|
|
470
|
+
job_id=job_id_on_pool_cluster,
|
|
471
|
+
)
|
|
472
|
+
except exceptions.FetchClusterInfoError as fetch_e:
|
|
473
|
+
logger.info(
|
|
474
|
+
'Failed to fetch the job status. Start recovery.\n'
|
|
475
|
+
f'Exception: {common_utils.format_exception(fetch_e)}\n'
|
|
476
|
+
f'Traceback: {traceback.format_exc()}')
|
|
228
477
|
|
|
229
478
|
if job_status == job_lib.JobStatus.SUCCEEDED:
|
|
230
|
-
|
|
231
|
-
|
|
479
|
+
logger.info(f'Task {task_id} succeeded! '
|
|
480
|
+
'Getting end time and cleaning up')
|
|
481
|
+
try:
|
|
482
|
+
success_end_time = await context_utils.to_thread(
|
|
483
|
+
managed_job_utils.try_to_get_job_end_time,
|
|
484
|
+
self._backend, cluster_name, job_id_on_pool_cluster)
|
|
485
|
+
except Exception as e: # pylint: disable=broad-except
|
|
486
|
+
logger.warning(
|
|
487
|
+
f'Failed to get job end time: '
|
|
488
|
+
f'{common_utils.format_exception(e)}',
|
|
489
|
+
exc_info=True)
|
|
490
|
+
success_end_time = 0
|
|
491
|
+
|
|
232
492
|
# The job is done. Set the job to SUCCEEDED first before start
|
|
233
493
|
# downloading and streaming the logs to make it more responsive.
|
|
234
|
-
managed_job_state.
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
494
|
+
await managed_job_state.set_succeeded_async(
|
|
495
|
+
self._job_id,
|
|
496
|
+
task_id,
|
|
497
|
+
end_time=success_end_time,
|
|
498
|
+
callback_func=callback_func)
|
|
238
499
|
logger.info(
|
|
239
500
|
f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
|
|
240
501
|
f'Cleaning up the cluster {cluster_name}.')
|
|
241
502
|
try:
|
|
242
|
-
|
|
503
|
+
logger.info(f'Downloading logs on cluster {cluster_name} '
|
|
504
|
+
f'and job id {job_id_on_pool_cluster}.')
|
|
505
|
+
clusters = await context_utils.to_thread(
|
|
506
|
+
backend_utils.get_clusters,
|
|
243
507
|
cluster_names=[cluster_name],
|
|
244
508
|
refresh=common.StatusRefreshMode.NONE,
|
|
245
|
-
all_users=True
|
|
509
|
+
all_users=True,
|
|
510
|
+
_include_is_managed=True)
|
|
246
511
|
if clusters:
|
|
247
512
|
assert len(clusters) == 1, (clusters, cluster_name)
|
|
248
513
|
handle = clusters[0].get('handle')
|
|
249
514
|
# Best effort to download and stream the logs.
|
|
250
|
-
|
|
515
|
+
await context_utils.to_thread(
|
|
516
|
+
self._download_log_and_stream, task_id, handle,
|
|
517
|
+
job_id_on_pool_cluster)
|
|
251
518
|
except Exception as e: # pylint: disable=broad-except
|
|
252
519
|
# We don't want to crash here, so just log and continue.
|
|
253
520
|
logger.warning(
|
|
@@ -256,7 +523,14 @@ class JobsController:
|
|
|
256
523
|
exc_info=True)
|
|
257
524
|
# Only clean up the cluster, not the storages, because tasks may
|
|
258
525
|
# share storages.
|
|
259
|
-
|
|
526
|
+
await self._cleanup_cluster(cluster_name)
|
|
527
|
+
|
|
528
|
+
task_total_time = time.time() - task_start_time
|
|
529
|
+
monitoring_time = time.time() - monitoring_start_time
|
|
530
|
+
logger.info(f'Task {task_id} completed successfully in '
|
|
531
|
+
f'{task_total_time:.2f}s '
|
|
532
|
+
f'(monitoring time: {monitoring_time:.2f}s, '
|
|
533
|
+
f'status checks: {status_check_count})')
|
|
260
534
|
return True
|
|
261
535
|
|
|
262
536
|
# For single-node jobs, non-terminated job_status indicates a
|
|
@@ -272,7 +546,7 @@ class JobsController:
|
|
|
272
546
|
if job_status in job_lib.JobStatus.user_code_failure_states():
|
|
273
547
|
# Add a grace period before the check of preemption to avoid
|
|
274
548
|
# false alarm for job failure.
|
|
275
|
-
|
|
549
|
+
await asyncio.sleep(5)
|
|
276
550
|
|
|
277
551
|
# Pull the actual cluster status from the cloud provider to
|
|
278
552
|
# determine whether the cluster is preempted or failed.
|
|
@@ -303,14 +577,19 @@ class JobsController:
|
|
|
303
577
|
in job_lib.JobStatus.user_code_failure_states() or
|
|
304
578
|
job_status == job_lib.JobStatus.FAILED_DRIVER):
|
|
305
579
|
# The user code has probably crashed, fail immediately.
|
|
306
|
-
|
|
307
|
-
|
|
580
|
+
logger.info(
|
|
581
|
+
f'Task {task_id} failed with status: {job_status}')
|
|
582
|
+
end_time = await context_utils.to_thread(
|
|
583
|
+
managed_job_utils.try_to_get_job_end_time,
|
|
584
|
+
self._backend, cluster_name, job_id_on_pool_cluster)
|
|
308
585
|
logger.info(
|
|
309
586
|
f'The user job failed ({job_status}). Please check the '
|
|
310
587
|
'logs below.\n'
|
|
311
588
|
f'== Logs of the user job (ID: {self._job_id}) ==\n')
|
|
312
589
|
|
|
313
|
-
self._download_log_and_stream
|
|
590
|
+
await context_utils.to_thread(self._download_log_and_stream,
|
|
591
|
+
task_id, handle,
|
|
592
|
+
job_id_on_pool_cluster)
|
|
314
593
|
|
|
315
594
|
failure_reason = (
|
|
316
595
|
'To see the details, run: '
|
|
@@ -346,7 +625,9 @@ class JobsController:
|
|
|
346
625
|
f'[{self._strategy_executor.restart_cnt_on_failure}'
|
|
347
626
|
f'/{max_restarts}]')
|
|
348
627
|
else:
|
|
349
|
-
|
|
628
|
+
logger.info(
|
|
629
|
+
f'Task {task_id} failed and will not be retried')
|
|
630
|
+
await managed_job_state.set_failed_async(
|
|
350
631
|
self._job_id,
|
|
351
632
|
task_id,
|
|
352
633
|
failure_type=managed_job_status,
|
|
@@ -361,7 +642,7 @@ class JobsController:
|
|
|
361
642
|
failure_reason = (
|
|
362
643
|
f'Unknown job status {job_status}. To see the details, '
|
|
363
644
|
f'run: sky jobs logs --controller {self._job_id}')
|
|
364
|
-
managed_job_state.
|
|
645
|
+
await managed_job_state.set_failed_async(
|
|
365
646
|
self._job_id,
|
|
366
647
|
task_id,
|
|
367
648
|
failure_type=managed_job_state.ManagedJobStatus.
|
|
@@ -381,84 +662,131 @@ class JobsController:
|
|
|
381
662
|
if handle is not None:
|
|
382
663
|
resources = handle.launched_resources
|
|
383
664
|
assert resources is not None, handle
|
|
384
|
-
|
|
665
|
+
# If we are forcing to transit to recovering, we need to clean
|
|
666
|
+
# up the cluster as it is possible that we already submitted the
|
|
667
|
+
# job to the worker cluster, but state is not updated yet. In
|
|
668
|
+
# this case, it is possible that we will double-submit the job
|
|
669
|
+
# to the worker cluster. So we always clean up the cluster here.
|
|
670
|
+
# TODO(tian,cooperc): We can check if there is a running job on
|
|
671
|
+
# the worker cluster, and if so, we can skip the cleanup.
|
|
672
|
+
# Challenge: race condition when the worker cluster thought it
|
|
673
|
+
# does not have a running job yet but later the job is launched.
|
|
674
|
+
if (resources.need_cleanup_after_preemption_or_failure() or
|
|
675
|
+
force_transit_to_recovering):
|
|
385
676
|
# Some spot resource (e.g., Spot TPU VM) may need to be
|
|
386
677
|
# cleaned up after preemption, as running launch again on
|
|
387
678
|
# those clusters again may fail.
|
|
388
679
|
logger.info('Cleaning up the preempted or failed cluster'
|
|
389
680
|
'...')
|
|
390
|
-
|
|
681
|
+
await self._cleanup_cluster(cluster_name)
|
|
391
682
|
|
|
392
683
|
# Try to recover the managed jobs, when the cluster is preempted or
|
|
393
684
|
# failed or the job status is failed to be fetched.
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
685
|
+
logger.info(f'Starting recovery for task {task_id}, '
|
|
686
|
+
f'it is currently {job_status}')
|
|
687
|
+
await managed_job_state.set_recovering_async(
|
|
688
|
+
job_id=self._job_id,
|
|
689
|
+
task_id=task_id,
|
|
690
|
+
force_transit_to_recovering=force_transit_to_recovering,
|
|
691
|
+
callback_func=callback_func)
|
|
692
|
+
|
|
693
|
+
recovered_time = await self._strategy_executor.recover()
|
|
694
|
+
|
|
695
|
+
if self._pool is not None:
|
|
696
|
+
cluster_name, job_id_on_pool_cluster = (
|
|
697
|
+
await
|
|
698
|
+
managed_job_state.get_pool_submit_info_async(self._job_id))
|
|
699
|
+
assert cluster_name is not None
|
|
700
|
+
await managed_job_state.set_recovered_async(
|
|
701
|
+
self._job_id,
|
|
702
|
+
task_id,
|
|
703
|
+
recovered_time=recovered_time,
|
|
704
|
+
callback_func=callback_func)
|
|
705
|
+
|
|
706
|
+
async def run(self):
|
|
404
707
|
"""Run controller logic and handle exceptions."""
|
|
708
|
+
logger.info(f'Starting JobsController run for job {self._job_id}')
|
|
405
709
|
task_id = 0
|
|
710
|
+
cancelled = False
|
|
711
|
+
|
|
406
712
|
try:
|
|
407
713
|
succeeded = True
|
|
408
714
|
# We support chain DAGs only for now.
|
|
409
715
|
for task_id, task in enumerate(self._dag.tasks):
|
|
410
|
-
|
|
716
|
+
logger.info(
|
|
717
|
+
f'Processing task {task_id}/{len(self._dag.tasks)-1}: '
|
|
718
|
+
f'{task.name}')
|
|
719
|
+
task_start = time.time()
|
|
720
|
+
succeeded = await self._run_one_task(task_id, task)
|
|
721
|
+
task_time = time.time() - task_start
|
|
722
|
+
logger.info(f'Task {task_id} completed in {task_time:.2f}s '
|
|
723
|
+
f'with success={succeeded}')
|
|
724
|
+
|
|
411
725
|
if not succeeded:
|
|
726
|
+
logger.info(f'Task {task_id} failed, stopping execution')
|
|
412
727
|
break
|
|
728
|
+
|
|
413
729
|
except exceptions.ProvisionPrechecksError as e:
|
|
414
730
|
# Please refer to the docstring of self._run_one_task for the cases when
|
|
415
731
|
# this exception can occur.
|
|
732
|
+
logger.error(f'Provision prechecks failed for task {task_id}')
|
|
416
733
|
failure_reason = ('; '.join(
|
|
417
734
|
common_utils.format_exception(reason, use_bracket=True)
|
|
418
735
|
for reason in e.reasons))
|
|
419
736
|
logger.error(failure_reason)
|
|
420
|
-
self._update_failed_task_state(
|
|
737
|
+
await self._update_failed_task_state(
|
|
421
738
|
task_id, managed_job_state.ManagedJobStatus.FAILED_PRECHECKS,
|
|
422
739
|
failure_reason)
|
|
423
740
|
except exceptions.ManagedJobReachedMaxRetriesError as e:
|
|
424
741
|
# Please refer to the docstring of self._run_one_task for the cases when
|
|
425
742
|
# this exception can occur.
|
|
743
|
+
logger.error(f'Managed job reached max retries for task {task_id}')
|
|
426
744
|
failure_reason = common_utils.format_exception(e)
|
|
427
745
|
logger.error(failure_reason)
|
|
428
746
|
# The managed job should be marked as FAILED_NO_RESOURCE, as the
|
|
429
747
|
# managed job may be able to launch next time.
|
|
430
|
-
self._update_failed_task_state(
|
|
748
|
+
await self._update_failed_task_state(
|
|
431
749
|
task_id, managed_job_state.ManagedJobStatus.FAILED_NO_RESOURCE,
|
|
432
750
|
failure_reason)
|
|
751
|
+
except asyncio.CancelledError: # pylint: disable=try-except-raise
|
|
752
|
+
# have this here to avoid getting caught by the general except block
|
|
753
|
+
# below.
|
|
754
|
+
cancelled = True
|
|
755
|
+
raise
|
|
433
756
|
except (Exception, SystemExit) as e: # pylint: disable=broad-except
|
|
757
|
+
logger.error(
|
|
758
|
+
f'Unexpected error in JobsController run for task {task_id}')
|
|
434
759
|
with ux_utils.enable_traceback():
|
|
435
760
|
logger.error(traceback.format_exc())
|
|
436
761
|
msg = ('Unexpected error occurred: ' +
|
|
437
762
|
common_utils.format_exception(e, use_bracket=True))
|
|
438
763
|
logger.error(msg)
|
|
439
|
-
self._update_failed_task_state(
|
|
764
|
+
await self._update_failed_task_state(
|
|
440
765
|
task_id, managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
|
|
441
766
|
msg)
|
|
442
767
|
finally:
|
|
443
|
-
# This will set all unfinished tasks to CANCELLING, and will not
|
|
444
|
-
# affect the jobs in terminal states.
|
|
445
|
-
# We need to call set_cancelling before set_cancelled to make sure
|
|
446
|
-
# the table entries are correctly set.
|
|
447
768
|
callback_func = managed_job_utils.event_callback_func(
|
|
448
769
|
job_id=self._job_id,
|
|
449
770
|
task_id=task_id,
|
|
450
771
|
task=self._dag.tasks[task_id])
|
|
451
|
-
managed_job_state.
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
772
|
+
await managed_job_state.set_cancelling_async(
|
|
773
|
+
job_id=self._job_id, callback_func=callback_func)
|
|
774
|
+
if not cancelled:
|
|
775
|
+
# the others haven't been run yet so we can set them to
|
|
776
|
+
# cancelled immediately (no resources to clean up).
|
|
777
|
+
# if we are running and get cancelled, we need to clean up the
|
|
778
|
+
# resources first so this will be done later.
|
|
779
|
+
await managed_job_state.set_cancelled_async(
|
|
780
|
+
job_id=self._job_id, callback_func=callback_func)
|
|
455
781
|
|
|
456
|
-
def _update_failed_task_state(
|
|
782
|
+
async def _update_failed_task_state(
|
|
457
783
|
self, task_id: int,
|
|
458
784
|
failure_type: managed_job_state.ManagedJobStatus,
|
|
459
785
|
failure_reason: str):
|
|
460
786
|
"""Update the state of the failed task."""
|
|
461
|
-
|
|
787
|
+
logger.info(f'Updating failed task state: task_id={task_id}, '
|
|
788
|
+
f'failure_type={failure_type}')
|
|
789
|
+
await managed_job_state.set_failed_async(
|
|
462
790
|
self._job_id,
|
|
463
791
|
task_id=task_id,
|
|
464
792
|
failure_type=failure_type,
|
|
@@ -469,176 +797,421 @@ class JobsController:
|
|
|
469
797
|
task=self._dag.tasks[task_id]))
|
|
470
798
|
|
|
471
799
|
|
|
472
|
-
|
|
473
|
-
"""
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
jobs_controller = JobsController(job_id, dag_yaml)
|
|
477
|
-
jobs_controller.run()
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
def _handle_signal(job_id):
|
|
481
|
-
"""Handle the signal if the user sent it."""
|
|
482
|
-
signal_file = pathlib.Path(
|
|
483
|
-
managed_job_utils.SIGNAL_FILE_PREFIX.format(job_id))
|
|
484
|
-
user_signal = None
|
|
485
|
-
if signal_file.exists():
|
|
486
|
-
# Filelock is needed to prevent race condition with concurrent
|
|
487
|
-
# signal writing.
|
|
488
|
-
with filelock.FileLock(str(signal_file) + '.lock'):
|
|
489
|
-
with signal_file.open(mode='r', encoding='utf-8') as f:
|
|
490
|
-
user_signal = f.read().strip()
|
|
491
|
-
try:
|
|
492
|
-
user_signal = managed_job_utils.UserSignal(user_signal)
|
|
493
|
-
except ValueError:
|
|
494
|
-
logger.warning(
|
|
495
|
-
f'Unknown signal received: {user_signal}. Ignoring.')
|
|
496
|
-
user_signal = None
|
|
497
|
-
# Remove the signal file, after reading the signal.
|
|
498
|
-
signal_file.unlink()
|
|
499
|
-
if user_signal is None:
|
|
500
|
-
# None or empty string.
|
|
501
|
-
return
|
|
502
|
-
assert user_signal == managed_job_utils.UserSignal.CANCEL, (
|
|
503
|
-
f'Only cancel signal is supported, but {user_signal} got.')
|
|
504
|
-
raise exceptions.ManagedJobUserCancelledError(
|
|
505
|
-
f'User sent {user_signal.value} signal.')
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
def _cleanup(job_id: int, dag_yaml: str):
|
|
509
|
-
"""Clean up the cluster(s) and storages.
|
|
510
|
-
|
|
511
|
-
(1) Clean up the succeeded task(s)' ephemeral storage. The storage has
|
|
512
|
-
to be cleaned up after the whole job is finished, as the tasks
|
|
513
|
-
may share the same storage.
|
|
514
|
-
(2) Clean up the cluster(s) that are not cleaned up yet, which can happen
|
|
515
|
-
when the task failed or cancelled. At most one cluster should be left
|
|
516
|
-
when reaching here, as we currently only support chain DAGs, and only
|
|
517
|
-
task is executed at a time.
|
|
800
|
+
class ControllerManager:
|
|
801
|
+
"""Main loop for a job controller process.
|
|
802
|
+
|
|
803
|
+
Many jobs will be handled by this, each by a single JobController.
|
|
518
804
|
"""
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
#
|
|
527
|
-
#
|
|
528
|
-
|
|
529
|
-
#
|
|
530
|
-
#
|
|
531
|
-
#
|
|
532
|
-
#
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
805
|
+
|
|
806
|
+
def __init__(self, controller_uuid: str) -> None:
|
|
807
|
+
self._controller_uuid = controller_uuid
|
|
808
|
+
# Global state for active jobs
|
|
809
|
+
self.job_tasks: Dict[int, asyncio.Task] = {}
|
|
810
|
+
self.starting: Set[int] = set()
|
|
811
|
+
|
|
812
|
+
# Lock for synchronizing access to global state dictionary
|
|
813
|
+
# Must always hold _job_tasks_lock when accessing the _starting_signal.
|
|
814
|
+
self._job_tasks_lock = asyncio.Lock()
|
|
815
|
+
# We signal whenever a job leaves the api server launching state. Feel
|
|
816
|
+
# free to signal as much as you want to be safe from leaks (if you
|
|
817
|
+
# do not signal enough there may be some jobs forever waiting to
|
|
818
|
+
# launch).
|
|
819
|
+
self._starting_signal = asyncio.Condition(lock=self._job_tasks_lock)
|
|
820
|
+
|
|
821
|
+
self._pid = os.getpid()
|
|
822
|
+
self._pid_started_at = psutil.Process(self._pid).create_time()
|
|
823
|
+
|
|
824
|
+
async def _cleanup(self, job_id: int, pool: Optional[str] = None):
    """Clean up the cluster(s) and storages.

    (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
        to be cleaned up after the whole job is finished, as the tasks
        may share the same storage.
    (2) Clean up the cluster(s) that are not cleaned up yet, which can
        happen when the task failed or cancelled. At most one cluster
        should be left when reaching here, as we currently only support
        chain DAGs, and only one task is executed at a time.

    Args:
        job_id: ID of the managed job to clean up after.
        pool: If None, the job's own cluster is terminated. Otherwise the
            job submitted to the pool cluster is cancelled instead.

    Raises:
        Exception: The last error recorded while cleaning up any task.
            Cleanup of the remaining tasks is still attempted first.
    """
    # Cleanup the HA recovery script first as it is possible that some error
    # was raised when we construct the task object (e.g.,
    # sky.exceptions.ResourcesUnavailableError).
    await managed_job_state.remove_ha_recovery_script_async(job_id)

    def task_cleanup(task: 'sky.Task', job_id: int):
        """Blocking, best-effort cleanup of a single task's resources."""
        assert task.name is not None, task
        error = None
        # Bind the name before the try block: the except handler below
        # formats it into its log message, and an exception raised before
        # the assignment would otherwise surface as UnboundLocalError and
        # skip the storage / file mount cleanup that follows.
        cluster_name = None

        try:
            if pool is None:
                cluster_name = (
                    managed_job_utils.generate_managed_job_cluster_name(
                        task.name, job_id))
                managed_job_utils.terminate_cluster(cluster_name)
                status = core.status(cluster_names=[cluster_name],
                                     all_users=True)
                assert (len(status) == 0 or
                        status[0]['status'] == sky.ClusterStatus.STOPPED), (
                            f'{cluster_name} is not down: {status}')
                logger.info(f'{cluster_name} is down')
            else:
                cluster_name, job_id_on_pool_cluster = (
                    managed_job_state.get_pool_submit_info(job_id))
                if cluster_name is not None:
                    if job_id_on_pool_cluster is not None:
                        core.cancel(cluster_name=cluster_name,
                                    job_ids=[job_id_on_pool_cluster],
                                    _try_cancel_if_cluster_is_init=True)
        except Exception as e:  # pylint: disable=broad-except
            error = e
            logger.warning(
                f'Failed to terminate cluster {cluster_name}: {e}')
            # we continue to try cleaning up whatever else we can.

        # Clean up Storages with persistent=False.
        # TODO(zhwu): this assumes the specific backend.
        backend = cloud_vm_ray_backend.CloudVmRayBackend()
        # Need to re-construct storage object in the controller process
        # because when SkyPilot API server machine sends the yaml config to
        # the controller machine, only storage metadata is sent, not the
        # storage object itself.
        try:
            for storage in task.storage_mounts.values():
                storage.construct()
        except (exceptions.StorageSpecError, exceptions.StorageError) as e:
            # Not recorded in `error`: teardown is still attempted below.
            logger.warning(
                f'Failed to construct storage object for teardown: {e}\n'
                'This may happen because storage construction already '
                'failed during launch, storage was deleted externally, '
                'credentials expired/changed, or network connectivity '
                'issues.')
        try:
            backend.teardown_ephemeral_storage(task)
        except Exception as e:  # pylint: disable=broad-except
            error = e
            logger.warning(f'Failed to teardown ephemeral storage: {e}')
            # we continue to try cleaning up whatever else we can.

        # Clean up any files mounted from the local disk, such as two-hop
        # file mounts.
        for file_mount in (task.file_mounts or {}).values():
            try:
                # For consolidation mode, there is no two-hop file mounts
                # and the file path here represents the real user data.
                # We skip the cleanup for consolidation mode.
                if (not data_utils.is_cloud_store_url(file_mount) and
                        not managed_job_utils.is_consolidation_mode()):
                    path = os.path.expanduser(file_mount)
                    if os.path.isdir(path):
                        shutil.rmtree(path)
                    else:
                        os.remove(path)
            except Exception as e:  # pylint: disable=broad-except
                # File mount failures are only logged, never re-raised.
                logger.warning(
                    f'Failed to clean up file mount {file_mount}: {e}')

        if error is not None:
            raise error

    dag = _get_dag(job_id)
    error = None
    for task in dag.tasks:
        # most things in this function are blocking
        try:
            await context_utils.to_thread(task_cleanup, task, job_id)
        except Exception as e:  # pylint: disable=broad-except
            error = e

    if error is not None:
        # we only raise the last error that occurred, but its fine to lose
        # some data here.
        raise error
|
|
927
|
+
|
|
928
|
+
# Use context.contextual to enable per-job output redirection and env var
|
|
929
|
+
# isolation.
|
|
930
|
+
@context.contextual_async
async def run_job_loop(self,
                       job_id: int,
                       log_file: str,
                       pool: Optional[str] = None):
    """Background task that runs the job loop.

    Redirects this context's log output to ``log_file``, applies the job's
    stored environment variables (if any), runs a ``JobController`` for the
    job as an asyncio task registered in ``self.job_tasks``, and, whatever
    the outcome, runs cleanup and the final status bookkeeping.

    Args:
        job_id: ID of the managed job to run.
        log_file: Path that this context's log output is redirected to.
        pool: Optional pool name, forwarded to ``JobController`` and to
            ``_cleanup``.

    Raises:
        asyncio.CancelledError: Re-raised after the job has been marked as
            cancelling.
        Exception: Any other error from the job loop is logged and
            re-raised, including the ``ValueError`` raised when ``job_id``
            is already present in ``self.job_tasks``.
    """
    ctx = context.get()
    assert ctx is not None, 'Context is not initialized'
    ctx.redirect_log(pathlib.Path(log_file))

    logger.info(f'Starting job loop for {job_id}')
    logger.info(f' log_file={log_file}')
    logger.info(f' pool={pool}')
    logger.info(f'From controller {self._controller_uuid}')
    logger.info(f' pid={self._pid}')

    env_content = file_content_utils.get_job_env_content(job_id)
    if env_content:
        try:
            # The stored content is parsed as dotenv text.
            env_vars = dotenv.dotenv_values(stream=io.StringIO(env_content))
            logger.info('Loading %d environment variables for job %s',
                        len(env_vars), job_id)
            if ctx is not None:
                for key, value in env_vars.items():
                    # Entries whose value is None are skipped.
                    if value is not None:
                        ctx.override_envs({key: value})
                        # NOTE(review): this logs the variable's value as
                        # well as its name -- confirm that no secrets can
                        # end up in the job log this way.
                        logger.debug('Set environment variable: %s=%s', key,
                                     value)

                # Restore config file if needed
                file_content_utils.restore_job_config_file(job_id)

                skypilot_config.reload_config()
            else:  # pragma: no cover - defensive
                logger.error('Context is None, cannot set environment '
                             'variables')
        except Exception as e:  # pylint: disable=broad-except
            # Failing to load the environment is logged but does not stop
            # the job loop from running.
            logger.error(
                'Failed to load environment variables for job %s: '
                '%s', job_id, e)

    # Becomes True only after the job has been marked as cancelling; the
    # finally block uses it to decide whether to mark the job as cancelled.
    cancelling = False
    try:
        controller = JobController(job_id, self.starting,
                                   self._job_tasks_lock,
                                   self._starting_signal, pool)

        async with self._job_tasks_lock:
            if job_id in self.job_tasks:
                logger.error(f'Job {job_id} already exists in job_tasks')
                raise ValueError(f'Job {job_id} already exists')

            # Create the task and store it
            # This function should return instantly and run the job loop in
            # the background.
            task = asyncio.create_task(controller.run())
            self.job_tasks[job_id] = task
        # The lock is released before waiting for the job task to finish.
        await task
    except asyncio.CancelledError:
        logger.info(f'Job {job_id} was cancelled')
        # `dag` and `task_id` are read again in the finally block when
        # `cancelling` is True.
        dag = _get_dag(job_id)
        task_id, _ = await (
            managed_job_state.get_latest_task_id_status_async(job_id))
        assert task_id is not None, job_id
        logger.info(f'Cancelling managed job, job_id: {job_id}, '
                    f'task_id: {task_id}')
        await managed_job_state.set_cancelling_async(
            job_id=job_id,
            callback_func=managed_job_utils.event_callback_func(
                job_id=job_id, task_id=task_id, task=dag.tasks[task_id]))
        cancelling = True
        raise
    except Exception as e:
        logger.error(f'Unexpected error in job loop for {job_id}: '
                     f'{common_utils.format_exception(e)}')
        raise
    finally:
        try:
            await self._cleanup(job_id, pool=pool)
            logger.info(
                f'Cluster of managed job {job_id} has been cleaned up.')
        except Exception as e:  # pylint: disable=broad-except
            # A cleanup failure is recorded on the job as a controller
            # failure rather than propagated.
            failure_reason = ('Failed to clean up: '
                              f'{common_utils.format_exception(e)}')
            await managed_job_state.set_failed_async(
                job_id,
                task_id=None,
                failure_type=managed_job_state.ManagedJobStatus.
                FAILED_CONTROLLER,
                failure_reason=failure_reason,
                override_terminal=True)

        if cancelling:
            # Since it's set with cancelling
            assert task_id is not None, job_id
            await managed_job_state.set_cancelled_async(
                job_id=job_id,
                callback_func=managed_job_utils.event_callback_func(
                    job_id=job_id, task_id=task_id,
                    task=dag.tasks[task_id]))

        # We should check job status after 'set_cancelled', otherwise
        # the job status is not terminal.
        job_status = await managed_job_state.get_status_async(job_id)
        assert job_status is not None
        # The job can be non-terminal if the controller exited abnormally,
        # e.g. failed to launch cluster after reaching the MAX_RETRY.
        if not job_status.is_terminal():
            logger.info(f'Previous job status: {job_status.value}')
            await managed_job_state.set_failed_async(
                job_id,
                task_id=None,
                failure_type=managed_job_state.ManagedJobStatus.
                FAILED_CONTROLLER,
                failure_reason=(
                    'Unexpected error occurred. For details, '
                    f'run: sky jobs logs --controller {job_id}'))

        await scheduler.job_done_async(job_id)

        async with self._job_tasks_lock:
            try:
                # just in case we were cancelled or some other error
                # occurred during launch
                self.starting.remove(job_id)
                # its fine if we notify again, better to wake someone up
                # and have them go to sleep again, then have some stuck
                # sleeping.
                self._starting_signal.notify()
            except KeyError:
                # The job was no longer in the starting set; nothing to do.
                pass

        # Remove the job from the job_tasks dictionary.
        async with self._job_tasks_lock:
            if job_id in self.job_tasks:
                del self.job_tasks[job_id]
|
|
1066
|
+
|
|
1067
|
+
async def start_job(
    self,
    job_id: int,
    pool: Optional[str] = None,
):
    """Mark a job as starting and launch its job loop in the background.

    Args:
        job_id: The ID of the job to start.
        pool: Optional pool name, handed through to ``run_job_loop``.
    """
    # Each job writes its controller output to <logs dir>/<job_id>.log.
    logs_root = os.path.expanduser(jobs_constants.JOBS_CONTROLLER_LOGS_DIR)
    os.makedirs(logs_root, exist_ok=True)
    log_file = os.path.join(logs_root, f'{job_id}.log')

    logger.info(f'Starting job {job_id} with log_file={log_file}')

    # Record the job as starting before its loop is scheduled.
    async with self._job_tasks_lock:
        self.starting.add(job_id)

    job_loop = self.run_job_loop(job_id, log_file, pool)
    await create_background_task(job_loop)

    logger.info(f'Job {job_id} started successfully')
|
|
1089
|
+
|
|
1090
|
+
async def cancel_job(self):
    """Poll the cancel-signal directory and cancel matching running jobs.

    Every 15 seconds the entries of
    ``jobs_constants.CONSOLIDATED_SIGNAL_PATH`` are listed. Each entry is
    expected to be named after a job id. If that job has a task in
    ``self.job_tasks``, the task is cancelled and the signal file is
    removed; signals for jobs not in ``self.job_tasks`` are left in place.

    This coroutine loops forever and never returns.
    """
    signal_dir = jobs_constants.CONSOLIDATED_SIGNAL_PATH
    while True:
        for cancel in os.listdir(signal_dir):
            try:
                job_id = int(cancel)
            except ValueError:
                # A stray entry that is not a job id must not bring down
                # the cancellation loop (and with it the controller).
                logger.warning(
                    f'Ignoring unexpected entry in {signal_dir}: '
                    f'{cancel!r}')
                continue

            async with self._job_tasks_lock:
                if job_id not in self.job_tasks:
                    continue

                logger.info(f'Cancelling job {job_id}')

                task = self.job_tasks[job_id]

                # Only request cancellation here; the job task handles it
                # in the background, so this loop can move on immediately.
                task.cancel()
                logger.info(f'Job {job_id} cancelled successfully')

                # Remove the entry that was actually listed rather than a
                # path rebuilt from the parsed integer, so names such as
                # '007' are removed as well.
                try:
                    os.remove(os.path.join(signal_dir, cancel))
                except FileNotFoundError:
                    # The signal file is already gone; nothing left to do.
                    pass
        await asyncio.sleep(15)
|
|
1110
|
+
|
|
1111
|
+
async def monitor_loop(self):
    """Claim waiting jobs and start them while capacity allows.

    Loops forever. Each round it backs off when too many jobs are still
    starting or too many job tasks are running; otherwise it asks for a
    waiting job and either starts it or, if a cancel signal exists for a
    job that is still PENDING, marks it cancelled without starting it.
    """
    logger.info(f'Starting monitor loop for pid {self._pid}...')

    while True:
        async with self._job_tasks_lock:
            active_tasks = [
                job_task for job_task in self.job_tasks.values()
                if not job_task.done()
            ]

        async with self._job_tasks_lock:
            num_starting = len(self.starting)

        if num_starting >= controller_utils.LAUNCHES_PER_WORKER:
            # launching a job takes around 1 minute, so lets wait half that
            # time
            await asyncio.sleep(30)
            continue

        # The per-controller cap is lowered when there are many
        # controllers, so that the total across all of them stays within
        # MAX_TOTAL_RUNNING_JOBS.
        fair_share = (controller_utils.MAX_TOTAL_RUNNING_JOBS //
                      controller_utils.get_number_of_jobs_controllers())
        job_limit = min(controller_utils.MAX_JOBS_PER_WORKER, fair_share)

        if len(active_tasks) >= job_limit:
            logger.info('Too many jobs running, waiting for 60 seconds')
            await asyncio.sleep(60)
            continue

        # Check if there are any jobs that are waiting to launch
        try:
            claimed = await managed_job_state.get_waiting_job_async(
                pid=self._pid, pid_started_at=self._pid_started_at)
        except Exception as e:  # pylint: disable=broad-except
            logger.error(f'Failed to get waiting job: {e}')
            await asyncio.sleep(5)
            continue

        if claimed is None:
            logger.info('No waiting job, waiting for 10 seconds')
            await asyncio.sleep(10)
            continue

        job_id = claimed['job_id']
        logger.info(f'Claiming job {job_id}')
        pool = claimed.get('pool')

        # A cancel signal may already exist for the job that was just
        # claimed; a job that is still PENDING is cancelled right here
        # instead of being started.
        signal_names = os.listdir(jobs_constants.CONSOLIDATED_SIGNAL_PATH)
        if str(job_id) in signal_names:
            status = await managed_job_state.get_status_async(job_id)
            if status == managed_job_state.ManagedJobStatus.PENDING:
                logger.info(f'Job {job_id} cancelled')
                os.remove(f'{jobs_constants.CONSOLIDATED_SIGNAL_PATH}/'
                          f'{job_id}')
                await managed_job_state.set_cancelling_async(
                    job_id=job_id,
                    callback_func=managed_job_utils.event_callback_func(
                        job_id=job_id, task_id=None, task=None))
                await managed_job_state.set_cancelled_async(
                    job_id=job_id,
                    callback_func=managed_job_utils.event_callback_func(
                        job_id=job_id, task_id=None, task=None))
                continue

        await self.start_job(job_id, pool)
|
|
1179
|
+
|
|
1180
|
+
|
|
1181
|
+
async def main(controller_uuid: str):
    """Run the controller: the cancellation and monitor loops, plus log GC.

    Creates a ``ControllerManager``, makes sure the cancel-signal directory
    exists, tries to raise the open-file soft limit to the hard limit,
    starts the log garbage collector in a daemon thread, and then waits on
    the cancellation and monitor loops.

    Args:
        controller_uuid: Identifier passed to ``ControllerManager``.

    Raises:
        SystemExit: With exit code 1 if either loop raises an exception.
    """
    logger.info(f'Starting controller {controller_uuid}')

    context_utils.hijack_sys_attrs()

    controller = ControllerManager(controller_uuid)

    # exist_ok makes this safe to repeat on every controller start.
    os.makedirs(jobs_constants.CONSOLIDATED_SIGNAL_PATH, exist_ok=True)

    # Increase number of files we can open.
    # Both names are bound up front so that the warning below can always be
    # formatted, even if getrlimit itself fails.
    soft = None
    hard = None
    try:
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        logger.info(f'Current rlimits for NOFILE: soft={soft}, hard={hard}')
        logger.info(f'Increasing soft limit to {hard}')
        resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
    except (OSError, ValueError) as e:
        # setrlimit reports a rejected limit as ValueError and other system
        # call failures as OSError; neither should stop the controller.
        logger.warning(f'Failed to increase number of files we can open: {e}\n'
                       f'Current soft limit: {soft}, hard limit: {hard}')

    # Will loop forever, do it in the background
    cancel_job_task = asyncio.create_task(controller.cancel_job())
    monitor_loop_task = asyncio.create_task(controller.monitor_loop())
    # Run the garbage collector in a dedicated daemon thread to avoid affecting
    # the main event loop.
    gc_thread = threading.Thread(target=log_gc.elect_for_log_gc, daemon=True)
    gc_thread.start()
    try:
        await asyncio.gather(cancel_job_task, monitor_loop_task)
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f'Controller server crashed: {e}')
        sys.exit(1)
|
|
629
1214
|
|
|
630
1215
|
|
|
631
1216
|
if __name__ == '__main__':
    # The controller UUID is read from the first command-line argument and
    # passed to main(), which is run to completion by asyncio.run().
    asyncio.run(main(sys.argv[1]))
|