skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
"""SDK functions for managed jobs."""
|
|
2
|
+
import concurrent.futures
|
|
3
|
+
import copy
|
|
4
|
+
import ipaddress
|
|
2
5
|
import os
|
|
3
|
-
import
|
|
4
|
-
import subprocess
|
|
6
|
+
import pathlib
|
|
5
7
|
import tempfile
|
|
6
|
-
import time
|
|
7
8
|
import typing
|
|
8
9
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
10
|
+
from urllib import parse as urlparse
|
|
9
11
|
import uuid
|
|
10
12
|
|
|
11
13
|
import colorama
|
|
@@ -17,13 +19,23 @@ from sky import execution
|
|
|
17
19
|
from sky import global_user_state
|
|
18
20
|
from sky import provision as provision_lib
|
|
19
21
|
from sky import sky_logging
|
|
22
|
+
from sky import skypilot_config
|
|
20
23
|
from sky import task as task_lib
|
|
24
|
+
from sky.adaptors import common as adaptors_common
|
|
21
25
|
from sky.backends import backend_utils
|
|
22
|
-
from sky.
|
|
26
|
+
from sky.backends import cloud_vm_ray_backend
|
|
27
|
+
from sky.catalog import common as service_catalog_common
|
|
23
28
|
from sky.data import storage as storage_lib
|
|
24
29
|
from sky.jobs import constants as managed_job_constants
|
|
30
|
+
from sky.jobs import state as managed_job_state
|
|
25
31
|
from sky.jobs import utils as managed_job_utils
|
|
32
|
+
from sky.metrics import utils as metrics_lib
|
|
26
33
|
from sky.provision import common as provision_common
|
|
34
|
+
from sky.schemas.api import responses
|
|
35
|
+
from sky.serve import serve_state
|
|
36
|
+
from sky.serve import serve_utils
|
|
37
|
+
from sky.serve.server import impl
|
|
38
|
+
from sky.server.requests import request_names
|
|
27
39
|
from sky.skylet import constants as skylet_constants
|
|
28
40
|
from sky.usage import usage_lib
|
|
29
41
|
from sky.utils import admin_policy_utils
|
|
@@ -36,21 +48,153 @@ from sky.utils import status_lib
|
|
|
36
48
|
from sky.utils import subprocess_utils
|
|
37
49
|
from sky.utils import timeline
|
|
38
50
|
from sky.utils import ux_utils
|
|
51
|
+
from sky.workspaces import core as workspaces_core
|
|
39
52
|
|
|
40
53
|
if typing.TYPE_CHECKING:
|
|
54
|
+
from google.protobuf import json_format
|
|
55
|
+
|
|
41
56
|
import sky
|
|
42
|
-
from sky.
|
|
57
|
+
from sky.schemas.generated import managed_jobsv1_pb2
|
|
58
|
+
else:
|
|
59
|
+
json_format = adaptors_common.LazyImport('google.protobuf.json_format')
|
|
60
|
+
|
|
61
|
+
managed_jobsv1_pb2 = adaptors_common.LazyImport(
|
|
62
|
+
'sky.schemas.generated.managed_jobsv1_pb2')
|
|
43
63
|
|
|
44
64
|
logger = sky_logging.init_logger(__name__)
|
|
45
65
|
|
|
66
|
+
_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
|
|
67
|
+
'job_id',
|
|
68
|
+
'task_id',
|
|
69
|
+
'workspace',
|
|
70
|
+
'job_name',
|
|
71
|
+
'task_name',
|
|
72
|
+
'resources',
|
|
73
|
+
'submitted_at',
|
|
74
|
+
'end_at',
|
|
75
|
+
'job_duration',
|
|
76
|
+
'recovery_count',
|
|
77
|
+
'status',
|
|
78
|
+
'pool',
|
|
79
|
+
'current_cluster_name',
|
|
80
|
+
'job_id_on_pool_cluster',
|
|
81
|
+
'start_at',
|
|
82
|
+
'infra',
|
|
83
|
+
'cloud',
|
|
84
|
+
'region',
|
|
85
|
+
'zone',
|
|
86
|
+
'cluster_resources',
|
|
87
|
+
'schedule_state',
|
|
88
|
+
'details',
|
|
89
|
+
'failure_reason',
|
|
90
|
+
'metadata',
|
|
91
|
+
'user_name',
|
|
92
|
+
'user_hash',
|
|
93
|
+
]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
|
|
97
|
+
"""Upload files to the controller.
|
|
98
|
+
|
|
99
|
+
In consolidation mode, we still need to upload files to the controller as
|
|
100
|
+
we should keep a separate workdir for each jobs. Assuming two jobs using
|
|
101
|
+
the same workdir, if there are some modifications to the workdir after job 1
|
|
102
|
+
is submitted, on recovery of job 1, the modifications should not be applied.
|
|
103
|
+
"""
|
|
104
|
+
local_to_controller_file_mounts: Dict[str, str] = {}
|
|
105
|
+
|
|
106
|
+
# For consolidation mode, we don't need to use cloud storage,
|
|
107
|
+
# as uploading to the controller is only a local copy.
|
|
108
|
+
storage_clouds = (
|
|
109
|
+
storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
|
|
110
|
+
force_disable_cloud_bucket = skypilot_config.get_nested(
|
|
111
|
+
('jobs', 'force_disable_cloud_bucket'), False)
|
|
112
|
+
if (not managed_job_utils.is_consolidation_mode() and storage_clouds and
|
|
113
|
+
not force_disable_cloud_bucket):
|
|
114
|
+
for task_ in dag.tasks:
|
|
115
|
+
controller_utils.maybe_translate_local_file_mounts_and_sync_up(
|
|
116
|
+
task_, task_type='jobs')
|
|
117
|
+
else:
|
|
118
|
+
# We do not have any cloud storage available, so fall back to
|
|
119
|
+
# two-hop file_mount uploading.
|
|
120
|
+
# Note: we can't easily hack sync_storage_mounts() to upload
|
|
121
|
+
# directly to the controller, because the controller may not
|
|
122
|
+
# even be up yet.
|
|
123
|
+
for task_ in dag.tasks:
|
|
124
|
+
if task_.storage_mounts and not storage_clouds:
|
|
125
|
+
# Technically, we could convert COPY storage_mounts that
|
|
126
|
+
# have a local source and do not specify `store`, but we
|
|
127
|
+
# will not do that for now. Only plain file_mounts are
|
|
128
|
+
# supported.
|
|
129
|
+
raise exceptions.NotSupportedError(
|
|
130
|
+
'Cloud-based file_mounts are specified, but no cloud '
|
|
131
|
+
'storage is available. Please specify local '
|
|
132
|
+
'file_mounts only.')
|
|
133
|
+
|
|
134
|
+
# Merge file mounts from all tasks.
|
|
135
|
+
local_to_controller_file_mounts.update(
|
|
136
|
+
controller_utils.translate_local_file_mounts_to_two_hop(task_))
|
|
137
|
+
|
|
138
|
+
return local_to_controller_file_mounts
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
|
|
142
|
+
num_jobs: int) -> Optional[List[int]]:
|
|
143
|
+
"""Submit the managed job locally if in consolidation mode.
|
|
144
|
+
|
|
145
|
+
In normal mode the managed job submission is done in the ray job submission.
|
|
146
|
+
For consolidation mode, we need to manually submit it. Check the following
|
|
147
|
+
function for the normal mode submission:
|
|
148
|
+
sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend,
|
|
149
|
+
_exec_code_on_head::_maybe_add_managed_job_code
|
|
150
|
+
"""
|
|
151
|
+
if not managed_job_utils.is_consolidation_mode():
|
|
152
|
+
return None
|
|
153
|
+
|
|
154
|
+
# Create local directory for the managed job.
|
|
155
|
+
pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
|
|
156
|
+
job_ids = []
|
|
157
|
+
pool = dag.pool
|
|
158
|
+
pool_hash = None
|
|
159
|
+
if pool is not None:
|
|
160
|
+
pool_hash = serve_state.get_service_hash(pool)
|
|
161
|
+
# Already checked in the sdk.
|
|
162
|
+
assert pool_hash is not None, f'Pool {pool} not found'
|
|
163
|
+
for _ in range(num_jobs):
|
|
164
|
+
# TODO(tian): We should have a separate name for each job when
|
|
165
|
+
# submitting multiple jobs. Current blocker is that we are sharing
|
|
166
|
+
# the same dag object for all jobs. Maybe we can do copy.copy() for
|
|
167
|
+
# each job and then give it a unique name (e.g. append job id after
|
|
168
|
+
# the task name). The name of the dag also needs to be aligned with
|
|
169
|
+
# the task name.
|
|
170
|
+
consolidation_mode_job_id = (
|
|
171
|
+
managed_job_state.set_job_info_without_job_id(
|
|
172
|
+
dag.name,
|
|
173
|
+
workspace=skypilot_config.get_active_workspace(
|
|
174
|
+
force_user_workspace=True),
|
|
175
|
+
entrypoint=common_utils.get_current_command(),
|
|
176
|
+
pool=pool,
|
|
177
|
+
pool_hash=pool_hash,
|
|
178
|
+
user_hash=common_utils.get_user_hash()))
|
|
179
|
+
for task_id, task in enumerate(dag.tasks):
|
|
180
|
+
resources_str = backend_utils.get_task_resources_str(
|
|
181
|
+
task, is_managed_job=True)
|
|
182
|
+
managed_job_state.set_pending(consolidation_mode_job_id, task_id,
|
|
183
|
+
task.name, resources_str,
|
|
184
|
+
task.metadata_json)
|
|
185
|
+
job_ids.append(consolidation_mode_job_id)
|
|
186
|
+
return job_ids
|
|
187
|
+
|
|
46
188
|
|
|
47
189
|
@timeline.event
|
|
48
190
|
@usage_lib.entrypoint
|
|
49
191
|
def launch(
|
|
50
192
|
task: Union['sky.Task', 'sky.Dag'],
|
|
51
193
|
name: Optional[str] = None,
|
|
194
|
+
pool: Optional[str] = None,
|
|
195
|
+
num_jobs: Optional[int] = None,
|
|
52
196
|
stream_logs: bool = True,
|
|
53
|
-
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
197
|
+
) -> Tuple[Optional[Union[int, List[int]]], Optional[backends.ResourceHandle]]:
|
|
54
198
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
55
199
|
"""Launches a managed job.
|
|
56
200
|
|
|
@@ -76,21 +220,58 @@ def launch(
|
|
|
76
220
|
None if dryrun.
|
|
77
221
|
"""
|
|
78
222
|
entrypoint = task
|
|
223
|
+
# using hasattr instead of isinstance to avoid importing sky
|
|
224
|
+
if hasattr(task, 'metadata'):
|
|
225
|
+
metadata = task.metadata
|
|
226
|
+
else:
|
|
227
|
+
# we are a Dag, not a Task
|
|
228
|
+
if len(task.tasks) == 1:
|
|
229
|
+
metadata = task.tasks[0].metadata
|
|
230
|
+
else:
|
|
231
|
+
# doesn't make sense to have a git commit since there might be
|
|
232
|
+
# different metadatas for each task
|
|
233
|
+
metadata = {}
|
|
234
|
+
|
|
79
235
|
dag_uuid = str(uuid.uuid4().hex[:4])
|
|
80
236
|
dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
|
|
237
|
+
|
|
81
238
|
# Always apply the policy again here, even though it might have been applied
|
|
82
239
|
# in the CLI. This is to ensure that we apply the policy to the final DAG
|
|
83
240
|
# and get the mutated config.
|
|
84
241
|
dag, mutated_user_config = admin_policy_utils.apply(
|
|
85
|
-
dag,
|
|
242
|
+
dag, request_name=request_names.AdminPolicyRequestName.JOBS_LAUNCH)
|
|
243
|
+
dag.resolve_and_validate_volumes()
|
|
86
244
|
if not dag.is_chain():
|
|
87
245
|
with ux_utils.print_exception_no_traceback():
|
|
88
246
|
raise ValueError('Only single-task or chain DAG is '
|
|
89
247
|
f'allowed for job_launch. Dag: {dag}')
|
|
90
248
|
dag.validate()
|
|
249
|
+
# TODO(aylei): use consolidated job controller instead of performing
|
|
250
|
+
# pre-mount operations when submitting jobs.
|
|
251
|
+
dag.pre_mount_volumes()
|
|
252
|
+
|
|
253
|
+
# If there is a local postgres db, when the api server tries launching on
|
|
254
|
+
# the remote jobs controller it will fail. therefore, we should remove this
|
|
255
|
+
# before sending the config to the jobs controller.
|
|
256
|
+
# TODO(luca) there are a lot of potential problems with postgres being sent
|
|
257
|
+
# to the jobs controller. for example if the postgres is whitelisted to
|
|
258
|
+
# only the API server, this will then break. the simple solution to that is
|
|
259
|
+
# telling the user to add the jobs controller to the postgres whitelist.
|
|
260
|
+
if not managed_job_utils.is_consolidation_mode():
|
|
261
|
+
db_path = mutated_user_config.get('db', None)
|
|
262
|
+
if db_path is not None:
|
|
263
|
+
parsed = urlparse.urlparse(db_path)
|
|
264
|
+
if ((parsed.hostname == 'localhost' or
|
|
265
|
+
ipaddress.ip_address(parsed.hostname).is_loopback)):
|
|
266
|
+
mutated_user_config.pop('db', None)
|
|
267
|
+
|
|
268
|
+
user_dag_str_user_specified = dag_utils.dump_chain_dag_to_yaml_str(
|
|
269
|
+
dag, use_user_specified_yaml=True)
|
|
270
|
+
|
|
91
271
|
dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
|
|
92
272
|
|
|
93
273
|
task_names = set()
|
|
274
|
+
priority = None
|
|
94
275
|
for task_ in dag.tasks:
|
|
95
276
|
if task_.name in task_names:
|
|
96
277
|
with ux_utils.print_exception_no_traceback():
|
|
@@ -101,6 +282,42 @@ def launch(
|
|
|
101
282
|
'will be auto-generated) .')
|
|
102
283
|
task_names.add(task_.name)
|
|
103
284
|
|
|
285
|
+
# Check for priority in resources
|
|
286
|
+
task_priority = None
|
|
287
|
+
if task_.resources:
|
|
288
|
+
# Convert set to list to access elements by index
|
|
289
|
+
resources_list = list(task_.resources)
|
|
290
|
+
# Take first resource's priority as reference
|
|
291
|
+
task_priority = resources_list[0].priority
|
|
292
|
+
|
|
293
|
+
# Check all other resources have same priority
|
|
294
|
+
for resource in resources_list[1:]:
|
|
295
|
+
if resource.priority != task_priority:
|
|
296
|
+
with ux_utils.print_exception_no_traceback():
|
|
297
|
+
raise ValueError(
|
|
298
|
+
f'Task {task_.name!r}: All resources must have the '
|
|
299
|
+
'same priority. Found priority '
|
|
300
|
+
f'{resource.priority} but expected {task_priority}.'
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
if task_priority is not None:
|
|
304
|
+
if (priority is not None and priority != task_priority):
|
|
305
|
+
with ux_utils.print_exception_no_traceback():
|
|
306
|
+
raise ValueError(
|
|
307
|
+
'Multiple tasks in the DAG have different priorities. '
|
|
308
|
+
'Either specify a priority in only one task, or set '
|
|
309
|
+
'the same priority for each task.')
|
|
310
|
+
priority = task_priority
|
|
311
|
+
|
|
312
|
+
if priority is None:
|
|
313
|
+
priority = skylet_constants.DEFAULT_PRIORITY
|
|
314
|
+
|
|
315
|
+
if (priority < skylet_constants.MIN_PRIORITY or
|
|
316
|
+
priority > skylet_constants.MAX_PRIORITY):
|
|
317
|
+
raise ValueError(
|
|
318
|
+
f'Priority must be between {skylet_constants.MIN_PRIORITY}'
|
|
319
|
+
f' and {skylet_constants.MAX_PRIORITY}, got {priority}')
|
|
320
|
+
|
|
104
321
|
dag_utils.fill_default_config_in_dag_for_job_launch(dag)
|
|
105
322
|
|
|
106
323
|
with rich_utils.safe_status(
|
|
@@ -109,15 +326,13 @@ def launch(
|
|
|
109
326
|
# Check whether cached jobs controller cluster is accessible
|
|
110
327
|
cluster_name = (
|
|
111
328
|
controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
|
|
112
|
-
|
|
113
|
-
if record is not None:
|
|
329
|
+
if global_user_state.cluster_with_name_exists(cluster_name):
|
|
114
330
|
# there is a cached jobs controller cluster
|
|
115
331
|
try:
|
|
116
332
|
# TODO: do something with returned status?
|
|
117
333
|
_, _ = backend_utils.refresh_cluster_status_handle(
|
|
118
334
|
cluster_name=cluster_name,
|
|
119
|
-
force_refresh_statuses=set(status_lib.ClusterStatus)
|
|
120
|
-
acquire_per_cluster_status_lock=False)
|
|
335
|
+
force_refresh_statuses=set(status_lib.ClusterStatus))
|
|
121
336
|
except (exceptions.ClusterOwnerIdentityMismatchError,
|
|
122
337
|
exceptions.CloudUserIdentityError,
|
|
123
338
|
exceptions.ClusterStatusFetchingError) as e:
|
|
@@ -131,100 +346,216 @@ def launch(
|
|
|
131
346
|
f'with:\n\n`sky down {cluster_name} --purge`\n\n'
|
|
132
347
|
f'Reason: {common_utils.format_exception(e)}')
|
|
133
348
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
349
|
+
local_to_controller_file_mounts = _upload_files_to_controller(dag)
|
|
350
|
+
controller = controller_utils.Controllers.JOBS_CONTROLLER
|
|
351
|
+
controller_name = controller.value.cluster_name
|
|
352
|
+
prefix = managed_job_constants.JOBS_TASK_YAML_PREFIX
|
|
353
|
+
controller_resources = controller_utils.get_controller_resources(
|
|
354
|
+
controller=controller,
|
|
355
|
+
task_resources=sum([list(t.resources) for t in dag.tasks], []))
|
|
356
|
+
|
|
357
|
+
num_jobs = num_jobs if num_jobs is not None else 1
|
|
358
|
+
# We do this assignment after applying the admin policy, so that we don't
|
|
359
|
+
# need to serialize the pool name in the dag. The dag object will be
|
|
360
|
+
# preserved. See sky/admin_policy.py::MutatedUserRequest::decode.
|
|
361
|
+
dag.pool = pool
|
|
362
|
+
consolidation_mode_job_ids = _maybe_submit_job_locally(
|
|
363
|
+
prefix, dag, num_jobs)
|
|
364
|
+
|
|
365
|
+
# This is only needed for non-consolidation mode. For consolidation
|
|
366
|
+
# mode, the controller uses the same catalog as API server.
|
|
367
|
+
modified_catalogs = {} if consolidation_mode_job_ids is not None else (
|
|
368
|
+
service_catalog_common.get_modified_catalog_file_mounts())
|
|
369
|
+
|
|
370
|
+
def _submit_one(
|
|
371
|
+
consolidation_mode_job_id: Optional[int] = None,
|
|
372
|
+
job_rank: Optional[int] = None,
|
|
373
|
+
num_jobs: Optional[int] = None,
|
|
374
|
+
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
375
|
+
rank_suffix = '' if job_rank is None else f'-{job_rank}'
|
|
376
|
+
remote_original_user_yaml_path = (
|
|
377
|
+
f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.original_user_yaml')
|
|
378
|
+
remote_user_yaml_path = (
|
|
379
|
+
f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.yaml')
|
|
380
|
+
remote_user_config_path = (
|
|
381
|
+
f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.config_yaml')
|
|
382
|
+
remote_env_file_path = (
|
|
383
|
+
f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.env')
|
|
384
|
+
with tempfile.NamedTemporaryFile(
|
|
385
|
+
prefix=f'managed-dag-{dag.name}{rank_suffix}-',
|
|
386
|
+
mode='w',
|
|
387
|
+
) as f, tempfile.NamedTemporaryFile(
|
|
388
|
+
prefix=f'managed-user-dag-{dag.name}{rank_suffix}-',
|
|
389
|
+
mode='w',
|
|
390
|
+
) as original_user_yaml_path:
|
|
391
|
+
original_user_yaml_path.write(user_dag_str_user_specified)
|
|
392
|
+
original_user_yaml_path.flush()
|
|
393
|
+
# Copy tasks to avoid race conditions when multiple threads modify
|
|
394
|
+
# the same dag object concurrently. Each thread needs its own copy.
|
|
395
|
+
dag_copy = copy.deepcopy(dag)
|
|
396
|
+
for task_ in dag_copy.tasks:
|
|
397
|
+
if job_rank is not None:
|
|
398
|
+
task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
|
|
399
|
+
task_.update_envs({'SKYPILOT_NUM_JOBS': str(num_jobs)})
|
|
400
|
+
|
|
401
|
+
dag_utils.dump_chain_dag_to_yaml(dag_copy, f.name)
|
|
402
|
+
|
|
403
|
+
vars_to_fill = {
|
|
404
|
+
'remote_original_user_yaml_path':
|
|
405
|
+
(remote_original_user_yaml_path),
|
|
406
|
+
'original_user_dag_path': original_user_yaml_path.name,
|
|
407
|
+
'remote_user_yaml_path': remote_user_yaml_path,
|
|
408
|
+
'user_yaml_path': f.name,
|
|
409
|
+
'local_to_controller_file_mounts':
|
|
410
|
+
(local_to_controller_file_mounts),
|
|
411
|
+
'jobs_controller': controller_name,
|
|
412
|
+
# Note: actual cluster name will be <task.name>-<managed job ID>
|
|
413
|
+
'dag_name': dag.name,
|
|
414
|
+
'remote_user_config_path': remote_user_config_path,
|
|
415
|
+
'remote_env_file_path': remote_env_file_path,
|
|
416
|
+
'modified_catalogs': modified_catalogs,
|
|
417
|
+
'priority': priority,
|
|
418
|
+
'consolidation_mode_job_id': consolidation_mode_job_id,
|
|
419
|
+
'pool': pool,
|
|
420
|
+
'job_controller_indicator_file':
|
|
421
|
+
managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE,
|
|
422
|
+
**controller_utils.shared_controller_vars_to_fill(
|
|
423
|
+
controller,
|
|
424
|
+
remote_user_config_path=remote_user_config_path,
|
|
425
|
+
# TODO(aylei): the mutated config will not be updated
|
|
426
|
+
# afterwards without recreate the controller. Need to
|
|
427
|
+
# revisit this.
|
|
428
|
+
local_user_config=mutated_user_config,
|
|
429
|
+
),
|
|
430
|
+
}
|
|
140
431
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
432
|
+
yaml_path = os.path.join(
|
|
433
|
+
managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
|
|
434
|
+
f'{name}-{dag_uuid}-{consolidation_mode_job_id}-{job_rank}.yaml'
|
|
435
|
+
)
|
|
436
|
+
common_utils.fill_template(
|
|
437
|
+
managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
|
|
438
|
+
vars_to_fill,
|
|
439
|
+
output_path=yaml_path)
|
|
440
|
+
controller_task = task_lib.Task.from_yaml(yaml_path)
|
|
441
|
+
controller_task.set_resources(controller_resources)
|
|
442
|
+
|
|
443
|
+
controller_task.managed_job_dag = dag_copy
|
|
444
|
+
# pylint: disable=protected-access
|
|
445
|
+
controller_task._metadata = metadata
|
|
446
|
+
|
|
447
|
+
job_identity = ''
|
|
448
|
+
if job_rank is not None:
|
|
449
|
+
job_identity = f' (rank: {job_rank})'
|
|
450
|
+
job_controller_postfix = (' from jobs controller' if
|
|
451
|
+
consolidation_mode_job_id is None else '')
|
|
452
|
+
logger.info(
|
|
453
|
+
f'{colorama.Fore.YELLOW}'
|
|
454
|
+
f'Launching managed job {dag.name!r}{job_identity}'
|
|
455
|
+
f'{job_controller_postfix}...{colorama.Style.RESET_ALL}')
|
|
456
|
+
|
|
457
|
+
# Launch with the api server's user hash, so that sky status does
|
|
458
|
+
# not show the owner of the controller as whatever user launched
|
|
459
|
+
# it first.
|
|
460
|
+
with common.with_server_user():
|
|
461
|
+
# Always launch the controller in the default workspace.
|
|
462
|
+
with skypilot_config.local_active_workspace_ctx(
|
|
463
|
+
skylet_constants.SKYPILOT_DEFAULT_WORKSPACE):
|
|
464
|
+
# TODO(zhwu): the buckets need to be correctly handled for
|
|
465
|
+
# a specific workspace. For example, if a job is launched in
|
|
466
|
+
# workspace A, but the controller is in workspace B, the
|
|
467
|
+
# intermediate bucket and newly created bucket should be in
|
|
468
|
+
# workspace A.
|
|
469
|
+
if consolidation_mode_job_id is None:
|
|
470
|
+
return execution.launch(
|
|
471
|
+
task=controller_task,
|
|
472
|
+
cluster_name=controller_name,
|
|
473
|
+
stream_logs=stream_logs,
|
|
474
|
+
retry_until_up=True,
|
|
475
|
+
fast=True,
|
|
476
|
+
_request_name=request_names.AdminPolicyRequestName.
|
|
477
|
+
JOBS_LAUNCH_CONTROLLER,
|
|
478
|
+
_disable_controller_check=True)
|
|
479
|
+
# Manually launch the scheduler in consolidation mode.
|
|
480
|
+
local_handle = backend_utils.is_controller_accessible(
|
|
481
|
+
controller=controller, stopped_message='')
|
|
482
|
+
backend = backend_utils.get_backend_from_handle(
|
|
483
|
+
local_handle)
|
|
484
|
+
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
485
|
+
with sky_logging.silent():
|
|
486
|
+
backend.sync_file_mounts(
|
|
487
|
+
handle=local_handle,
|
|
488
|
+
all_file_mounts=controller_task.file_mounts,
|
|
489
|
+
storage_mounts=controller_task.storage_mounts)
|
|
490
|
+
run_script = controller_task.run
|
|
491
|
+
assert isinstance(run_script, str)
|
|
492
|
+
# Manually add the env variables to the run script.
|
|
493
|
+
# Originally this is done in ray jobs submission but now we
|
|
494
|
+
# have to do it manually because there is no ray runtime on
|
|
495
|
+
# the API server.
|
|
496
|
+
env_cmds = [
|
|
497
|
+
f'export {k}={v!r}'
|
|
498
|
+
for k, v in controller_task.envs.items()
|
|
499
|
+
]
|
|
500
|
+
run_script = '\n'.join(env_cmds + [run_script])
|
|
501
|
+
# Dump script for high availability recovery.
|
|
502
|
+
managed_job_state.set_ha_recovery_script(
|
|
503
|
+
consolidation_mode_job_id, run_script)
|
|
504
|
+
backend.run_on_head(local_handle, run_script)
|
|
505
|
+
ux_utils.starting_message(
|
|
506
|
+
f'Job submitted, ID: {consolidation_mode_job_id}')
|
|
507
|
+
return consolidation_mode_job_id, local_handle
|
|
508
|
+
|
|
509
|
+
if pool is None:
|
|
510
|
+
if consolidation_mode_job_ids is None:
|
|
511
|
+
return _submit_one()
|
|
512
|
+
assert len(consolidation_mode_job_ids) == 1
|
|
513
|
+
return _submit_one(consolidation_mode_job_ids[0])
|
|
514
|
+
|
|
515
|
+
ids: List[int] = []
|
|
516
|
+
all_handle: Optional[backends.ResourceHandle] = None
|
|
517
|
+
|
|
518
|
+
if num_jobs == 1:
|
|
519
|
+
job_id = (consolidation_mode_job_ids[0]
|
|
520
|
+
if consolidation_mode_job_ids is not None else None)
|
|
521
|
+
jid, handle = _submit_one(job_id, 0, num_jobs=num_jobs)
|
|
522
|
+
assert jid is not None, (job_id, handle)
|
|
523
|
+
ids.append(jid)
|
|
524
|
+
all_handle = handle
|
|
525
|
+
else:
|
|
526
|
+
# Submit jobs in parallel using ThreadPoolExecutor
|
|
527
|
+
with concurrent.futures.ThreadPoolExecutor(
|
|
528
|
+
max_workers=min(num_jobs,
|
|
529
|
+
os.cpu_count() or 1)) as executor:
|
|
530
|
+
# Submit jobs concurrently
|
|
531
|
+
future_to_rank = {}
|
|
532
|
+
for job_rank in range(num_jobs):
|
|
533
|
+
job_id = (consolidation_mode_job_ids[job_rank]
|
|
534
|
+
if consolidation_mode_job_ids is not None else None)
|
|
535
|
+
future = executor.submit(_submit_one, job_id, job_rank,
|
|
536
|
+
num_jobs)
|
|
537
|
+
future_to_rank[future] = job_rank
|
|
538
|
+
|
|
539
|
+
# Collect results in order of job_rank to maintain consistent order.
|
|
540
|
+
results: List[Optional[Tuple[
|
|
541
|
+
int, Optional[backends.ResourceHandle]]]] = [None] * num_jobs
|
|
542
|
+
for future in concurrent.futures.as_completed(future_to_rank):
|
|
543
|
+
job_rank = future_to_rank[future]
|
|
544
|
+
try:
|
|
545
|
+
jid, handle = future.result()
|
|
546
|
+
assert jid is not None, (job_id, handle)
|
|
547
|
+
results[job_rank] = (jid, handle)
|
|
548
|
+
all_handle = handle # Keep the last handle.
|
|
549
|
+
except Exception as e:
|
|
550
|
+
logger.error(f'Error launching job {job_rank}: {e}')
|
|
551
|
+
raise e
|
|
552
|
+
|
|
553
|
+
# Extract job IDs in order
|
|
554
|
+
for res in results:
|
|
555
|
+
if res is not None:
|
|
556
|
+
ids.append(res[0])
|
|
557
|
+
|
|
558
|
+
return ids, all_handle
|
|
228
559
|
|
|
229
560
|
|
|
230
561
|
def queue_from_kubernetes_pod(
|
|
@@ -275,7 +606,9 @@ def queue_from_kubernetes_pod(
|
|
|
275
606
|
managed_jobs_runner = provision_lib.get_command_runners(
|
|
276
607
|
'kubernetes', cluster_info)[0]
|
|
277
608
|
|
|
278
|
-
code = managed_job_utils.ManagedJobCodeGen.get_job_table(
|
|
609
|
+
code = managed_job_utils.ManagedJobCodeGen.get_job_table(
|
|
610
|
+
skip_finished=skip_finished,
|
|
611
|
+
fields=_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES)
|
|
279
612
|
returncode, job_table_payload, stderr = managed_jobs_runner.run(
|
|
280
613
|
code,
|
|
281
614
|
require_outputs=True,
|
|
@@ -291,7 +624,14 @@ def queue_from_kubernetes_pod(
|
|
|
291
624
|
except exceptions.CommandError as e:
|
|
292
625
|
raise RuntimeError(str(e)) from e
|
|
293
626
|
|
|
294
|
-
jobs = managed_job_utils.load_managed_job_queue(
|
|
627
|
+
jobs, _, result_type, _, _ = managed_job_utils.load_managed_job_queue(
|
|
628
|
+
job_table_payload)
|
|
629
|
+
|
|
630
|
+
if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
|
|
631
|
+
return jobs
|
|
632
|
+
|
|
633
|
+
# Backward compatibility for old jobs controller without filtering
|
|
634
|
+
# TODO(hailong): remove this after 0.12.0
|
|
295
635
|
if skip_finished:
|
|
296
636
|
# Filter out the finished jobs. If a multi-task job is partially
|
|
297
637
|
# finished, we will include all its tasks.
|
|
@@ -322,28 +662,22 @@ def _maybe_restart_controller(
|
|
|
322
662
|
if handle is not None:
|
|
323
663
|
return handle
|
|
324
664
|
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
665
|
+
logger.info(f'{colorama.Fore.YELLOW}'
|
|
666
|
+
f'Restarting {jobs_controller_type.value.name}...'
|
|
667
|
+
f'{colorama.Style.RESET_ALL}')
|
|
328
668
|
|
|
329
669
|
rich_utils.force_update_status(
|
|
330
670
|
ux_utils.spinner_message(f'{spinner_message} - restarting '
|
|
331
671
|
'controller'))
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
runner.run(
|
|
342
|
-
f'export '
|
|
343
|
-
f'{skylet_constants.USER_ID_ENV_VAR}={common.SERVER_ID!r}; '
|
|
344
|
-
f'{managed_job_constants.DASHBOARD_SETUP_CMD}',
|
|
345
|
-
stream_logs=True,
|
|
346
|
-
)
|
|
672
|
+
with skypilot_config.local_active_workspace_ctx(
|
|
673
|
+
skylet_constants.SKYPILOT_DEFAULT_WORKSPACE):
|
|
674
|
+
global_user_state.add_cluster_event(
|
|
675
|
+
jobs_controller_type.value.cluster_name,
|
|
676
|
+
status_lib.ClusterStatus.INIT, 'Jobs controller restarted.',
|
|
677
|
+
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
678
|
+
handle = core.start(
|
|
679
|
+
cluster_name=jobs_controller_type.value.cluster_name)
|
|
680
|
+
|
|
347
681
|
controller_status = status_lib.ClusterStatus.UP
|
|
348
682
|
rich_utils.force_update_status(ux_utils.spinner_message(spinner_message))
|
|
349
683
|
|
|
@@ -351,10 +685,13 @@ def _maybe_restart_controller(
|
|
|
351
685
|
return handle
|
|
352
686
|
|
|
353
687
|
|
|
688
|
+
# For backwards compatibility
|
|
689
|
+
# TODO(hailong): Remove before 0.12.0.
|
|
354
690
|
@usage_lib.entrypoint
|
|
355
691
|
def queue(refresh: bool,
|
|
356
692
|
skip_finished: bool = False,
|
|
357
|
-
all_users: bool = False
|
|
693
|
+
all_users: bool = False,
|
|
694
|
+
job_ids: Optional[List[int]] = None) -> List[Dict[str, Any]]:
|
|
358
695
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
359
696
|
"""Gets statuses of managed jobs.
|
|
360
697
|
|
|
@@ -368,13 +705,15 @@ def queue(refresh: bool,
|
|
|
368
705
|
'resources': str,
|
|
369
706
|
'submitted_at': (float) timestamp of submission,
|
|
370
707
|
'end_at': (float) timestamp of end,
|
|
371
|
-
'
|
|
708
|
+
'job_duration': (float) duration in seconds,
|
|
372
709
|
'recovery_count': (int) Number of retries,
|
|
373
710
|
'status': (sky.jobs.ManagedJobStatus) of the job,
|
|
374
711
|
'cluster_resources': (str) resources of the cluster,
|
|
375
712
|
'region': (str) region of the cluster,
|
|
376
713
|
'user_name': (Optional[str]) job creator's user name,
|
|
377
714
|
'user_hash': (str) job creator's user hash,
|
|
715
|
+
'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
|
|
716
|
+
'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
|
|
378
717
|
}
|
|
379
718
|
]
|
|
380
719
|
Raises:
|
|
@@ -382,51 +721,222 @@ def queue(refresh: bool,
             does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
-
-
-
-
-
+    jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids)
+
+    return jobs
+
+
+@usage_lib.entrypoint
+def queue_v2_api(
+    refresh: bool,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    job_ids: Optional[List[int]] = None,
+    user_match: Optional[str] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
+) -> Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]:
+    """Gets statuses of managed jobs and parse the
+    jobs to responses.ManagedJobRecord."""
+    jobs, total, status_counts, total_no_filter = queue_v2(
+        refresh, skip_finished, all_users, job_ids, user_match, workspace_match,
+        name_match, pool_match, page, limit, statuses, fields)
+    return [responses.ManagedJobRecord(**job) for job in jobs
+           ], total, status_counts, total_no_filter
+
+
+@metrics_lib.time_me
+def queue_v2(
+    refresh: bool,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    job_ids: Optional[List[int]] = None,
+    user_match: Optional[str] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
+) -> Tuple[List[Dict[str, Any]], int, Dict[str, int], int]:
+    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
+    """Gets statuses of managed jobs with filtering.
+
+    Please refer to sky.cli.job_queue for documentation.
+
+    Returns:
+        jobs: List[Dict[str, Any]]
+            [
+                {
+                    'job_id': int,
+                    'job_name': str,
+                    'resources': str,
+                    'submitted_at': (float) timestamp of submission,
+                    'end_at': (float) timestamp of end,
+                    'job_duration': (float) duration in seconds,
+                    'recovery_count': (int) Number of retries,
+                    'status': (sky.jobs.ManagedJobStatus) of the job,
+                    'cluster_resources': (str) resources of the cluster,
+                    'region': (str) region of the cluster,
+                    'user_name': (Optional[str]) job creator's user name,
+                    'user_hash': (str) job creator's user hash,
+                    'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
+                    'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
+                }
+            ]
+        total: int, total number of jobs after filter
+        status_counts: Dict[str, int], status counts after filter
+        total_no_filter: int, total number of jobs before filter
+    Raises:
+        sky.exceptions.ClusterNotUpError: the jobs controller is not up or
+            does not exist.
+        RuntimeError: if failed to get the managed jobs with ssh.
+    """
+    if limit is not None:
+        if limit < 1:
+            raise ValueError(f'Limit must be at least 1, got {limit}')
+        if page is None:
+            page = 1
+        if page < 1:
+            raise ValueError(f'Page must be at least 1, got {page}')
+    else:
+        if page is not None:
+            raise ValueError('Limit must be specified when page is specified')
+
+    with metrics_lib.time_it('jobs.queue.restart_controller', group='jobs'):
+        handle = _maybe_restart_controller(refresh,
+                                           stopped_message='No in-progress '
+                                           'managed jobs.',
+                                           spinner_message='Checking '
+                                           'managed jobs')
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)

-
-
-
-
-
-
-
+    user_hashes: Optional[List[Optional[str]]] = None
+    show_jobs_without_user_hash = False
+    if not all_users:
+        user_hashes = [common_utils.get_user_hash()]
+        # For backwards compatibility, we show jobs that do not have a
+        # user_hash. TODO(cooperc): Remove before 0.12.0.
+        user_hashes.append(None)
+        show_jobs_without_user_hash = True
+    elif user_match is not None:
+        users = global_user_state.get_user_by_name_match(user_match)
+        if not users:
+            return [], 0, {}, 0
+        user_hashes = [user.id for user in users]
+
+    accessible_workspaces = list(workspaces_core.get_workspaces().keys())
+
+    if handle.is_grpc_enabled_with_flag:
+        try:
+            request = managed_jobsv1_pb2.GetJobTableRequest(
+                skip_finished=skip_finished,
+                accessible_workspaces=(managed_jobsv1_pb2.Workspaces(
+                    workspaces=accessible_workspaces)),
+                job_ids=managed_jobsv1_pb2.JobIds(
+                    ids=job_ids) if job_ids is not None else None,
+                workspace_match=workspace_match,
+                name_match=name_match,
+                pool_match=pool_match,
+                page=page,
+                limit=limit,
+                # Remove None from user_hashes, as the gRPC server uses the
+                # show_jobs_without_user_hash flag instead.
+                user_hashes=managed_jobsv1_pb2.UserHashes(hashes=[
+                    user_hash for user_hash in user_hashes
+                    if user_hash is not None
+                ]) if user_hashes is not None else None,
+                statuses=managed_jobsv1_pb2.Statuses(
+                    statuses=statuses) if statuses is not None else None,
+                fields=managed_jobsv1_pb2.Fields(
+                    fields=fields) if fields is not None else None,
+                show_jobs_without_user_hash=show_jobs_without_user_hash,
+            )
+            response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel()).get_managed_job_table(request))
+            jobs = managed_job_utils.decode_managed_job_protos(response.jobs)
+            return jobs, response.total, dict(
+                response.status_counts), response.total_no_filter
+        except exceptions.SkyletMethodNotImplementedError:
+            pass
+
+    with metrics_lib.time_it('jobs.queue.generate_code', group='jobs'):
+        code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+            skip_finished, accessible_workspaces, job_ids, workspace_match,
+            name_match, pool_match, page, limit, user_hashes, statuses, fields)
+    with metrics_lib.time_it('jobs.queue.run_on_head', group='jobs'):
+        returncode, job_table_payload, stderr = backend.run_on_head(
+            handle,
+            code,
+            require_outputs=True,
+            stream_logs=False,
+            separate_stderr=True)

     if returncode != 0:
         logger.error(job_table_payload + stderr)
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
-                           f'{returncode}')
+                           f'{returncode}.\n{job_table_payload + stderr}')

-    jobs =
+    with metrics_lib.time_it('jobs.queue.load_job_queue', group='jobs'):
+        (jobs, total, result_type, total_no_filter, status_counts
+        ) = managed_job_utils.load_managed_job_queue(job_table_payload)

-    if
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs, total, status_counts, total_no_filter

-
-
-
-
-                # user_hash. TODO(cooperc): Remove before 0.12.0.
-                return True
-            return user_hash == common_utils.get_user_hash()
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
+    with metrics_lib.time_it('jobs.queue.filter_and_process', group='jobs'):
+        if not all_users:

-
+            def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
+                user_hash = job.get('user_hash', None)
+                if user_hash is None:
+                    # For backwards compatibility, we show jobs that do not have
+                    # a user_hash. TODO(cooperc): Remove before 0.12.0.
+                    return True
+                return user_hash == common_utils.get_user_hash()

-
-        # Filter out the finished jobs. If a multi-task job is partially
-        # finished, we will include all its tasks.
-        non_finished_tasks = list(
-            filter(lambda job: not job['status'].is_terminal(), jobs))
-        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
-        jobs = list(
-            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+            jobs = list(filter(user_hash_matches_or_missing, jobs))

-
+        jobs = list(
+            filter(
+                lambda job: job.get('workspace', skylet_constants.
+                                    SKYPILOT_DEFAULT_WORKSPACE) in
+                accessible_workspaces, jobs))
+
+        if skip_finished:
+            # Filter out the finished jobs. If a multi-task job is partially
+            # finished, we will include all its tasks.
+            non_finished_tasks = list(
+                filter(lambda job: not job['status'].is_terminal(), jobs))
+            non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+            jobs = list(
+                filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+
+        if job_ids:
+            jobs = [job for job in jobs if job['job_id'] in job_ids]
+
+        filtered_jobs, total, status_counts = managed_job_utils.filter_jobs(
+            jobs,
+            workspace_match,
+            name_match,
+            pool_match,
+            page=page,
+            limit=limit,
+            user_match=user_match,
+            enable_user_match=True,
+            statuses=statuses,
+        )
+        return filtered_jobs, total, status_counts, total_no_filter


 @usage_lib.entrypoint
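The new `queue_v2` entrypoint above layers filtering, pagination, and a gRPC fast path on top of the legacy `queue` wrapper. A minimal usage sketch follows; the import path `sky.jobs.server.core` is an assumption and the filter values are placeholders, neither is taken from this diff.

# Hypothetical usage sketch of the filtered/paginated queue API.
from sky.jobs.server import core as jobs_core  # assumed import path

# First page of up to 50 unfinished RUNNING jobs across all users whose
# user name matches 'alice' (placeholder filter values).
jobs, total, status_counts, total_no_filter = jobs_core.queue_v2(
    refresh=False,
    skip_finished=True,
    all_users=True,
    user_match='alice',
    statuses=['RUNNING'],
    page=1,
    limit=50,
)
print(f'Showing {len(jobs)} of {total} matching jobs '
      f'({total_no_filter} before filtering).')

Per the validation at the top of `queue_v2`, passing `limit` without `page` defaults the page to 1, while passing `page` without `limit` raises a ValueError.
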
@@ -434,7 +944,8 @@ def queue(refresh: bool,
 def cancel(name: Optional[str] = None,
            job_ids: Optional[List[int]] = None,
            all: bool = False,
-           all_users: bool = False
+           all_users: bool = False,
+           pool: Optional[str] = None) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Cancels managed jobs.

@@ -444,57 +955,98 @@ def cancel(name: Optional[str] = None,
         sky.exceptions.ClusterNotUpError: the jobs controller is not up.
         RuntimeError: failed to cancel the job.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Cancelling managed jobs')):
+        job_ids = [] if job_ids is None else job_ids
+        handle = backend_utils.is_controller_accessible(
+            controller=controller_utils.Controllers.JOBS_CONTROLLER,
+            stopped_message='All managed jobs should have finished.')
+
+        job_id_str = ','.join(map(str, job_ids))
+        if sum([
+                bool(job_ids), name is not None, pool is not None, all or
+                all_users
+        ]) != 1:
+            arguments = []
+            arguments += [f'job_ids={job_id_str}'] if job_ids else []
+            arguments += [f'name={name}'] if name is not None else []
+            arguments += [f'pool={pool}'] if pool is not None else []
+            arguments += ['all'] if all else []
+            arguments += ['all_users'] if all_users else []
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'Can only specify one of JOB_IDS, name, pool, or all/'
+                    f'all_users. Provided {" ".join(arguments)!r}.')

-
-    assert isinstance(backend, backends.CloudVmRayBackend)
-    if all_users:
-        code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
-            None, all_users=True)
-    elif all:
-        code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(None)
-    elif job_ids:
-        code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(job_ids)
-    else:
-        assert name is not None, (job_ids, name, all)
-        code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(name)
-    # The stderr is redirected to stdout
-    returncode, stdout, _ = backend.run_on_head(handle,
-                                                code,
-                                                require_outputs=True,
-                                                stream_logs=False)
-    try:
-        subprocess_utils.handle_returncode(returncode, code,
-                                           'Failed to cancel managed job',
-                                           stdout)
-    except exceptions.CommandError as e:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError(e.error_msg) from e
+        job_ids = None if (all_users or all) else job_ids

-
-
-
-
-
+        backend = backend_utils.get_backend_from_handle(handle)
+        assert isinstance(backend, backends.CloudVmRayBackend)
+
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if not use_legacy:
+            current_workspace = skypilot_config.get_active_workspace()
+            try:
+                request = managed_jobsv1_pb2.CancelJobsRequest(
+                    current_workspace=current_workspace)
+
+                if all_users or all or job_ids:
+                    request.all_users = all_users
+                    if all:
+                        request.user_hash = common_utils.get_user_hash()
+                    if job_ids is not None:
+                        request.job_ids.CopyFrom(
+                            managed_jobsv1_pb2.JobIds(ids=job_ids))
+                elif name is not None:
+                    request.job_name = name
+                else:
+                    assert pool is not None, (job_ids, name, pool, all)
+                    request.pool_name = pool
+
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: cloud_vm_ray_backend.SkyletClient(
+                        handle.get_grpc_channel()).cancel_managed_jobs(request))
+                stdout = response.message
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            if all_users or all or job_ids:
+                code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
+                    job_ids, all_users=all_users)
+            elif name is not None:
+                code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(
+                    name)
+            else:
+                assert pool is not None, (job_ids, name, pool, all)
+                code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_pool(
+                    pool)
+            # The stderr is redirected to stdout
+            returncode, stdout, stderr = backend.run_on_head(
+                handle, code, require_outputs=True, stream_logs=False)
+            try:
+                subprocess_utils.handle_returncode(
+                    returncode, code, 'Failed to cancel managed job',
+                    stdout + stderr)
+            except exceptions.CommandError as e:
+                with ux_utils.print_exception_no_traceback():
+                    raise RuntimeError(e.error_msg) from e
+
+        logger.info(stdout)
+        if 'Multiple jobs found with name' in stdout:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(
+                    'Please specify the job ID instead of the job name.')


 @usage_lib.entrypoint
-def tail_logs(name: Optional[str],
-
+def tail_logs(name: Optional[str],
+              job_id: Optional[int],
+              follow: bool,
+              controller: bool,
+              refresh: bool,
+              tail: Optional[int] = None) -> int:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Tail logs of managed jobs.

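The reworked `cancel` above accepts exactly one selector (job IDs, a job name, a pool, or all/all_users) and prefers the controller's gRPC `CancelJobsRequest` path, falling back to the legacy code-gen path when the Skylet method is not implemented. A hedged sketch of the resulting call sites; the job IDs, job name, and pool name are placeholders and the import path is assumed:

# Hypothetical usage sketch; exactly one selector per call.
from sky.jobs.server import core as jobs_core  # assumed import path

jobs_core.cancel(job_ids=[3, 4])       # cancel two specific managed jobs
jobs_core.cancel(name='train-bert')    # cancel by job name (placeholder)
jobs_core.cancel(pool='my-pool')       # cancel jobs submitted to a pool (placeholder)
jobs_core.cancel(all_users=True)       # cancel every user's jobs

Combining selectors (for example `job_ids` together with `pool`) trips the `sum([...]) != 1` check and raises a ValueError listing the arguments that were provided.
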
@@ -537,56 +1089,8 @@ def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
         job_id=job_id,
         job_name=name,
         follow=follow,
-        controller=controller
-
-
-def start_dashboard_forwarding(refresh: bool = False) -> Tuple[int, int]:
-    """Opens a dashboard for managed jobs (needs controller to be UP)."""
-    # TODO(SKY-1212): ideally, the controller/dashboard server should expose the
-    # API perhaps via REST. Then here we would (1) not have to use SSH to try to
-    # see if the controller is UP first, which is slow; (2) not have to run SSH
-    # port forwarding first (we'd just launch a local dashboard which would make
-    # REST API calls to the controller dashboard server).
-    logger.info('Starting dashboard')
-    hint = ('Dashboard is not available if jobs controller is not up. Run '
-            'a managed job first or run: sky jobs queue --refresh')
-    handle = _maybe_restart_controller(
-        refresh=refresh,
-        stopped_message=hint,
-        spinner_message='Checking jobs controller')
-
-    # SSH forward a free local port to remote's dashboard port.
-    remote_port = skylet_constants.SPOT_DASHBOARD_REMOTE_PORT
-    free_port = common_utils.find_free_port(remote_port)
-    runner = handle.get_command_runners()[0]
-    port_forward_command = ' '.join(
-        runner.port_forward_command(port_forward=[(free_port, remote_port)],
-                                    connect_timeout=1))
-    port_forward_command = (
-        f'{port_forward_command} '
-        f'> ~/sky_logs/api_server/dashboard-{common_utils.get_user_hash()}.log '
-        '2>&1')
-    logger.info(f'Forwarding port: {colorama.Style.DIM}{port_forward_command}'
-                f'{colorama.Style.RESET_ALL}')
-
-    ssh_process = subprocess.Popen(port_forward_command,
-                                   shell=True,
-                                   start_new_session=True)
-    time.sleep(3)  # Added delay for ssh_command to initialize.
-    logger.info(f'{colorama.Fore.GREEN}Dashboard is now available at: '
-                f'http://127.0.0.1:{free_port}{colorama.Style.RESET_ALL}')
-
-    return free_port, ssh_process.pid
-
-
-def stop_dashboard_forwarding(pid: int) -> None:
-    # Exit the ssh command when the context manager is closed.
-    try:
-        os.killpg(os.getpgid(pid), signal.SIGTERM)
-    except ProcessLookupError:
-        # This happens if jobs controller is auto-stopped.
-        pass
-    logger.info('Forwarding port closed. Exiting.')
+        controller=controller,
+        tail=tail)


 @usage_lib.entrypoint
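`tail_logs` above gains an optional `tail` line count that is forwarded to the backend, while the old SSH dashboard-forwarding helpers are removed in the same hunk. A small sketch of the new parameter; the job ID is a placeholder and the import path is assumed:

# Hypothetical usage sketch of the new `tail` parameter.
from sky.jobs.server import core as jobs_core  # assumed import path

exit_code = jobs_core.tail_logs(name=None,
                                job_id=42,        # placeholder job ID
                                follow=False,
                                controller=False,
                                refresh=False,
                                tail=200)         # only the last 200 lines
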
@@ -635,3 +1139,73 @@ def download_logs(
         job_name=name,
         controller=controller,
         local_dir=local_dir)
+
+
+@usage_lib.entrypoint
+def pool_apply(
+    task: 'sky.Task',
+    pool_name: str,
+    mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
+    workers: Optional[int] = None,
+) -> None:
+    """Apply a config to a pool."""
+    return impl.apply(task, workers, pool_name, mode, pool=True)
+
+
+@usage_lib.entrypoint
+# pylint: disable=redefined-builtin
+def pool_down(
+    pool_names: Optional[Union[str, List[str]]] = None,
+    all: bool = False,
+    purge: bool = False,
+) -> None:
+    """Delete a pool."""
+    return impl.down(pool_names, all, purge, pool=True)
+
+
+@usage_lib.entrypoint
+def pool_status(
+    pool_names: Optional[Union[str,
+                               List[str]]] = None,) -> List[Dict[str, Any]]:
+    """Query a pool."""
+    return impl.status(pool_names, pool=True)
+
+
+ServiceComponentOrStr = Union[str, serve_utils.ServiceComponent]
+
+
+@usage_lib.entrypoint
+def pool_tail_logs(
+    pool_name: str,
+    *,
+    target: ServiceComponentOrStr,
+    worker_id: Optional[int] = None,
+    follow: bool = True,
+    tail: Optional[int] = None,
+) -> None:
+    """Tail logs of a pool."""
+    return impl.tail_logs(pool_name,
+                          target=target,
+                          replica_id=worker_id,
+                          follow=follow,
+                          tail=tail,
+                          pool=True)
+
+
+@usage_lib.entrypoint
+def pool_sync_down_logs(
+    pool_name: str,
+    *,
+    local_dir: str,
+    targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
+                   None] = None,
+    worker_ids: Optional[List[int]] = None,
+    tail: Optional[int] = None,
+) -> str:
+    """Sync down logs of a pool."""
+    return impl.sync_down_logs(pool_name,
+                               local_dir=local_dir,
+                               targets=targets,
+                               replica_ids=worker_ids,
+                               tail=tail,
+                               pool=True)
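The pool entrypoints added above are thin wrappers that delegate to the shared `impl` helpers with `pool=True`. A sketch of how they might be called; 'my-pool' is a placeholder pool name and the import path is assumed:

# Hypothetical usage sketch for the new pool entrypoints.
from sky.jobs.server import core as jobs_core  # assumed import path

records = jobs_core.pool_status(['my-pool'])   # placeholder pool name
for record in records:
    print(record)

jobs_core.pool_down(['my-pool'], purge=False)  # tear the pool down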