skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/server/server.py
CHANGED
|
@@ -1,20 +1,20 @@
|
|
|
1
1
|
"""REST API for managed jobs."""
|
|
2
|
-
|
|
2
|
+
|
|
3
|
+
import pathlib
|
|
3
4
|
|
|
4
5
|
import fastapi
|
|
5
|
-
import httpx
|
|
6
6
|
|
|
7
7
|
from sky import sky_logging
|
|
8
|
+
from sky.jobs import utils as managed_jobs_utils
|
|
8
9
|
from sky.jobs.server import core
|
|
9
|
-
from sky.jobs.server import dashboard_utils
|
|
10
10
|
from sky.server import common as server_common
|
|
11
11
|
from sky.server import stream_utils
|
|
12
12
|
from sky.server.requests import executor
|
|
13
13
|
from sky.server.requests import payloads
|
|
14
|
+
from sky.server.requests import request_names
|
|
14
15
|
from sky.server.requests import requests as api_requests
|
|
15
16
|
from sky.skylet import constants
|
|
16
17
|
from sky.utils import common
|
|
17
|
-
from sky.utils import common_utils
|
|
18
18
|
|
|
19
19
|
logger = sky_logging.init_logger(__name__)
|
|
20
20
|
|
|
@@ -24,22 +24,36 @@ router = fastapi.APIRouter()
|
|
|
24
24
|
@router.post('/launch')
|
|
25
25
|
async def launch(request: fastapi.Request,
|
|
26
26
|
jobs_launch_body: payloads.JobsLaunchBody) -> None:
|
|
27
|
-
|
|
27
|
+
# In consolidation mode, the jobs controller will use sky.launch on the same
|
|
28
|
+
# API server to launch the underlying job cluster. If you start run many
|
|
29
|
+
# jobs.launch requests, some may be blocked for a long time by sky.launch
|
|
30
|
+
# requests triggered by earlier jobs, which leads to confusing behavior as
|
|
31
|
+
# the jobs.launch requests trickle though. Also, since we don't have to
|
|
32
|
+
# actually launch a jobs controller sky cluster, the jobs.launch request is
|
|
33
|
+
# much quicker in consolidation mode. So we avoid the issue by just using
|
|
34
|
+
# the short executor instead - then jobs.launch will not be blocked by
|
|
35
|
+
# sky.launch.
|
|
36
|
+
consolidation_mode = managed_jobs_utils.is_consolidation_mode()
|
|
37
|
+
schedule_type = (api_requests.ScheduleType.SHORT
|
|
38
|
+
if consolidation_mode else api_requests.ScheduleType.LONG)
|
|
39
|
+
await executor.schedule_request_async(
|
|
28
40
|
request_id=request.state.request_id,
|
|
29
|
-
request_name=
|
|
41
|
+
request_name=request_names.RequestName.JOBS_LAUNCH,
|
|
30
42
|
request_body=jobs_launch_body,
|
|
31
43
|
func=core.launch,
|
|
32
|
-
schedule_type=
|
|
44
|
+
schedule_type=schedule_type,
|
|
33
45
|
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
|
34
46
|
)
|
|
35
47
|
|
|
36
48
|
|
|
49
|
+
# For backwards compatibility
|
|
50
|
+
# TODO(hailong): Remove before 0.12.0.
|
|
37
51
|
@router.post('/queue')
|
|
38
52
|
async def queue(request: fastapi.Request,
|
|
39
53
|
jobs_queue_body: payloads.JobsQueueBody) -> None:
|
|
40
|
-
executor.
|
|
54
|
+
await executor.schedule_request_async(
|
|
41
55
|
request_id=request.state.request_id,
|
|
42
|
-
request_name=
|
|
56
|
+
request_name=request_names.RequestName.JOBS_QUEUE,
|
|
43
57
|
request_body=jobs_queue_body,
|
|
44
58
|
func=core.queue,
|
|
45
59
|
schedule_type=(api_requests.ScheduleType.LONG if jobs_queue_body.refresh
|
|
@@ -48,12 +62,27 @@ async def queue(request: fastapi.Request,
|
|
|
48
62
|
)
|
|
49
63
|
|
|
50
64
|
|
|
65
|
+
@router.post('/queue/v2')
|
|
66
|
+
async def queue_v2(request: fastapi.Request,
|
|
67
|
+
jobs_queue_body_v2: payloads.JobsQueueV2Body) -> None:
|
|
68
|
+
await executor.schedule_request_async(
|
|
69
|
+
request_id=request.state.request_id,
|
|
70
|
+
request_name=request_names.RequestName.JOBS_QUEUE_V2,
|
|
71
|
+
request_body=jobs_queue_body_v2,
|
|
72
|
+
func=core.queue_v2_api,
|
|
73
|
+
schedule_type=(api_requests.ScheduleType.LONG
|
|
74
|
+
if jobs_queue_body_v2.refresh else
|
|
75
|
+
api_requests.ScheduleType.SHORT),
|
|
76
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
|
|
51
80
|
@router.post('/cancel')
|
|
52
81
|
async def cancel(request: fastapi.Request,
|
|
53
82
|
jobs_cancel_body: payloads.JobsCancelBody) -> None:
|
|
54
|
-
executor.
|
|
83
|
+
await executor.schedule_request_async(
|
|
55
84
|
request_id=request.state.request_id,
|
|
56
|
-
request_name=
|
|
85
|
+
request_name=request_names.RequestName.JOBS_CANCEL,
|
|
57
86
|
request_body=jobs_cancel_body,
|
|
58
87
|
func=core.cancel,
|
|
59
88
|
schedule_type=api_requests.ScheduleType.SHORT,
|
|
@@ -66,27 +95,39 @@ async def logs(
|
|
|
66
95
|
request: fastapi.Request, jobs_logs_body: payloads.JobsLogsBody,
|
|
67
96
|
background_tasks: fastapi.BackgroundTasks
|
|
68
97
|
) -> fastapi.responses.StreamingResponse:
|
|
69
|
-
|
|
98
|
+
schedule_type = api_requests.ScheduleType.SHORT
|
|
99
|
+
if jobs_logs_body.refresh:
|
|
100
|
+
# When refresh is specified, the job controller might be restarted,
|
|
101
|
+
# which takes longer time to finish. We schedule it to long executor.
|
|
102
|
+
schedule_type = api_requests.ScheduleType.LONG
|
|
103
|
+
if schedule_type == api_requests.ScheduleType.SHORT:
|
|
104
|
+
executor.check_request_thread_executor_available()
|
|
105
|
+
request_task = await executor.prepare_request_async(
|
|
70
106
|
request_id=request.state.request_id,
|
|
71
|
-
request_name=
|
|
107
|
+
request_name=request_names.RequestName.JOBS_LOGS,
|
|
72
108
|
request_body=jobs_logs_body,
|
|
73
109
|
func=core.tail_logs,
|
|
74
|
-
|
|
75
|
-
# should be responsive. However, it can be long running if the user's
|
|
76
|
-
# job keeps running, and we should avoid it taking the SHORT worker
|
|
77
|
-
# indefinitely.
|
|
78
|
-
# When refresh is True we schedule it as LONG because a controller
|
|
79
|
-
# restart might be needed.
|
|
80
|
-
schedule_type=api_requests.ScheduleType.LONG
|
|
81
|
-
if jobs_logs_body.refresh else api_requests.ScheduleType.SHORT,
|
|
110
|
+
schedule_type=schedule_type,
|
|
82
111
|
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
|
83
112
|
)
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
113
|
+
kill_request_on_disconnect = False
|
|
114
|
+
if schedule_type == api_requests.ScheduleType.SHORT:
|
|
115
|
+
# For short request, run in the coroutine to avoid blocking
|
|
116
|
+
# short workers.
|
|
117
|
+
task = executor.execute_request_in_coroutine(request_task)
|
|
118
|
+
# Cancel the coroutine after the request is done or client disconnects
|
|
119
|
+
background_tasks.add_task(task.cancel)
|
|
120
|
+
else:
|
|
121
|
+
executor.schedule_prepared_request(request_task)
|
|
122
|
+
# When runs in long executor process, we should kill the request on
|
|
123
|
+
# disconnect to cancel the running routine.
|
|
124
|
+
kill_request_on_disconnect = True
|
|
125
|
+
|
|
126
|
+
return stream_utils.stream_response_for_long_request(
|
|
87
127
|
request_id=request_task.request_id,
|
|
88
128
|
logs_path=request_task.log_path,
|
|
89
129
|
background_tasks=background_tasks,
|
|
130
|
+
kill_request_on_disconnect=kill_request_on_disconnect,
|
|
90
131
|
)
|
|
91
132
|
|
|
92
133
|
|
|
@@ -101,9 +142,9 @@ async def download_logs(
|
|
|
101
142
|
# We should reuse the original request body, so that the env vars, such as
|
|
102
143
|
# user hash, are kept the same.
|
|
103
144
|
jobs_download_logs_body.local_dir = str(logs_dir_on_api_server)
|
|
104
|
-
executor.
|
|
145
|
+
await executor.schedule_request_async(
|
|
105
146
|
request_id=request.state.request_id,
|
|
106
|
-
request_name=
|
|
147
|
+
request_name=request_names.RequestName.JOBS_DOWNLOAD_LOGS,
|
|
107
148
|
request_body=jobs_download_logs_body,
|
|
108
149
|
func=core.download_logs,
|
|
109
150
|
schedule_type=api_requests.ScheduleType.LONG
|
|
@@ -112,92 +153,92 @@ async def download_logs(
|
|
|
112
153
|
)
|
|
113
154
|
|
|
114
155
|
|
|
115
|
-
@router.
|
|
116
|
-
async def
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
156
|
+
@router.post('/pool_apply')
async def pool_apply(request: fastapi.Request,
                     jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
    """Schedule a pool apply request.

    Hands ``core.pool_apply`` to the request executor as a LONG request,
    tagged with the jobs controller cluster name.

    Args:
        request: Incoming request; its ``state.request_id`` identifies the
            scheduled request.
        jobs_pool_apply_body: Payload forwarded unchanged as the request body.
    """
    request_id = request.state.request_id
    await executor.schedule_request_async(
        request_id=request_id,
        request_name=request_names.RequestName.JOBS_POOL_APPLY,
        request_body=jobs_pool_apply_body,
        func=core.pool_apply,
        schedule_type=api_requests.ScheduleType.LONG,
        request_cluster_name=common.JOB_CONTROLLER_NAME,
    )
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
@router.post('/pool_down')
async def pool_down(request: fastapi.Request,
                    jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
    """Schedule a pool down request.

    Hands ``core.pool_down`` to the request executor as a SHORT request,
    tagged with the jobs controller cluster name.

    Args:
        request: Incoming request; its ``state.request_id`` identifies the
            scheduled request.
        jobs_pool_down_body: Payload forwarded unchanged as the request body.
    """
    request_id = request.state.request_id
    await executor.schedule_request_async(
        request_id=request_id,
        request_name=request_names.RequestName.JOBS_POOL_DOWN,
        request_body=jobs_pool_down_body,
        func=core.pool_down,
        schedule_type=api_requests.ScheduleType.SHORT,
        request_cluster_name=common.JOB_CONTROLLER_NAME,
    )
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
@router.post('/pool_status')
async def pool_status(
        request: fastapi.Request,
        jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
    """Schedule a pool status request.

    Hands ``core.pool_status`` to the request executor as a SHORT request,
    tagged with the jobs controller cluster name.

    Args:
        request: Incoming request; its ``state.request_id`` identifies the
            scheduled request.
        jobs_pool_status_body: Payload forwarded unchanged as the request
            body.
    """
    request_id = request.state.request_id
    await executor.schedule_request_async(
        request_id=request_id,
        request_name=request_names.RequestName.JOBS_POOL_STATUS,
        request_body=jobs_pool_status_body,
        func=core.pool_status,
        schedule_type=api_requests.ScheduleType.SHORT,
        request_cluster_name=common.JOB_CONTROLLER_NAME,
    )
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
@router.post('/pool_logs')
async def pool_tail_logs(
    request: fastapi.Request, log_body: payloads.JobsPoolLogsBody,
    background_tasks: fastapi.BackgroundTasks
) -> fastapi.responses.StreamingResponse:
    """Schedule a pool log-tailing request and stream its log file back.

    ``core.pool_tail_logs`` is scheduled as a SHORT request; the scheduled
    request is then looked up again and its log path is streamed to the
    client. The request is killed when the client disconnects.

    Args:
        request: Incoming request; its ``state.request_id`` identifies the
            scheduled request.
        log_body: Payload forwarded unchanged as the request body.
        background_tasks: FastAPI background task collection passed on to
            the streaming helper.

    Returns:
        The streaming response produced by
        ``stream_utils.stream_response_for_long_request``.
    """
    request_id = request.state.request_id
    await executor.schedule_request_async(
        request_id=request_id,
        request_name=request_names.RequestName.JOBS_POOL_LOGS,
        request_body=log_body,
        func=core.pool_tail_logs,
        schedule_type=api_requests.ScheduleType.SHORT,
        request_cluster_name=common.JOB_CONTROLLER_NAME,
    )

    # Only the request_id column is fetched: req.log_path is derived from
    # request_id, so that single field is enough for the streaming call.
    scheduled = await api_requests.get_request_async(request_id,
                                                     fields=['request_id'])

    return stream_utils.stream_response_for_long_request(
        request_id=scheduled.request_id,
        logs_path=scheduled.log_path,
        background_tasks=background_tasks,
        kill_request_on_disconnect=True,
    )
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
@router.post('/pool_sync-down-logs')
async def pool_download_logs(
    request: fastapi.Request,
    download_logs_body: payloads.JobsPoolDownloadLogsBody,
) -> None:
    """Schedule a download of pool logs into a per-user server directory.

    A fresh ``pool/<pool_name>_<timestamp>`` directory is created under the
    user's log directory prefix on the API server, written into the body's
    ``local_dir``, and ``core.pool_sync_down_logs`` is scheduled as a SHORT
    request.

    Args:
        request: Incoming request; its ``state.request_id`` identifies the
            scheduled request.
        download_logs_body: Payload for the download; its ``local_dir`` is
            overwritten with the server-side directory before scheduling.
    """
    user_hash = download_logs_body.env_vars[constants.USER_ID_ENV_VAR]
    timestamp = sky_logging.get_run_timestamp()

    user_logs_root = pathlib.Path(
        server_common.api_server_user_logs_dir_prefix(user_hash))
    run_dir_name = f'{download_logs_body.pool_name}_{timestamp}'
    logs_dir_on_api_server = user_logs_root / 'pool' / run_dir_name
    logs_dir_on_api_server.mkdir(parents=True, exist_ok=True)

    # The original request body is mutated and reused (rather than rebuilt)
    # so that its env vars, such as the user hash, stay the same.
    download_logs_body.local_dir = str(logs_dir_on_api_server)

    await executor.schedule_request_async(
        request_id=request.state.request_id,
        request_name=request_names.RequestName.JOBS_POOL_SYNC_DOWN_LOGS,
        request_body=download_logs_body,
        func=core.pool_sync_down_logs,
        schedule_type=api_requests.ScheduleType.SHORT,
        request_cluster_name=common.JOB_CONTROLLER_NAME,
    )
|
sky/jobs/server/utils.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Utility functions for managed jobs."""
|
|
2
|
+
import typing
|
|
3
|
+
|
|
4
|
+
from sky import backends
|
|
5
|
+
from sky import exceptions
|
|
6
|
+
from sky import sky_logging
|
|
7
|
+
from sky.adaptors import common as adaptors_common
|
|
8
|
+
from sky.backends import backend_utils
|
|
9
|
+
from sky.backends import cloud_vm_ray_backend
|
|
10
|
+
from sky.jobs import utils as managed_job_utils
|
|
11
|
+
from sky.skylet import constants as skylet_constants
|
|
12
|
+
from sky.utils import controller_utils
|
|
13
|
+
|
|
14
|
+
logger = sky_logging.init_logger(__name__)

# Type checkers get the real generated protobuf module; at runtime it is
# wrapped in a LazyImport instead of being imported here.
if typing.TYPE_CHECKING:
    from sky.schemas.generated import managed_jobsv1_pb2
else:
    managed_jobsv1_pb2 = adaptors_common.LazyImport(
        'sky.schemas.generated.managed_jobsv1_pb2')

# Job-table fields requested from the controller by the version check below.
_MANAGED_JOB_FIELDS_TO_GET = [
    'job_id',
    'task_id',
    'workspace',
    'job_name',
    'task_name',
    'resources',
    'submitted_at',
    'end_at',
    'job_duration',
    'recovery_count',
    'status',
    'pool',
]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def check_version_mismatch_and_non_terminal_jobs() -> None:
    """Reject an incompatible jobs controller that still has unfinished jobs.

    The controller's SKYLET_VERSION and its unfinished managed jobs are
    fetched over gRPC when the controller handle enables it, otherwise (or
    when the controller does not implement the gRPC methods) by running
    generated code on the controller head. The result is then compared with
    the local SKYLET_VERSION.

    Raises:
        ValueError: If the versions are incompatible while non-terminal jobs
            exist, or if a legacy query on the controller exits non-zero or
            prints output that does not start with the controller version.
        sky.exceptions.ClusterNotUpError: If the controller is not accessible.
    """
    local_version = skylet_constants.SKYLET_VERSION

    # The controller handle is obtained the same way in both normal and
    # consolidation mode.
    handle = backend_utils.is_controller_accessible(
        controller=controller_utils.Controllers.JOBS_CONTROLLER,
        stopped_message='Jobs controller is not running.')
    backend = backend_utils.get_backend_from_handle(handle)
    assert isinstance(backend, backends.CloudVmRayBackend)

    def _new_skylet_client():
        # Built inside each retried call so every attempt asks the handle
        # for its gRPC channel again.
        return cloud_vm_ray_backend.SkyletClient(handle.get_grpc_channel())

    fall_back_to_legacy = not handle.is_grpc_enabled_with_flag

    if not fall_back_to_legacy:
        try:
            version_request = managed_jobsv1_pb2.GetVersionRequest()
            version_response = backend_utils.invoke_skylet_with_retries(
                lambda: _new_skylet_client().
                get_managed_job_controller_version(version_request))
            controller_version = version_response.controller_version

            job_table_request = managed_jobsv1_pb2.GetJobTableRequest(
                skip_finished=True,
                fields=managed_jobsv1_pb2.Fields(
                    fields=_MANAGED_JOB_FIELDS_TO_GET),
            )
            job_table_response = backend_utils.invoke_skylet_with_retries(
                lambda: _new_skylet_client().get_managed_job_table(
                    job_table_request))
            jobs = managed_job_utils.decode_managed_job_protos(
                job_table_response.jobs)
        except exceptions.SkyletMethodNotImplementedError:
            # The controller lacks these gRPC methods; use generated code.
            fall_back_to_legacy = True

    if fall_back_to_legacy:
        # Step 1: ask the controller for its version.
        version_code = managed_job_utils.ManagedJobCodeGen.get_version()
        returncode, output, stderr = backend.run_on_head(
            handle,
            version_code,
            require_outputs=True,
            stream_logs=False,
            separate_stderr=True)
        if returncode != 0:
            logger.error(output + stderr)
            raise ValueError('Failed to check controller version with '
                             f'returncode: {returncode}.\n{output + stderr}')

        # Only the first line of the stripped output carries the version.
        first_line = output.strip().split('\n', 1)[0]
        if not first_line.startswith('controller_version:'):
            raise ValueError(
                f'Expected controller version in first line, got: {output}')
        _, _, controller_version = first_line.partition(':')

        # Step 2: fetch the table of unfinished jobs.
        table_code = managed_job_utils.ManagedJobCodeGen.get_job_table(
            skip_finished=True, fields=_MANAGED_JOB_FIELDS_TO_GET)
        returncode, job_table_payload, stderr = backend.run_on_head(
            handle,
            table_code,
            require_outputs=True,
            stream_logs=False,
            separate_stderr=True)
        if returncode != 0:
            logger.error(job_table_payload + stderr)
            raise ValueError('Failed to fetch managed jobs with returncode: '
                             f'{returncode}.\n{job_table_payload + stderr}')

        jobs, _, _, _, _ = (
            managed_job_utils.load_managed_job_queue(job_table_payload))

    # A controller is accepted when its version equals the local one, or
    # when its version is numerically greater than 17.
    version_matches = (controller_version == local_version or
                       int(controller_version) > 17)
    non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]

    if version_matches or not non_terminal_jobs:
        return

    # Render the blocking jobs with the same formatter used by queue().
    formatted_job_table = managed_job_utils.format_job_table(
        non_terminal_jobs,
        pool_status=None,
        show_all=False,
        show_user=False)

    raise ValueError(
        f'Controller SKYLET_VERSION ({controller_version}) does not match '
        f'current version ({local_version}), and there are non-terminal '
        'jobs on the controller. Please wait for all jobs to complete or '
        'cancel them before launching new jobs with the updated version.'
        f'\n\nCurrent non-terminal jobs:\n{formatted_job_table}')
|