skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/__init__.py
CHANGED
|
@@ -5,6 +5,11 @@ from sky.jobs.client.sdk import cancel
|
|
|
5
5
|
from sky.jobs.client.sdk import dashboard
|
|
6
6
|
from sky.jobs.client.sdk import download_logs
|
|
7
7
|
from sky.jobs.client.sdk import launch
|
|
8
|
+
from sky.jobs.client.sdk import pool_apply
|
|
9
|
+
from sky.jobs.client.sdk import pool_down
|
|
10
|
+
from sky.jobs.client.sdk import pool_status
|
|
11
|
+
from sky.jobs.client.sdk import pool_sync_down_logs
|
|
12
|
+
from sky.jobs.client.sdk import pool_tail_logs
|
|
8
13
|
from sky.jobs.client.sdk import queue
|
|
9
14
|
from sky.jobs.client.sdk import tail_logs
|
|
10
15
|
from sky.jobs.constants import JOBS_CLUSTER_NAME_PREFIX_LENGTH
|
sky/jobs/client/sdk.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
"""SDK functions for managed jobs."""
|
|
2
2
|
import json
|
|
3
3
|
import typing
|
|
4
|
-
from typing import Dict, List, Optional, Union
|
|
5
|
-
import webbrowser
|
|
4
|
+
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
|
|
6
5
|
|
|
7
6
|
import click
|
|
8
7
|
|
|
@@ -10,34 +9,47 @@ from sky import sky_logging
|
|
|
10
9
|
from sky.adaptors import common as adaptors_common
|
|
11
10
|
from sky.client import common as client_common
|
|
12
11
|
from sky.client import sdk
|
|
12
|
+
from sky.schemas.api import responses
|
|
13
|
+
from sky.serve.client import impl
|
|
13
14
|
from sky.server import common as server_common
|
|
15
|
+
from sky.server import rest
|
|
16
|
+
from sky.server import versions
|
|
14
17
|
from sky.server.requests import payloads
|
|
18
|
+
from sky.server.requests import request_names
|
|
15
19
|
from sky.skylet import constants
|
|
16
20
|
from sky.usage import usage_lib
|
|
21
|
+
from sky.utils import admin_policy_utils
|
|
17
22
|
from sky.utils import common_utils
|
|
23
|
+
from sky.utils import context
|
|
18
24
|
from sky.utils import dag_utils
|
|
19
25
|
|
|
20
26
|
if typing.TYPE_CHECKING:
|
|
21
27
|
import io
|
|
22
|
-
|
|
23
|
-
import requests
|
|
28
|
+
import webbrowser
|
|
24
29
|
|
|
25
30
|
import sky
|
|
31
|
+
from sky import backends
|
|
32
|
+
from sky.serve import serve_utils
|
|
26
33
|
else:
|
|
27
|
-
|
|
34
|
+
# only used in dashboard()
|
|
35
|
+
webbrowser = adaptors_common.LazyImport('webbrowser')
|
|
28
36
|
|
|
29
37
|
logger = sky_logging.init_logger(__name__)
|
|
30
38
|
|
|
31
39
|
|
|
40
|
+
@context.contextual
|
|
32
41
|
@usage_lib.entrypoint
|
|
33
42
|
@server_common.check_server_healthy_or_start
|
|
34
43
|
def launch(
|
|
35
44
|
task: Union['sky.Task', 'sky.Dag'],
|
|
36
45
|
name: Optional[str] = None,
|
|
46
|
+
pool: Optional[str] = None,
|
|
47
|
+
num_jobs: Optional[int] = None,
|
|
37
48
|
# Internal only:
|
|
38
49
|
# pylint: disable=invalid-name
|
|
39
50
|
_need_confirmation: bool = False,
|
|
40
|
-
) -> server_common.RequestId
|
|
51
|
+
) -> server_common.RequestId[Tuple[Optional[int],
|
|
52
|
+
Optional['backends.ResourceHandle']]]:
|
|
41
53
|
"""Launches a managed job.
|
|
42
54
|
|
|
43
55
|
Please refer to sky.cli.job_launch for documentation.
|
|
@@ -62,36 +74,70 @@ def launch(
|
|
|
62
74
|
chain dag.
|
|
63
75
|
sky.exceptions.NotSupportedError: the feature is not supported.
|
|
64
76
|
"""
|
|
77
|
+
remote_api_version = versions.get_remote_api_version()
|
|
78
|
+
if (pool is not None and
|
|
79
|
+
(remote_api_version is None or remote_api_version < 12)):
|
|
80
|
+
raise click.UsageError('Pools are not supported in your API server. '
|
|
81
|
+
'Please upgrade to a newer API server to use '
|
|
82
|
+
'pools.')
|
|
83
|
+
if pool is None and num_jobs is not None:
|
|
84
|
+
raise click.UsageError('Cannot specify num_jobs without pool.')
|
|
65
85
|
|
|
66
86
|
dag = dag_utils.convert_entrypoint_to_dag(task)
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
if
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
87
|
+
with admin_policy_utils.apply_and_use_config_in_current_request(
|
|
88
|
+
dag,
|
|
89
|
+
request_name=request_names.AdminPolicyRequestName.JOBS_LAUNCH,
|
|
90
|
+
at_client_side=True) as dag:
|
|
91
|
+
sdk.validate(dag)
|
|
92
|
+
if _need_confirmation:
|
|
93
|
+
job_identity = 'a managed job'
|
|
94
|
+
if pool is None:
|
|
95
|
+
optimize_request_id = sdk.optimize(dag)
|
|
96
|
+
sdk.stream_and_get(optimize_request_id)
|
|
97
|
+
else:
|
|
98
|
+
pool_status_request_id = pool_status(pool)
|
|
99
|
+
pool_statuses = sdk.get(pool_status_request_id)
|
|
100
|
+
if not pool_statuses:
|
|
101
|
+
raise click.UsageError(f'Pool {pool!r} not found.')
|
|
102
|
+
resources = pool_statuses[0]['requested_resources_str']
|
|
103
|
+
click.secho(f'Use resources from pool {pool!r}: {resources}.',
|
|
104
|
+
fg='green')
|
|
105
|
+
if num_jobs is not None:
|
|
106
|
+
job_identity = f'{num_jobs} managed jobs'
|
|
107
|
+
prompt = f'Launching {job_identity} {dag.name!r}. Proceed?'
|
|
108
|
+
if prompt is not None:
|
|
109
|
+
click.confirm(prompt,
|
|
110
|
+
default=True,
|
|
111
|
+
abort=True,
|
|
112
|
+
show_default=True)
|
|
113
|
+
|
|
114
|
+
dag = client_common.upload_mounts_to_api_server(dag)
|
|
115
|
+
dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
|
|
116
|
+
body = payloads.JobsLaunchBody(
|
|
117
|
+
task=dag_str,
|
|
118
|
+
name=name,
|
|
119
|
+
pool=pool,
|
|
120
|
+
num_jobs=num_jobs,
|
|
121
|
+
)
|
|
122
|
+
response = server_common.make_authenticated_request(
|
|
123
|
+
'POST',
|
|
124
|
+
'/jobs/launch',
|
|
125
|
+
json=json.loads(body.model_dump_json()),
|
|
126
|
+
timeout=(5, None))
|
|
127
|
+
return server_common.get_request_id(response)
|
|
88
128
|
|
|
89
129
|
|
|
90
130
|
@usage_lib.entrypoint
|
|
91
131
|
@server_common.check_server_healthy_or_start
|
|
92
|
-
def queue(
|
|
93
|
-
|
|
94
|
-
|
|
132
|
+
def queue(
|
|
133
|
+
refresh: bool,
|
|
134
|
+
skip_finished: bool = False,
|
|
135
|
+
all_users: bool = False,
|
|
136
|
+
job_ids: Optional[List[int]] = None,
|
|
137
|
+
limit: Optional[int] = None,
|
|
138
|
+
fields: Optional[List[str]] = None,
|
|
139
|
+
) -> server_common.RequestId[Union[List[responses.ManagedJobRecord], Tuple[
|
|
140
|
+
List[responses.ManagedJobRecord], int, Dict[str, int], int]]]:
|
|
95
141
|
"""Gets statuses of managed jobs.
|
|
96
142
|
|
|
97
143
|
Please refer to sky.cli.job_queue for documentation.
|
|
@@ -100,12 +146,15 @@ def queue(refresh: bool,
|
|
|
100
146
|
refresh: Whether to restart the jobs controller if it is stopped.
|
|
101
147
|
skip_finished: Whether to skip finished jobs.
|
|
102
148
|
all_users: Whether to show all users' jobs.
|
|
149
|
+
job_ids: IDs of the managed jobs to show.
|
|
150
|
+
limit: Number of jobs to show.
|
|
151
|
+
fields: Fields to get for the managed jobs.
|
|
103
152
|
|
|
104
153
|
Returns:
|
|
105
154
|
The request ID of the queue request.
|
|
106
155
|
|
|
107
156
|
Request Returns:
|
|
108
|
-
job_records (List[
|
|
157
|
+
job_records (List[responses.ManagedJobRecord]): A list of dicts, with each dict
|
|
109
158
|
containing the information of a job.
|
|
110
159
|
|
|
111
160
|
.. code-block:: python
|
|
@@ -117,11 +166,13 @@ def queue(refresh: bool,
|
|
|
117
166
|
'resources': (str) resources of the job,
|
|
118
167
|
'submitted_at': (float) timestamp of submission,
|
|
119
168
|
'end_at': (float) timestamp of end,
|
|
120
|
-
'
|
|
169
|
+
'job_duration': (float) duration in seconds,
|
|
121
170
|
'recovery_count': (int) Number of retries,
|
|
122
171
|
'status': (sky.jobs.ManagedJobStatus) of the job,
|
|
123
172
|
'cluster_resources': (str) resources of the cluster,
|
|
124
173
|
'region': (str) region of the cluster,
|
|
174
|
+
'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
|
|
175
|
+
'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
|
|
125
176
|
}
|
|
126
177
|
]
|
|
127
178
|
|
|
@@ -130,17 +181,31 @@ def queue(refresh: bool,
|
|
|
130
181
|
does not exist.
|
|
131
182
|
RuntimeError: if failed to get the managed jobs with ssh.
|
|
132
183
|
"""
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
184
|
+
remote_api_version = versions.get_remote_api_version()
|
|
185
|
+
if remote_api_version and remote_api_version >= 18:
|
|
186
|
+
body = payloads.JobsQueueV2Body(
|
|
187
|
+
refresh=refresh,
|
|
188
|
+
skip_finished=skip_finished,
|
|
189
|
+
all_users=all_users,
|
|
190
|
+
job_ids=job_ids,
|
|
191
|
+
limit=limit,
|
|
192
|
+
fields=fields,
|
|
193
|
+
)
|
|
194
|
+
path = '/jobs/queue/v2'
|
|
195
|
+
else:
|
|
196
|
+
body = payloads.JobsQueueBody(
|
|
197
|
+
refresh=refresh,
|
|
198
|
+
skip_finished=skip_finished,
|
|
199
|
+
all_users=all_users,
|
|
200
|
+
job_ids=job_ids,
|
|
201
|
+
)
|
|
202
|
+
path = '/jobs/queue'
|
|
203
|
+
|
|
204
|
+
response = server_common.make_authenticated_request(
|
|
205
|
+
'POST',
|
|
206
|
+
path,
|
|
140
207
|
json=json.loads(body.model_dump_json()),
|
|
141
|
-
timeout=(5, None)
|
|
142
|
-
cookies=server_common.get_api_cookie_jar(),
|
|
143
|
-
)
|
|
208
|
+
timeout=(5, None))
|
|
144
209
|
return server_common.get_request_id(response=response)
|
|
145
210
|
|
|
146
211
|
|
|
@@ -148,10 +213,11 @@ def queue(refresh: bool,
|
|
|
148
213
|
@server_common.check_server_healthy_or_start
|
|
149
214
|
def cancel(
|
|
150
215
|
name: Optional[str] = None,
|
|
151
|
-
job_ids: Optional[
|
|
216
|
+
job_ids: Optional[Sequence[int]] = None,
|
|
152
217
|
all: bool = False, # pylint: disable=redefined-builtin
|
|
153
218
|
all_users: bool = False,
|
|
154
|
-
|
|
219
|
+
pool: Optional[str] = None,
|
|
220
|
+
) -> server_common.RequestId[None]:
|
|
155
221
|
"""Cancels managed jobs.
|
|
156
222
|
|
|
157
223
|
Please refer to sky.cli.job_cancel for documentation.
|
|
@@ -161,6 +227,7 @@ def cancel(
|
|
|
161
227
|
job_ids: IDs of the managed jobs to cancel.
|
|
162
228
|
all: Whether to cancel all managed jobs.
|
|
163
229
|
all_users: Whether to cancel all managed jobs from all users.
|
|
230
|
+
pool: Pool name to cancel.
|
|
164
231
|
|
|
165
232
|
Returns:
|
|
166
233
|
The request ID of the cancel request.
|
|
@@ -169,29 +236,37 @@ def cancel(
|
|
|
169
236
|
sky.exceptions.ClusterNotUpError: the jobs controller is not up.
|
|
170
237
|
RuntimeError: failed to cancel the job.
|
|
171
238
|
"""
|
|
239
|
+
remote_api_version = versions.get_remote_api_version()
|
|
240
|
+
if (pool is not None and
|
|
241
|
+
(remote_api_version is None or remote_api_version < 12)):
|
|
242
|
+
raise click.UsageError('Pools are not supported in your API server. '
|
|
243
|
+
'Please upgrade to a newer API server to use '
|
|
244
|
+
'pools.')
|
|
172
245
|
body = payloads.JobsCancelBody(
|
|
173
246
|
name=name,
|
|
174
247
|
job_ids=job_ids,
|
|
175
248
|
all=all,
|
|
176
249
|
all_users=all_users,
|
|
250
|
+
pool=pool,
|
|
177
251
|
)
|
|
178
|
-
response =
|
|
179
|
-
|
|
252
|
+
response = server_common.make_authenticated_request(
|
|
253
|
+
'POST',
|
|
254
|
+
'/jobs/cancel',
|
|
180
255
|
json=json.loads(body.model_dump_json()),
|
|
181
|
-
timeout=(5, None)
|
|
182
|
-
cookies=server_common.get_api_cookie_jar(),
|
|
183
|
-
)
|
|
256
|
+
timeout=(5, None))
|
|
184
257
|
return server_common.get_request_id(response=response)
|
|
185
258
|
|
|
186
259
|
|
|
187
260
|
@usage_lib.entrypoint
|
|
188
261
|
@server_common.check_server_healthy_or_start
|
|
262
|
+
@rest.retry_transient_errors()
|
|
189
263
|
def tail_logs(name: Optional[str] = None,
|
|
190
264
|
job_id: Optional[int] = None,
|
|
191
265
|
follow: bool = True,
|
|
192
266
|
controller: bool = False,
|
|
193
267
|
refresh: bool = False,
|
|
194
|
-
|
|
268
|
+
tail: Optional[int] = None,
|
|
269
|
+
output_stream: Optional['io.TextIOBase'] = None) -> Optional[int]:
|
|
195
270
|
"""Tails logs of managed jobs.
|
|
196
271
|
|
|
197
272
|
You can provide either a job name or a job ID to tail logs. If both are not
|
|
@@ -203,6 +278,7 @@ def tail_logs(name: Optional[str] = None,
|
|
|
203
278
|
follow: Whether to follow the logs.
|
|
204
279
|
controller: Whether to tail logs from the jobs controller.
|
|
205
280
|
refresh: Whether to restart the jobs controller if it is stopped.
|
|
281
|
+
tail: Number of lines to tail from the end of the log file.
|
|
206
282
|
output_stream: The stream to write the logs to. If None, print to the
|
|
207
283
|
console.
|
|
208
284
|
|
|
@@ -210,6 +286,8 @@ def tail_logs(name: Optional[str] = None,
|
|
|
210
286
|
Exit code based on success or failure of the job. 0 if success,
|
|
211
287
|
100 if the job failed. See exceptions.JobExitCode for possible exit
|
|
212
288
|
codes.
|
|
289
|
+
Will return None if follow is False
|
|
290
|
+
(see note in sky/client/sdk.py::stream_response)
|
|
213
291
|
|
|
214
292
|
Request Raises:
|
|
215
293
|
ValueError: invalid arguments.
|
|
@@ -221,16 +299,23 @@ def tail_logs(name: Optional[str] = None,
|
|
|
221
299
|
follow=follow,
|
|
222
300
|
controller=controller,
|
|
223
301
|
refresh=refresh,
|
|
302
|
+
tail=tail,
|
|
224
303
|
)
|
|
225
|
-
response =
|
|
226
|
-
|
|
304
|
+
response = server_common.make_authenticated_request(
|
|
305
|
+
'POST',
|
|
306
|
+
'/jobs/logs',
|
|
227
307
|
json=json.loads(body.model_dump_json()),
|
|
228
308
|
stream=True,
|
|
229
|
-
timeout=(5, None)
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
309
|
+
timeout=(5, None))
|
|
310
|
+
request_id: server_common.RequestId[int] = server_common.get_request_id(
|
|
311
|
+
response)
|
|
312
|
+
# Log request is idempotent when tail is 0, thus can resume previous
|
|
313
|
+
# streaming point on retry.
|
|
314
|
+
return sdk.stream_response(request_id=request_id,
|
|
315
|
+
response=response,
|
|
316
|
+
output_stream=output_stream,
|
|
317
|
+
resumable=(tail == 0),
|
|
318
|
+
get_result=follow)
|
|
234
319
|
|
|
235
320
|
|
|
236
321
|
@usage_lib.entrypoint
|
|
@@ -267,18 +352,18 @@ def download_logs(
|
|
|
267
352
|
controller=controller,
|
|
268
353
|
local_dir=local_dir,
|
|
269
354
|
)
|
|
270
|
-
response =
|
|
271
|
-
|
|
355
|
+
response = server_common.make_authenticated_request(
|
|
356
|
+
'POST',
|
|
357
|
+
'/jobs/download_logs',
|
|
272
358
|
json=json.loads(body.model_dump_json()),
|
|
273
|
-
timeout=(5, None)
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
job_id_remote_path_dict = sdk.stream_and_get(
|
|
277
|
-
server_common.get_request_id(response))
|
|
359
|
+
timeout=(5, None))
|
|
360
|
+
request_id: server_common.RequestId[Dict[
|
|
361
|
+
str, str]] = server_common.get_request_id(response)
|
|
362
|
+
job_id_remote_path_dict = sdk.stream_and_get(request_id)
|
|
278
363
|
remote2local_path_dict = client_common.download_logs_from_api_server(
|
|
279
364
|
job_id_remote_path_dict.values())
|
|
280
365
|
return {
|
|
281
|
-
job_id: remote2local_path_dict[remote_path]
|
|
366
|
+
int(job_id): remote2local_path_dict[remote_path]
|
|
282
367
|
for job_id, remote_path in job_id_remote_path_dict.items()
|
|
283
368
|
}
|
|
284
369
|
|
|
@@ -314,3 +399,95 @@ def dashboard() -> None:
|
|
|
314
399
|
url = f'{api_server_url}/jobs/dashboard?{params}'
|
|
315
400
|
logger.info(f'Opening dashboard in browser: {url}')
|
|
316
401
|
webbrowser.open(url)
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
@context.contextual
|
|
405
|
+
@usage_lib.entrypoint
|
|
406
|
+
@server_common.check_server_healthy_or_start
|
|
407
|
+
@versions.minimal_api_version(12)
|
|
408
|
+
def pool_apply(
|
|
409
|
+
task: Optional[Union['sky.Task', 'sky.Dag']],
|
|
410
|
+
pool_name: str,
|
|
411
|
+
mode: 'serve_utils.UpdateMode',
|
|
412
|
+
workers: Optional[int] = None,
|
|
413
|
+
# Internal only:
|
|
414
|
+
# pylint: disable=invalid-name
|
|
415
|
+
_need_confirmation: bool = False
|
|
416
|
+
) -> server_common.RequestId[None]:
|
|
417
|
+
"""Apply a config to a pool."""
|
|
418
|
+
remote_api_version = versions.get_remote_api_version()
|
|
419
|
+
if (workers is not None and
|
|
420
|
+
(remote_api_version is None or remote_api_version < 19)):
|
|
421
|
+
raise click.UsageError('Updating the number of workers in a pool is '
|
|
422
|
+
'not supported in your API server. Please '
|
|
423
|
+
'upgrade to a newer API server to use this '
|
|
424
|
+
'feature.')
|
|
425
|
+
return impl.apply(task,
|
|
426
|
+
workers,
|
|
427
|
+
pool_name,
|
|
428
|
+
mode,
|
|
429
|
+
pool=True,
|
|
430
|
+
_need_confirmation=_need_confirmation)
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
@usage_lib.entrypoint
|
|
434
|
+
@server_common.check_server_healthy_or_start
|
|
435
|
+
@versions.minimal_api_version(12)
|
|
436
|
+
def pool_down(
|
|
437
|
+
pool_names: Optional[Union[str, List[str]]],
|
|
438
|
+
all: bool = False, # pylint: disable=redefined-builtin
|
|
439
|
+
purge: bool = False,
|
|
440
|
+
) -> server_common.RequestId[None]:
|
|
441
|
+
"""Delete a pool."""
|
|
442
|
+
return impl.down(pool_names, all, purge, pool=True)
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
@usage_lib.entrypoint
|
|
446
|
+
@server_common.check_server_healthy_or_start
|
|
447
|
+
@versions.minimal_api_version(12)
|
|
448
|
+
def pool_status(
|
|
449
|
+
pool_names: Optional[Union[str, List[str]]],
|
|
450
|
+
) -> server_common.RequestId[List[Dict[str, Any]]]:
|
|
451
|
+
"""Query a pool."""
|
|
452
|
+
return impl.status(pool_names, pool=True)
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
@usage_lib.entrypoint
|
|
456
|
+
@server_common.check_server_healthy_or_start
|
|
457
|
+
@rest.retry_transient_errors()
|
|
458
|
+
@versions.minimal_api_version(16)
|
|
459
|
+
def pool_tail_logs(pool_name: str,
|
|
460
|
+
target: Union[str, 'serve_utils.ServiceComponent'],
|
|
461
|
+
worker_id: Optional[int] = None,
|
|
462
|
+
follow: bool = True,
|
|
463
|
+
output_stream: Optional['io.TextIOBase'] = None,
|
|
464
|
+
tail: Optional[int] = None) -> None:
|
|
465
|
+
"""Tails logs of a pool."""
|
|
466
|
+
return impl.tail_logs(pool_name,
|
|
467
|
+
target,
|
|
468
|
+
worker_id,
|
|
469
|
+
follow,
|
|
470
|
+
output_stream,
|
|
471
|
+
tail,
|
|
472
|
+
pool=True)
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
@usage_lib.entrypoint
|
|
476
|
+
@server_common.check_server_healthy_or_start
|
|
477
|
+
@rest.retry_transient_errors()
|
|
478
|
+
@versions.minimal_api_version(16)
|
|
479
|
+
def pool_sync_down_logs(pool_name: str,
|
|
480
|
+
local_dir: str,
|
|
481
|
+
*,
|
|
482
|
+
targets: Optional[Union[
|
|
483
|
+
str, 'serve_utils.ServiceComponent', Sequence[Union[
|
|
484
|
+
str, 'serve_utils.ServiceComponent']]]] = None,
|
|
485
|
+
worker_ids: Optional[List[int]] = None,
|
|
486
|
+
tail: Optional[int] = None) -> None:
|
|
487
|
+
"""Sync down logs of a pool."""
|
|
488
|
+
return impl.sync_down_logs(pool_name,
|
|
489
|
+
local_dir,
|
|
490
|
+
targets=targets,
|
|
491
|
+
replica_ids=worker_ids,
|
|
492
|
+
tail=tail,
|
|
493
|
+
pool=True)
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Async SDK functions for managed jobs."""
|
|
2
|
+
import typing
|
|
3
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
4
|
+
|
|
5
|
+
from sky import backends
|
|
6
|
+
from sky import sky_logging
|
|
7
|
+
from sky.adaptors import common as adaptors_common
|
|
8
|
+
from sky.client import sdk_async
|
|
9
|
+
from sky.jobs.client import sdk
|
|
10
|
+
from sky.schemas.api import responses
|
|
11
|
+
from sky.skylet import constants
|
|
12
|
+
from sky.usage import usage_lib
|
|
13
|
+
from sky.utils import common_utils
|
|
14
|
+
from sky.utils import context_utils
|
|
15
|
+
|
|
16
|
+
if typing.TYPE_CHECKING:
|
|
17
|
+
import io
|
|
18
|
+
|
|
19
|
+
import requests
|
|
20
|
+
|
|
21
|
+
import sky
|
|
22
|
+
else:
|
|
23
|
+
requests = adaptors_common.LazyImport('requests')
|
|
24
|
+
|
|
25
|
+
logger = sky_logging.init_logger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@usage_lib.entrypoint
|
|
29
|
+
async def launch(
|
|
30
|
+
task: Union['sky.Task', 'sky.Dag'],
|
|
31
|
+
name: Optional[str] = None,
|
|
32
|
+
pool: Optional[str] = None,
|
|
33
|
+
num_jobs: Optional[int] = None,
|
|
34
|
+
# Internal only:
|
|
35
|
+
# pylint: disable=invalid-name
|
|
36
|
+
_need_confirmation: bool = False,
|
|
37
|
+
stream_logs: Optional[
|
|
38
|
+
sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG,
|
|
39
|
+
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
40
|
+
"""Async version of launch() that launches a managed job."""
|
|
41
|
+
request_id = await context_utils.to_thread(sdk.launch, task, name, pool,
|
|
42
|
+
num_jobs, _need_confirmation)
|
|
43
|
+
if stream_logs is not None:
|
|
44
|
+
return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
|
|
45
|
+
else:
|
|
46
|
+
return await sdk_async.get(request_id)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@usage_lib.entrypoint
|
|
50
|
+
async def queue(
|
|
51
|
+
refresh: bool,
|
|
52
|
+
skip_finished: bool = False,
|
|
53
|
+
all_users: bool = False,
|
|
54
|
+
job_ids: Optional[List[int]] = None,
|
|
55
|
+
limit: Optional[int] = None,
|
|
56
|
+
fields: Optional[List[str]] = None,
|
|
57
|
+
stream_logs: Optional[
|
|
58
|
+
sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
|
|
59
|
+
) -> Union[List[responses.ManagedJobRecord], Tuple[
|
|
60
|
+
List[responses.ManagedJobRecord], int, Dict[str, int], int]]:
|
|
61
|
+
"""Async version of queue() that gets statuses of managed jobs."""
|
|
62
|
+
request_id = await context_utils.to_thread(sdk.queue, refresh,
|
|
63
|
+
skip_finished, all_users,
|
|
64
|
+
job_ids, limit, fields)
|
|
65
|
+
if stream_logs is not None:
|
|
66
|
+
return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
|
|
67
|
+
else:
|
|
68
|
+
return await sdk_async.get(request_id)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@usage_lib.entrypoint
|
|
72
|
+
async def cancel(
|
|
73
|
+
name: Optional[str] = None,
|
|
74
|
+
job_ids: Optional[List[int]] = None,
|
|
75
|
+
all: bool = False, # pylint: disable=redefined-builtin
|
|
76
|
+
all_users: bool = False,
|
|
77
|
+
stream_logs: Optional[
|
|
78
|
+
sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG,
|
|
79
|
+
) -> None:
|
|
80
|
+
"""Async version of cancel() that cancels managed jobs."""
|
|
81
|
+
request_id = await context_utils.to_thread(sdk.cancel, name, job_ids, all,
|
|
82
|
+
all_users)
|
|
83
|
+
if stream_logs is not None:
|
|
84
|
+
return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
|
|
85
|
+
else:
|
|
86
|
+
return await sdk_async.get(request_id)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@usage_lib.entrypoint
|
|
90
|
+
async def tail_logs(cluster_name: str,
|
|
91
|
+
job_id: Optional[int],
|
|
92
|
+
follow: bool,
|
|
93
|
+
tail: int = 0,
|
|
94
|
+
output_stream: Optional['io.TextIOBase'] = None) -> int:
|
|
95
|
+
"""Async version of tail_logs() that tails the logs of a job."""
|
|
96
|
+
return await context_utils.to_thread(
|
|
97
|
+
sdk.tail_logs,
|
|
98
|
+
cluster_name,
|
|
99
|
+
job_id,
|
|
100
|
+
follow,
|
|
101
|
+
tail,
|
|
102
|
+
output_stream,
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@usage_lib.entrypoint
|
|
107
|
+
async def download_logs(
|
|
108
|
+
name: Optional[str],
|
|
109
|
+
job_id: Optional[int],
|
|
110
|
+
refresh: bool,
|
|
111
|
+
controller: bool,
|
|
112
|
+
local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[int, str]:
|
|
113
|
+
"""Async version of download_logs() that syncs down logs of managed jobs."""
|
|
114
|
+
return await context_utils.to_thread(sdk.download_logs, name, job_id,
|
|
115
|
+
refresh, controller, local_dir)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@usage_lib.entrypoint
|
|
119
|
+
async def dashboard() -> None:
|
|
120
|
+
"""Async version of dashboard() that starts a dashboard for managed jobs."""
|
|
121
|
+
return await context_utils.to_thread(sdk.dashboard)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# Deprecated functions
|
|
125
|
+
spot_launch = common_utils.deprecated_function(
|
|
126
|
+
launch,
|
|
127
|
+
name='sky.jobs.launch',
|
|
128
|
+
deprecated_name='spot_launch',
|
|
129
|
+
removing_version='0.8.0',
|
|
130
|
+
override_argument={'use_spot': True})
|
|
131
|
+
spot_queue = common_utils.deprecated_function(queue,
|
|
132
|
+
name='sky.jobs.queue',
|
|
133
|
+
deprecated_name='spot_queue',
|
|
134
|
+
removing_version='0.8.0')
|
|
135
|
+
spot_cancel = common_utils.deprecated_function(cancel,
|
|
136
|
+
name='sky.jobs.cancel',
|
|
137
|
+
deprecated_name='spot_cancel',
|
|
138
|
+
removing_version='0.8.0')
|
|
139
|
+
spot_tail_logs = common_utils.deprecated_function(
|
|
140
|
+
tail_logs,
|
|
141
|
+
name='sky.jobs.tail_logs',
|
|
142
|
+
deprecated_name='spot_tail_logs',
|
|
143
|
+
removing_version='0.8.0')
|
sky/jobs/constants.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Constants used for Managed Jobs."""
|
|
2
|
+
import os
|
|
2
3
|
from typing import Any, Dict, Union
|
|
3
4
|
|
|
4
5
|
from sky.skylet import constants as skylet_constants
|
|
@@ -9,17 +10,15 @@ JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
|
|
|
9
10
|
|
|
10
11
|
JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
|
|
11
12
|
|
|
13
|
+
JOB_CONTROLLER_INDICATOR_FILE = '~/.sky/is_jobs_controller'
|
|
14
|
+
|
|
15
|
+
CONSOLIDATED_SIGNAL_PATH = os.path.expanduser('~/.sky/signals/')
|
|
16
|
+
SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
|
|
12
17
|
# Resources as a dict for the jobs controller.
|
|
13
|
-
# Use smaller CPU instance type for jobs controller, but with more memory, i.e.
|
|
14
|
-
# r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
|
|
15
|
-
# and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
|
|
16
|
-
# Concurrently limits are set based on profiling. 4x num vCPUs is the launch
|
|
17
|
-
# parallelism limit, and memory / 350MB is the limit to concurrently running
|
|
18
|
-
# jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
|
|
19
18
|
# We use 50 GB disk size to reduce the cost.
|
|
20
19
|
CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = {
|
|
21
20
|
'cpus': '4+',
|
|
22
|
-
'memory': '
|
|
21
|
+
'memory': '4x',
|
|
23
22
|
'disk_size': 50
|
|
24
23
|
}
|
|
25
24
|
|
|
@@ -47,7 +46,9 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
|
|
|
47
46
|
# The version of the lib files that jobs/utils use. Whenever there is an API
|
|
48
47
|
# change for the jobs/utils, we need to bump this version and update
|
|
49
48
|
# job.utils.ManagedJobCodeGen to handle the version update.
|
|
50
|
-
|
|
49
|
+
# WARNING: If you update this due to a codegen change, make sure to make the
|
|
50
|
+
# corresponding change in the ManagedJobsService AND bump the SKYLET_VERSION.
|
|
51
|
+
MANAGED_JOBS_VERSION = 12
|
|
51
52
|
|
|
52
53
|
# The command for setting up the jobs dashboard on the controller. It firstly
|
|
53
54
|
# checks if the systemd services are available, and if not (e.g., Kubernetes
|