skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/__init__.py
CHANGED
|
@@ -5,7 +5,13 @@ from sky.jobs.client.sdk import cancel
|
|
|
5
5
|
from sky.jobs.client.sdk import dashboard
|
|
6
6
|
from sky.jobs.client.sdk import download_logs
|
|
7
7
|
from sky.jobs.client.sdk import launch
|
|
8
|
+
from sky.jobs.client.sdk import pool_apply
|
|
9
|
+
from sky.jobs.client.sdk import pool_down
|
|
10
|
+
from sky.jobs.client.sdk import pool_status
|
|
11
|
+
from sky.jobs.client.sdk import pool_sync_down_logs
|
|
12
|
+
from sky.jobs.client.sdk import pool_tail_logs
|
|
8
13
|
from sky.jobs.client.sdk import queue
|
|
14
|
+
from sky.jobs.client.sdk import queue_v2
|
|
9
15
|
from sky.jobs.client.sdk import tail_logs
|
|
10
16
|
from sky.jobs.constants import JOBS_CLUSTER_NAME_PREFIX_LENGTH
|
|
11
17
|
from sky.jobs.constants import JOBS_CONTROLLER_LOGS_DIR
|
|
@@ -33,6 +39,7 @@ __all__ = [
|
|
|
33
39
|
'cancel',
|
|
34
40
|
'launch',
|
|
35
41
|
'queue',
|
|
42
|
+
'queue_v2',
|
|
36
43
|
'tail_logs',
|
|
37
44
|
'dashboard',
|
|
38
45
|
'download_logs',
|
sky/jobs/client/sdk.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
"""SDK functions for managed jobs."""
|
|
2
2
|
import json
|
|
3
3
|
import typing
|
|
4
|
-
from typing import Dict, List, Optional, Union
|
|
5
|
-
import webbrowser
|
|
4
|
+
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
|
|
6
5
|
|
|
7
6
|
import click
|
|
8
7
|
|
|
@@ -10,34 +9,47 @@ from sky import sky_logging
|
|
|
10
9
|
from sky.adaptors import common as adaptors_common
|
|
11
10
|
from sky.client import common as client_common
|
|
12
11
|
from sky.client import sdk
|
|
12
|
+
from sky.schemas.api import responses
|
|
13
|
+
from sky.serve.client import impl
|
|
13
14
|
from sky.server import common as server_common
|
|
15
|
+
from sky.server import rest
|
|
16
|
+
from sky.server import versions
|
|
14
17
|
from sky.server.requests import payloads
|
|
18
|
+
from sky.server.requests import request_names
|
|
15
19
|
from sky.skylet import constants
|
|
16
20
|
from sky.usage import usage_lib
|
|
21
|
+
from sky.utils import admin_policy_utils
|
|
17
22
|
from sky.utils import common_utils
|
|
23
|
+
from sky.utils import context
|
|
18
24
|
from sky.utils import dag_utils
|
|
19
25
|
|
|
20
26
|
if typing.TYPE_CHECKING:
|
|
21
27
|
import io
|
|
22
|
-
|
|
23
|
-
import requests
|
|
28
|
+
import webbrowser
|
|
24
29
|
|
|
25
30
|
import sky
|
|
31
|
+
from sky import backends
|
|
32
|
+
from sky.serve import serve_utils
|
|
26
33
|
else:
|
|
27
|
-
|
|
34
|
+
# only used in dashboard()
|
|
35
|
+
webbrowser = adaptors_common.LazyImport('webbrowser')
|
|
28
36
|
|
|
29
37
|
logger = sky_logging.init_logger(__name__)
|
|
30
38
|
|
|
31
39
|
|
|
40
|
+
@context.contextual
|
|
32
41
|
@usage_lib.entrypoint
|
|
33
42
|
@server_common.check_server_healthy_or_start
|
|
34
43
|
def launch(
|
|
35
44
|
task: Union['sky.Task', 'sky.Dag'],
|
|
36
45
|
name: Optional[str] = None,
|
|
46
|
+
pool: Optional[str] = None,
|
|
47
|
+
num_jobs: Optional[int] = None,
|
|
37
48
|
# Internal only:
|
|
38
49
|
# pylint: disable=invalid-name
|
|
39
50
|
_need_confirmation: bool = False,
|
|
40
|
-
) -> server_common.RequestId
|
|
51
|
+
) -> server_common.RequestId[Tuple[Optional[int],
|
|
52
|
+
Optional['backends.ResourceHandle']]]:
|
|
41
53
|
"""Launches a managed job.
|
|
42
54
|
|
|
43
55
|
Please refer to sky.cli.job_launch for documentation.
|
|
@@ -62,50 +74,166 @@ def launch(
|
|
|
62
74
|
chain dag.
|
|
63
75
|
sky.exceptions.NotSupportedError: the feature is not supported.
|
|
64
76
|
"""
|
|
77
|
+
remote_api_version = versions.get_remote_api_version()
|
|
78
|
+
if (pool is not None and
|
|
79
|
+
(remote_api_version is None or remote_api_version < 12)):
|
|
80
|
+
raise click.UsageError('Pools are not supported in your API server. '
|
|
81
|
+
'Please upgrade to a newer API server to use '
|
|
82
|
+
'pools.')
|
|
83
|
+
if pool is None and num_jobs is not None:
|
|
84
|
+
raise click.UsageError('Cannot specify num_jobs without pool.')
|
|
65
85
|
|
|
66
86
|
dag = dag_utils.convert_entrypoint_to_dag(task)
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
87
|
+
|
|
88
|
+
with admin_policy_utils.apply_and_use_config_in_current_request(
|
|
89
|
+
dag,
|
|
90
|
+
request_name=request_names.AdminPolicyRequestName.JOBS_LAUNCH,
|
|
91
|
+
at_client_side=True) as dag:
|
|
92
|
+
sdk.validate(dag)
|
|
93
|
+
if _need_confirmation:
|
|
94
|
+
job_identity = 'a managed job'
|
|
95
|
+
if pool is None:
|
|
96
|
+
optimize_request_id = sdk.optimize(dag)
|
|
97
|
+
sdk.stream_and_get(optimize_request_id)
|
|
98
|
+
else:
|
|
99
|
+
pool_status_request_id = pool_status(pool)
|
|
100
|
+
pool_statuses = sdk.get(pool_status_request_id)
|
|
101
|
+
if not pool_statuses:
|
|
102
|
+
raise click.UsageError(f'Pool {pool!r} not found.')
|
|
103
|
+
resources = pool_statuses[0]['requested_resources_str']
|
|
104
|
+
click.secho(f'Use resources from pool {pool!r}: {resources}.',
|
|
105
|
+
fg='green')
|
|
106
|
+
if num_jobs is not None:
|
|
107
|
+
job_identity = f'{num_jobs} managed jobs'
|
|
108
|
+
prompt = f'Launching {job_identity} {dag.name!r}. Proceed?'
|
|
109
|
+
if prompt is not None:
|
|
110
|
+
click.confirm(prompt,
|
|
111
|
+
default=True,
|
|
112
|
+
abort=True,
|
|
113
|
+
show_default=True)
|
|
114
|
+
|
|
115
|
+
dag = client_common.upload_mounts_to_api_server(dag)
|
|
116
|
+
dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
|
|
117
|
+
body = payloads.JobsLaunchBody(
|
|
118
|
+
task=dag_str,
|
|
119
|
+
name=name,
|
|
120
|
+
pool=pool,
|
|
121
|
+
num_jobs=num_jobs,
|
|
122
|
+
)
|
|
123
|
+
response = server_common.make_authenticated_request(
|
|
124
|
+
'POST',
|
|
125
|
+
'/jobs/launch',
|
|
126
|
+
json=json.loads(body.model_dump_json()),
|
|
127
|
+
timeout=(5, None))
|
|
128
|
+
return server_common.get_request_id(response)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
@usage_lib.entrypoint
|
|
132
|
+
@server_common.check_server_healthy_or_start
|
|
133
|
+
@versions.minimal_api_version(18)
|
|
134
|
+
def queue_v2(
|
|
135
|
+
refresh: bool,
|
|
136
|
+
skip_finished: bool = False,
|
|
137
|
+
all_users: bool = False,
|
|
138
|
+
job_ids: Optional[List[int]] = None,
|
|
139
|
+
limit: Optional[int] = None,
|
|
140
|
+
fields: Optional[List[str]] = None,
|
|
141
|
+
) -> server_common.RequestId[Tuple[List[responses.ManagedJobRecord], int, Dict[
|
|
142
|
+
str, int], int]]:
|
|
143
|
+
"""Gets statuses of managed jobs.
|
|
144
|
+
|
|
145
|
+
Please refer to sky.cli.job_queue for documentation.
|
|
146
|
+
|
|
147
|
+
Args:
|
|
148
|
+
refresh: Whether to restart the jobs controller if it is stopped.
|
|
149
|
+
skip_finished: Whether to skip finished jobs.
|
|
150
|
+
all_users: Whether to show all users' jobs.
|
|
151
|
+
job_ids: IDs of the managed jobs to show.
|
|
152
|
+
limit: Number of jobs to show.
|
|
153
|
+
fields: Fields to get for the managed jobs.
|
|
154
|
+
|
|
155
|
+
Returns:
|
|
156
|
+
The request ID of the queue request.
|
|
157
|
+
|
|
158
|
+
Request Returns:
|
|
159
|
+
job_records (List[responses.ManagedJobRecord]): A list of dicts, with each dict
|
|
160
|
+
containing the information of a job.
|
|
161
|
+
|
|
162
|
+
.. code-block:: python
|
|
163
|
+
|
|
164
|
+
[
|
|
165
|
+
{
|
|
166
|
+
'job_id': (int) job id,
|
|
167
|
+
'job_name': (str) job name,
|
|
168
|
+
'resources': (str) resources of the job,
|
|
169
|
+
'submitted_at': (float) timestamp of submission,
|
|
170
|
+
'end_at': (float) timestamp of end,
|
|
171
|
+
'job_duration': (float) duration in seconds,
|
|
172
|
+
'recovery_count': (int) Number of retries,
|
|
173
|
+
'status': (sky.jobs.ManagedJobStatus) of the job,
|
|
174
|
+
'cluster_resources': (str) resources of the cluster,
|
|
175
|
+
'region': (str) region of the cluster,
|
|
176
|
+
'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
|
|
177
|
+
'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
|
|
178
|
+
}
|
|
179
|
+
]
|
|
180
|
+
total (int): Total number of jobs after filter,
|
|
181
|
+
status_counts (Dict[str, int]): Status counts after filter,
|
|
182
|
+
total_no_filter (int): Total number of jobs before filter,
|
|
183
|
+
|
|
184
|
+
Request Raises:
|
|
185
|
+
sky.exceptions.ClusterNotUpError: the jobs controller is not up or
|
|
186
|
+
does not exist.
|
|
187
|
+
RuntimeError: if failed to get the managed jobs with ssh.
|
|
188
|
+
"""
|
|
189
|
+
body = payloads.JobsQueueV2Body(
|
|
190
|
+
refresh=refresh,
|
|
191
|
+
skip_finished=skip_finished,
|
|
192
|
+
all_users=all_users,
|
|
193
|
+
job_ids=job_ids,
|
|
194
|
+
limit=limit,
|
|
195
|
+
fields=fields,
|
|
80
196
|
)
|
|
81
|
-
|
|
82
|
-
|
|
197
|
+
path = '/jobs/queue/v2'
|
|
198
|
+
response = server_common.make_authenticated_request(
|
|
199
|
+
'POST',
|
|
200
|
+
path,
|
|
83
201
|
json=json.loads(body.model_dump_json()),
|
|
84
|
-
timeout=(5, None)
|
|
85
|
-
|
|
86
|
-
)
|
|
87
|
-
return server_common.get_request_id(response)
|
|
202
|
+
timeout=(5, None))
|
|
203
|
+
return server_common.get_request_id(response=response)
|
|
88
204
|
|
|
89
205
|
|
|
206
|
+
# Deprecated. Please use queue_v2 instead for better performance.
|
|
207
|
+
# In https://github.com/skypilot-org/skypilot/pull/7695, the `queue` function
|
|
208
|
+
# is updated to return new typed data for performance improvement if the API
|
|
209
|
+
# server supports it, which breaks the backward compatibility.
|
|
210
|
+
# In https://github.com/skypilot-org/skypilot/pull/8015, we revert the change
|
|
211
|
+
# and add a new function `queue_v2` to return the new typed data.
|
|
90
212
|
@usage_lib.entrypoint
|
|
91
213
|
@server_common.check_server_healthy_or_start
|
|
92
|
-
def queue(
|
|
93
|
-
|
|
94
|
-
|
|
214
|
+
def queue(
|
|
215
|
+
refresh: bool,
|
|
216
|
+
skip_finished: bool = False,
|
|
217
|
+
all_users: bool = False,
|
|
218
|
+
job_ids: Optional[List[int]] = None
|
|
219
|
+
) -> server_common.RequestId[List[responses.ManagedJobRecord]]:
|
|
95
220
|
"""Gets statuses of managed jobs.
|
|
96
221
|
|
|
222
|
+
Deprecated. Please use queue_v2 instead for better performance.
|
|
223
|
+
|
|
97
224
|
Please refer to sky.cli.job_queue for documentation.
|
|
98
225
|
|
|
99
226
|
Args:
|
|
100
227
|
refresh: Whether to restart the jobs controller if it is stopped.
|
|
101
228
|
skip_finished: Whether to skip finished jobs.
|
|
102
229
|
all_users: Whether to show all users' jobs.
|
|
230
|
+
job_ids: IDs of the managed jobs to show.
|
|
103
231
|
|
|
104
232
|
Returns:
|
|
105
233
|
The request ID of the queue request.
|
|
106
234
|
|
|
107
235
|
Request Returns:
|
|
108
|
-
job_records (List[
|
|
236
|
+
job_records (List[responses.ManagedJobRecord]): A list of dicts, with each dict
|
|
109
237
|
containing the information of a job.
|
|
110
238
|
|
|
111
239
|
.. code-block:: python
|
|
@@ -117,11 +245,13 @@ def queue(refresh: bool,
|
|
|
117
245
|
'resources': (str) resources of the job,
|
|
118
246
|
'submitted_at': (float) timestamp of submission,
|
|
119
247
|
'end_at': (float) timestamp of end,
|
|
120
|
-
'
|
|
248
|
+
'job_duration': (float) duration in seconds,
|
|
121
249
|
'recovery_count': (int) Number of retries,
|
|
122
250
|
'status': (sky.jobs.ManagedJobStatus) of the job,
|
|
123
251
|
'cluster_resources': (str) resources of the cluster,
|
|
124
252
|
'region': (str) region of the cluster,
|
|
253
|
+
'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
|
|
254
|
+
'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
|
|
125
255
|
}
|
|
126
256
|
]
|
|
127
257
|
|
|
@@ -134,13 +264,13 @@ def queue(refresh: bool,
|
|
|
134
264
|
refresh=refresh,
|
|
135
265
|
skip_finished=skip_finished,
|
|
136
266
|
all_users=all_users,
|
|
267
|
+
job_ids=job_ids,
|
|
137
268
|
)
|
|
138
|
-
response =
|
|
139
|
-
|
|
269
|
+
response = server_common.make_authenticated_request(
|
|
270
|
+
'POST',
|
|
271
|
+
'/jobs/queue',
|
|
140
272
|
json=json.loads(body.model_dump_json()),
|
|
141
|
-
timeout=(5, None)
|
|
142
|
-
cookies=server_common.get_api_cookie_jar(),
|
|
143
|
-
)
|
|
273
|
+
timeout=(5, None))
|
|
144
274
|
return server_common.get_request_id(response=response)
|
|
145
275
|
|
|
146
276
|
|
|
@@ -148,10 +278,11 @@ def queue(refresh: bool,
|
|
|
148
278
|
@server_common.check_server_healthy_or_start
|
|
149
279
|
def cancel(
|
|
150
280
|
name: Optional[str] = None,
|
|
151
|
-
job_ids: Optional[
|
|
281
|
+
job_ids: Optional[Sequence[int]] = None,
|
|
152
282
|
all: bool = False, # pylint: disable=redefined-builtin
|
|
153
283
|
all_users: bool = False,
|
|
154
|
-
|
|
284
|
+
pool: Optional[str] = None,
|
|
285
|
+
) -> server_common.RequestId[None]:
|
|
155
286
|
"""Cancels managed jobs.
|
|
156
287
|
|
|
157
288
|
Please refer to sky.cli.job_cancel for documentation.
|
|
@@ -161,6 +292,7 @@ def cancel(
|
|
|
161
292
|
job_ids: IDs of the managed jobs to cancel.
|
|
162
293
|
all: Whether to cancel all managed jobs.
|
|
163
294
|
all_users: Whether to cancel all managed jobs from all users.
|
|
295
|
+
pool: Pool name to cancel.
|
|
164
296
|
|
|
165
297
|
Returns:
|
|
166
298
|
The request ID of the cancel request.
|
|
@@ -169,29 +301,37 @@ def cancel(
|
|
|
169
301
|
sky.exceptions.ClusterNotUpError: the jobs controller is not up.
|
|
170
302
|
RuntimeError: failed to cancel the job.
|
|
171
303
|
"""
|
|
304
|
+
remote_api_version = versions.get_remote_api_version()
|
|
305
|
+
if (pool is not None and
|
|
306
|
+
(remote_api_version is None or remote_api_version < 12)):
|
|
307
|
+
raise click.UsageError('Pools are not supported in your API server. '
|
|
308
|
+
'Please upgrade to a newer API server to use '
|
|
309
|
+
'pools.')
|
|
172
310
|
body = payloads.JobsCancelBody(
|
|
173
311
|
name=name,
|
|
174
312
|
job_ids=job_ids,
|
|
175
313
|
all=all,
|
|
176
314
|
all_users=all_users,
|
|
315
|
+
pool=pool,
|
|
177
316
|
)
|
|
178
|
-
response =
|
|
179
|
-
|
|
317
|
+
response = server_common.make_authenticated_request(
|
|
318
|
+
'POST',
|
|
319
|
+
'/jobs/cancel',
|
|
180
320
|
json=json.loads(body.model_dump_json()),
|
|
181
|
-
timeout=(5, None)
|
|
182
|
-
cookies=server_common.get_api_cookie_jar(),
|
|
183
|
-
)
|
|
321
|
+
timeout=(5, None))
|
|
184
322
|
return server_common.get_request_id(response=response)
|
|
185
323
|
|
|
186
324
|
|
|
187
325
|
@usage_lib.entrypoint
|
|
188
326
|
@server_common.check_server_healthy_or_start
|
|
327
|
+
@rest.retry_transient_errors()
|
|
189
328
|
def tail_logs(name: Optional[str] = None,
|
|
190
329
|
job_id: Optional[int] = None,
|
|
191
330
|
follow: bool = True,
|
|
192
331
|
controller: bool = False,
|
|
193
332
|
refresh: bool = False,
|
|
194
|
-
|
|
333
|
+
tail: Optional[int] = None,
|
|
334
|
+
output_stream: Optional['io.TextIOBase'] = None) -> Optional[int]:
|
|
195
335
|
"""Tails logs of managed jobs.
|
|
196
336
|
|
|
197
337
|
You can provide either a job name or a job ID to tail logs. If both are not
|
|
@@ -203,6 +343,7 @@ def tail_logs(name: Optional[str] = None,
|
|
|
203
343
|
follow: Whether to follow the logs.
|
|
204
344
|
controller: Whether to tail logs from the jobs controller.
|
|
205
345
|
refresh: Whether to restart the jobs controller if it is stopped.
|
|
346
|
+
tail: Number of lines to tail from the end of the log file.
|
|
206
347
|
output_stream: The stream to write the logs to. If None, print to the
|
|
207
348
|
console.
|
|
208
349
|
|
|
@@ -210,6 +351,8 @@ def tail_logs(name: Optional[str] = None,
|
|
|
210
351
|
Exit code based on success or failure of the job. 0 if success,
|
|
211
352
|
100 if the job failed. See exceptions.JobExitCode for possible exit
|
|
212
353
|
codes.
|
|
354
|
+
Will return None if follow is False
|
|
355
|
+
(see note in sky/client/sdk.py::stream_response)
|
|
213
356
|
|
|
214
357
|
Request Raises:
|
|
215
358
|
ValueError: invalid arguments.
|
|
@@ -221,16 +364,23 @@ def tail_logs(name: Optional[str] = None,
|
|
|
221
364
|
follow=follow,
|
|
222
365
|
controller=controller,
|
|
223
366
|
refresh=refresh,
|
|
367
|
+
tail=tail,
|
|
224
368
|
)
|
|
225
|
-
response =
|
|
226
|
-
|
|
369
|
+
response = server_common.make_authenticated_request(
|
|
370
|
+
'POST',
|
|
371
|
+
'/jobs/logs',
|
|
227
372
|
json=json.loads(body.model_dump_json()),
|
|
228
373
|
stream=True,
|
|
229
|
-
timeout=(5, None)
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
374
|
+
timeout=(5, None))
|
|
375
|
+
request_id: server_common.RequestId[int] = server_common.get_request_id(
|
|
376
|
+
response)
|
|
377
|
+
# Log request is idempotent when tail is 0, thus can resume previous
|
|
378
|
+
# streaming point on retry.
|
|
379
|
+
return sdk.stream_response(request_id=request_id,
|
|
380
|
+
response=response,
|
|
381
|
+
output_stream=output_stream,
|
|
382
|
+
resumable=(tail == 0),
|
|
383
|
+
get_result=follow)
|
|
234
384
|
|
|
235
385
|
|
|
236
386
|
@usage_lib.entrypoint
|
|
@@ -267,18 +417,18 @@ def download_logs(
|
|
|
267
417
|
controller=controller,
|
|
268
418
|
local_dir=local_dir,
|
|
269
419
|
)
|
|
270
|
-
response =
|
|
271
|
-
|
|
420
|
+
response = server_common.make_authenticated_request(
|
|
421
|
+
'POST',
|
|
422
|
+
'/jobs/download_logs',
|
|
272
423
|
json=json.loads(body.model_dump_json()),
|
|
273
|
-
timeout=(5, None)
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
job_id_remote_path_dict = sdk.stream_and_get(
|
|
277
|
-
server_common.get_request_id(response))
|
|
424
|
+
timeout=(5, None))
|
|
425
|
+
request_id: server_common.RequestId[Dict[
|
|
426
|
+
str, str]] = server_common.get_request_id(response)
|
|
427
|
+
job_id_remote_path_dict = sdk.stream_and_get(request_id)
|
|
278
428
|
remote2local_path_dict = client_common.download_logs_from_api_server(
|
|
279
429
|
job_id_remote_path_dict.values())
|
|
280
430
|
return {
|
|
281
|
-
job_id: remote2local_path_dict[remote_path]
|
|
431
|
+
int(job_id): remote2local_path_dict[remote_path]
|
|
282
432
|
for job_id, remote_path in job_id_remote_path_dict.items()
|
|
283
433
|
}
|
|
284
434
|
|
|
@@ -314,3 +464,95 @@ def dashboard() -> None:
|
|
|
314
464
|
url = f'{api_server_url}/jobs/dashboard?{params}'
|
|
315
465
|
logger.info(f'Opening dashboard in browser: {url}')
|
|
316
466
|
webbrowser.open(url)
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
@context.contextual
|
|
470
|
+
@usage_lib.entrypoint
|
|
471
|
+
@server_common.check_server_healthy_or_start
|
|
472
|
+
@versions.minimal_api_version(12)
|
|
473
|
+
def pool_apply(
|
|
474
|
+
task: Optional[Union['sky.Task', 'sky.Dag']],
|
|
475
|
+
pool_name: str,
|
|
476
|
+
mode: 'serve_utils.UpdateMode',
|
|
477
|
+
workers: Optional[int] = None,
|
|
478
|
+
# Internal only:
|
|
479
|
+
# pylint: disable=invalid-name
|
|
480
|
+
_need_confirmation: bool = False
|
|
481
|
+
) -> server_common.RequestId[None]:
|
|
482
|
+
"""Apply a config to a pool."""
|
|
483
|
+
remote_api_version = versions.get_remote_api_version()
|
|
484
|
+
if (workers is not None and
|
|
485
|
+
(remote_api_version is None or remote_api_version < 19)):
|
|
486
|
+
raise click.UsageError('Updating the number of workers in a pool is '
|
|
487
|
+
'not supported in your API server. Please '
|
|
488
|
+
'upgrade to a newer API server to use this '
|
|
489
|
+
'feature.')
|
|
490
|
+
return impl.apply(task,
|
|
491
|
+
workers,
|
|
492
|
+
pool_name,
|
|
493
|
+
mode,
|
|
494
|
+
pool=True,
|
|
495
|
+
_need_confirmation=_need_confirmation)
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
@usage_lib.entrypoint
|
|
499
|
+
@server_common.check_server_healthy_or_start
|
|
500
|
+
@versions.minimal_api_version(12)
|
|
501
|
+
def pool_down(
|
|
502
|
+
pool_names: Optional[Union[str, List[str]]],
|
|
503
|
+
all: bool = False, # pylint: disable=redefined-builtin
|
|
504
|
+
purge: bool = False,
|
|
505
|
+
) -> server_common.RequestId[None]:
|
|
506
|
+
"""Delete a pool."""
|
|
507
|
+
return impl.down(pool_names, all, purge, pool=True)
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
@usage_lib.entrypoint
|
|
511
|
+
@server_common.check_server_healthy_or_start
|
|
512
|
+
@versions.minimal_api_version(12)
|
|
513
|
+
def pool_status(
|
|
514
|
+
pool_names: Optional[Union[str, List[str]]],
|
|
515
|
+
) -> server_common.RequestId[List[Dict[str, Any]]]:
|
|
516
|
+
"""Query a pool."""
|
|
517
|
+
return impl.status(pool_names, pool=True)
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
@usage_lib.entrypoint
|
|
521
|
+
@server_common.check_server_healthy_or_start
|
|
522
|
+
@rest.retry_transient_errors()
|
|
523
|
+
@versions.minimal_api_version(16)
|
|
524
|
+
def pool_tail_logs(pool_name: str,
|
|
525
|
+
target: Union[str, 'serve_utils.ServiceComponent'],
|
|
526
|
+
worker_id: Optional[int] = None,
|
|
527
|
+
follow: bool = True,
|
|
528
|
+
output_stream: Optional['io.TextIOBase'] = None,
|
|
529
|
+
tail: Optional[int] = None) -> None:
|
|
530
|
+
"""Tails logs of a pool."""
|
|
531
|
+
return impl.tail_logs(pool_name,
|
|
532
|
+
target,
|
|
533
|
+
worker_id,
|
|
534
|
+
follow,
|
|
535
|
+
output_stream,
|
|
536
|
+
tail,
|
|
537
|
+
pool=True)
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
@usage_lib.entrypoint
|
|
541
|
+
@server_common.check_server_healthy_or_start
|
|
542
|
+
@rest.retry_transient_errors()
|
|
543
|
+
@versions.minimal_api_version(16)
|
|
544
|
+
def pool_sync_down_logs(pool_name: str,
|
|
545
|
+
local_dir: str,
|
|
546
|
+
*,
|
|
547
|
+
targets: Optional[Union[
|
|
548
|
+
str, 'serve_utils.ServiceComponent', Sequence[Union[
|
|
549
|
+
str, 'serve_utils.ServiceComponent']]]] = None,
|
|
550
|
+
worker_ids: Optional[List[int]] = None,
|
|
551
|
+
tail: Optional[int] = None) -> None:
|
|
552
|
+
"""Sync down logs of a pool."""
|
|
553
|
+
return impl.sync_down_logs(pool_name,
|
|
554
|
+
local_dir,
|
|
555
|
+
targets=targets,
|
|
556
|
+
replica_ids=worker_ids,
|
|
557
|
+
tail=tail,
|
|
558
|
+
pool=True)
|