skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/__init__.py
CHANGED
|
@@ -11,6 +11,7 @@ from sky.jobs.client.sdk import pool_status
|
|
|
11
11
|
from sky.jobs.client.sdk import pool_sync_down_logs
|
|
12
12
|
from sky.jobs.client.sdk import pool_tail_logs
|
|
13
13
|
from sky.jobs.client.sdk import queue
|
|
14
|
+
from sky.jobs.client.sdk import queue_v2
|
|
14
15
|
from sky.jobs.client.sdk import tail_logs
|
|
15
16
|
from sky.jobs.constants import JOBS_CLUSTER_NAME_PREFIX_LENGTH
|
|
16
17
|
from sky.jobs.constants import JOBS_CONTROLLER_LOGS_DIR
|
|
@@ -38,6 +39,7 @@ __all__ = [
|
|
|
38
39
|
'cancel',
|
|
39
40
|
'launch',
|
|
40
41
|
'queue',
|
|
42
|
+
'queue_v2',
|
|
41
43
|
'tail_logs',
|
|
42
44
|
'dashboard',
|
|
43
45
|
'download_logs',
|
sky/jobs/client/sdk.py
CHANGED
|
@@ -9,11 +9,13 @@ from sky import sky_logging
|
|
|
9
9
|
from sky.adaptors import common as adaptors_common
|
|
10
10
|
from sky.client import common as client_common
|
|
11
11
|
from sky.client import sdk
|
|
12
|
+
from sky.schemas.api import responses
|
|
12
13
|
from sky.serve.client import impl
|
|
13
14
|
from sky.server import common as server_common
|
|
14
15
|
from sky.server import rest
|
|
15
16
|
from sky.server import versions
|
|
16
17
|
from sky.server.requests import payloads
|
|
18
|
+
from sky.server.requests import request_names
|
|
17
19
|
from sky.skylet import constants
|
|
18
20
|
from sky.usage import usage_lib
|
|
19
21
|
from sky.utils import admin_policy_utils
|
|
@@ -82,8 +84,11 @@ def launch(
|
|
|
82
84
|
raise click.UsageError('Cannot specify num_jobs without pool.')
|
|
83
85
|
|
|
84
86
|
dag = dag_utils.convert_entrypoint_to_dag(task)
|
|
87
|
+
|
|
85
88
|
with admin_policy_utils.apply_and_use_config_in_current_request(
|
|
86
|
-
dag,
|
|
89
|
+
dag,
|
|
90
|
+
request_name=request_names.AdminPolicyRequestName.JOBS_LAUNCH,
|
|
91
|
+
at_client_side=True) as dag:
|
|
87
92
|
sdk.validate(dag)
|
|
88
93
|
if _need_confirmation:
|
|
89
94
|
job_identity = 'a managed job'
|
|
@@ -123,6 +128,87 @@ def launch(
|
|
|
123
128
|
return server_common.get_request_id(response)
|
|
124
129
|
|
|
125
130
|
|
|
131
|
+
@usage_lib.entrypoint
|
|
132
|
+
@server_common.check_server_healthy_or_start
|
|
133
|
+
@versions.minimal_api_version(18)
|
|
134
|
+
def queue_v2(
|
|
135
|
+
refresh: bool,
|
|
136
|
+
skip_finished: bool = False,
|
|
137
|
+
all_users: bool = False,
|
|
138
|
+
job_ids: Optional[List[int]] = None,
|
|
139
|
+
limit: Optional[int] = None,
|
|
140
|
+
fields: Optional[List[str]] = None,
|
|
141
|
+
) -> server_common.RequestId[Tuple[List[responses.ManagedJobRecord], int, Dict[
|
|
142
|
+
str, int], int]]:
|
|
143
|
+
"""Gets statuses of managed jobs.
|
|
144
|
+
|
|
145
|
+
Please refer to sky.cli.job_queue for documentation.
|
|
146
|
+
|
|
147
|
+
Args:
|
|
148
|
+
refresh: Whether to restart the jobs controller if it is stopped.
|
|
149
|
+
skip_finished: Whether to skip finished jobs.
|
|
150
|
+
all_users: Whether to show all users' jobs.
|
|
151
|
+
job_ids: IDs of the managed jobs to show.
|
|
152
|
+
limit: Number of jobs to show.
|
|
153
|
+
fields: Fields to get for the managed jobs.
|
|
154
|
+
|
|
155
|
+
Returns:
|
|
156
|
+
The request ID of the queue request.
|
|
157
|
+
|
|
158
|
+
Request Returns:
|
|
159
|
+
job_records (List[responses.ManagedJobRecord]): A list of dicts, with each dict
|
|
160
|
+
containing the information of a job.
|
|
161
|
+
|
|
162
|
+
.. code-block:: python
|
|
163
|
+
|
|
164
|
+
[
|
|
165
|
+
{
|
|
166
|
+
'job_id': (int) job id,
|
|
167
|
+
'job_name': (str) job name,
|
|
168
|
+
'resources': (str) resources of the job,
|
|
169
|
+
'submitted_at': (float) timestamp of submission,
|
|
170
|
+
'end_at': (float) timestamp of end,
|
|
171
|
+
'job_duration': (float) duration in seconds,
|
|
172
|
+
'recovery_count': (int) Number of retries,
|
|
173
|
+
'status': (sky.jobs.ManagedJobStatus) of the job,
|
|
174
|
+
'cluster_resources': (str) resources of the cluster,
|
|
175
|
+
'region': (str) region of the cluster,
|
|
176
|
+
'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
|
|
177
|
+
'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
|
|
178
|
+
}
|
|
179
|
+
]
|
|
180
|
+
total (int): Total number of jobs after filter,
|
|
181
|
+
status_counts (Dict[str, int]): Status counts after filter,
|
|
182
|
+
total_no_filter (int): Total number of jobs before filter,
|
|
183
|
+
|
|
184
|
+
Request Raises:
|
|
185
|
+
sky.exceptions.ClusterNotUpError: the jobs controller is not up or
|
|
186
|
+
does not exist.
|
|
187
|
+
RuntimeError: if failed to get the managed jobs with ssh.
|
|
188
|
+
"""
|
|
189
|
+
body = payloads.JobsQueueV2Body(
|
|
190
|
+
refresh=refresh,
|
|
191
|
+
skip_finished=skip_finished,
|
|
192
|
+
all_users=all_users,
|
|
193
|
+
job_ids=job_ids,
|
|
194
|
+
limit=limit,
|
|
195
|
+
fields=fields,
|
|
196
|
+
)
|
|
197
|
+
path = '/jobs/queue/v2'
|
|
198
|
+
response = server_common.make_authenticated_request(
|
|
199
|
+
'POST',
|
|
200
|
+
path,
|
|
201
|
+
json=json.loads(body.model_dump_json()),
|
|
202
|
+
timeout=(5, None))
|
|
203
|
+
return server_common.get_request_id(response=response)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# Deprecated. Please use queue_v2 instead for better performance.
|
|
207
|
+
# In https://github.com/skypilot-org/skypilot/pull/7695, the `queue` function
|
|
208
|
+
# is updated to return new typed data for performance improvement if the API
|
|
209
|
+
# server supports it, which breaks the backward compatibility.
|
|
210
|
+
# In https://github.com/skypilot-org/skypilot/pull/8015, we revert the change
|
|
211
|
+
# and add a new function `queue_v2` to return the new typed data.
|
|
126
212
|
@usage_lib.entrypoint
|
|
127
213
|
@server_common.check_server_healthy_or_start
|
|
128
214
|
def queue(
|
|
@@ -130,9 +216,11 @@ def queue(
|
|
|
130
216
|
skip_finished: bool = False,
|
|
131
217
|
all_users: bool = False,
|
|
132
218
|
job_ids: Optional[List[int]] = None
|
|
133
|
-
) -> server_common.RequestId[List[
|
|
219
|
+
) -> server_common.RequestId[List[responses.ManagedJobRecord]]:
|
|
134
220
|
"""Gets statuses of managed jobs.
|
|
135
221
|
|
|
222
|
+
Deprecated. Please use queue_v2 instead for better performance.
|
|
223
|
+
|
|
136
224
|
Please refer to sky.cli.job_queue for documentation.
|
|
137
225
|
|
|
138
226
|
Args:
|
|
@@ -145,7 +233,7 @@ def queue(
|
|
|
145
233
|
The request ID of the queue request.
|
|
146
234
|
|
|
147
235
|
Request Returns:
|
|
148
|
-
job_records (List[
|
|
236
|
+
job_records (List[responses.ManagedJobRecord]): A list of dicts, with each dict
|
|
149
237
|
containing the information of a job.
|
|
150
238
|
|
|
151
239
|
.. code-block:: python
|
|
@@ -383,15 +471,24 @@ def dashboard() -> None:
|
|
|
383
471
|
@server_common.check_server_healthy_or_start
|
|
384
472
|
@versions.minimal_api_version(12)
|
|
385
473
|
def pool_apply(
|
|
386
|
-
task: Union['sky.Task', 'sky.Dag'],
|
|
474
|
+
task: Optional[Union['sky.Task', 'sky.Dag']],
|
|
387
475
|
pool_name: str,
|
|
388
476
|
mode: 'serve_utils.UpdateMode',
|
|
477
|
+
workers: Optional[int] = None,
|
|
389
478
|
# Internal only:
|
|
390
479
|
# pylint: disable=invalid-name
|
|
391
480
|
_need_confirmation: bool = False
|
|
392
481
|
) -> server_common.RequestId[None]:
|
|
393
482
|
"""Apply a config to a pool."""
|
|
483
|
+
remote_api_version = versions.get_remote_api_version()
|
|
484
|
+
if (workers is not None and
|
|
485
|
+
(remote_api_version is None or remote_api_version < 19)):
|
|
486
|
+
raise click.UsageError('Updating the number of workers in a pool is '
|
|
487
|
+
'not supported in your API server. Please '
|
|
488
|
+
'upgrade to a newer API server to use this '
|
|
489
|
+
'feature.')
|
|
394
490
|
return impl.apply(task,
|
|
491
|
+
workers,
|
|
395
492
|
pool_name,
|
|
396
493
|
mode,
|
|
397
494
|
pool=True,
|
sky/jobs/client/sdk_async.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
"""Async SDK functions for managed jobs."""
|
|
2
2
|
import typing
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
4
4
|
|
|
5
5
|
from sky import backends
|
|
6
6
|
from sky import sky_logging
|
|
7
7
|
from sky.adaptors import common as adaptors_common
|
|
8
8
|
from sky.client import sdk_async
|
|
9
9
|
from sky.jobs.client import sdk
|
|
10
|
+
from sky.schemas.api import responses
|
|
10
11
|
from sky.skylet import constants
|
|
11
12
|
from sky.usage import usage_lib
|
|
12
13
|
from sky.utils import common_utils
|
|
@@ -28,6 +29,8 @@ logger = sky_logging.init_logger(__name__)
|
|
|
28
29
|
async def launch(
|
|
29
30
|
task: Union['sky.Task', 'sky.Dag'],
|
|
30
31
|
name: Optional[str] = None,
|
|
32
|
+
pool: Optional[str] = None,
|
|
33
|
+
num_jobs: Optional[int] = None,
|
|
31
34
|
# Internal only:
|
|
32
35
|
# pylint: disable=invalid-name
|
|
33
36
|
_need_confirmation: bool = False,
|
|
@@ -35,8 +38,29 @@ async def launch(
|
|
|
35
38
|
sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG,
|
|
36
39
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
37
40
|
"""Async version of launch() that launches a managed job."""
|
|
38
|
-
request_id = await context_utils.to_thread(sdk.launch, task, name,
|
|
39
|
-
_need_confirmation)
|
|
41
|
+
request_id = await context_utils.to_thread(sdk.launch, task, name, pool,
|
|
42
|
+
num_jobs, _need_confirmation)
|
|
43
|
+
if stream_logs is not None:
|
|
44
|
+
return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
|
|
45
|
+
else:
|
|
46
|
+
return await sdk_async.get(request_id)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@usage_lib.entrypoint
|
|
50
|
+
async def queue_v2(
|
|
51
|
+
refresh: bool,
|
|
52
|
+
skip_finished: bool = False,
|
|
53
|
+
all_users: bool = False,
|
|
54
|
+
job_ids: Optional[List[int]] = None,
|
|
55
|
+
limit: Optional[int] = None,
|
|
56
|
+
fields: Optional[List[str]] = None,
|
|
57
|
+
stream_logs: Optional[
|
|
58
|
+
sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
|
|
59
|
+
) -> Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]:
|
|
60
|
+
"""Async version of queue_v2() that gets statuses of managed jobs."""
|
|
61
|
+
request_id = await context_utils.to_thread(sdk.queue_v2, refresh,
|
|
62
|
+
skip_finished, all_users,
|
|
63
|
+
job_ids, limit, fields)
|
|
40
64
|
if stream_logs is not None:
|
|
41
65
|
return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
|
|
42
66
|
else:
|
|
@@ -48,12 +72,14 @@ async def queue(
|
|
|
48
72
|
refresh: bool,
|
|
49
73
|
skip_finished: bool = False,
|
|
50
74
|
all_users: bool = False,
|
|
75
|
+
job_ids: Optional[List[int]] = None,
|
|
51
76
|
stream_logs: Optional[
|
|
52
77
|
sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
|
|
53
|
-
) -> List[
|
|
78
|
+
) -> List[responses.ManagedJobRecord]:
|
|
54
79
|
"""Async version of queue() that gets statuses of managed jobs."""
|
|
55
80
|
request_id = await context_utils.to_thread(sdk.queue, refresh,
|
|
56
|
-
skip_finished, all_users
|
|
81
|
+
skip_finished, all_users,
|
|
82
|
+
job_ids)
|
|
57
83
|
if stream_logs is not None:
|
|
58
84
|
return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
|
|
59
85
|
else:
|
sky/jobs/constants.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Constants used for Managed Jobs."""
|
|
2
|
+
import os
|
|
2
3
|
from typing import Any, Dict, Union
|
|
3
4
|
|
|
4
5
|
from sky.skylet import constants as skylet_constants
|
|
@@ -9,17 +10,21 @@ JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
|
|
|
9
10
|
|
|
10
11
|
JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
|
|
11
12
|
|
|
13
|
+
JOB_CONTROLLER_INDICATOR_FILE = '~/.sky/is_jobs_controller'
|
|
14
|
+
|
|
15
|
+
CONSOLIDATED_SIGNAL_PATH = os.path.expanduser('~/.sky/signals/')
|
|
16
|
+
SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
|
|
17
|
+
|
|
18
|
+
# The consolidation mode lock ensures that if multiple API servers are running
|
|
19
|
+
# at the same time (e.g. during a rolling update), recovery can only happen once
|
|
20
|
+
# the previous API server has exited.
|
|
21
|
+
CONSOLIDATION_MODE_LOCK_ID = '~/.sky/consolidation_mode_lock'
|
|
22
|
+
|
|
12
23
|
# Resources as a dict for the jobs controller.
|
|
13
|
-
# Use smaller CPU instance type for jobs controller, but with more memory, i.e.
|
|
14
|
-
# r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
|
|
15
|
-
# and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
|
|
16
|
-
# Concurrently limits are set based on profiling. 4x num vCPUs is the launch
|
|
17
|
-
# parallelism limit, and memory / 350MB is the limit to concurrently running
|
|
18
|
-
# jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
|
|
19
24
|
# We use 50 GB disk size to reduce the cost.
|
|
20
25
|
CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = {
|
|
21
26
|
'cpus': '4+',
|
|
22
|
-
'memory': '
|
|
27
|
+
'memory': '4x',
|
|
23
28
|
'disk_size': 50
|
|
24
29
|
}
|
|
25
30
|
|
|
@@ -47,7 +52,9 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
|
|
|
47
52
|
# The version of the lib files that jobs/utils use. Whenever there is an API
|
|
48
53
|
# change for the jobs/utils, we need to bump this version and update
|
|
49
54
|
# job.utils.ManagedJobCodeGen to handle the version update.
|
|
50
|
-
|
|
55
|
+
# WARNING: If you update this due to a codegen change, make sure to make the
|
|
56
|
+
# corresponding change in the ManagedJobsService AND bump the SKYLET_VERSION.
|
|
57
|
+
MANAGED_JOBS_VERSION = 12
|
|
51
58
|
|
|
52
59
|
# The command for setting up the jobs dashboard on the controller. It firstly
|
|
53
60
|
# checks if the systemd services are available, and if not (e.g., Kubernetes
|