skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/skylet/constants.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
"""Constants for SkyPilot."""
|
|
2
|
-
import os
|
|
3
2
|
from typing import List, Tuple
|
|
4
3
|
|
|
5
4
|
from packaging import version
|
|
@@ -7,6 +6,23 @@ from packaging import version
|
|
|
7
6
|
import sky
|
|
8
7
|
from sky.setup_files import dependencies
|
|
9
8
|
|
|
9
|
+
# The base directory for all SkyPilot runtime artifacts.
|
|
10
|
+
# Historically, we have always used $HOME, but we couldn't
|
|
11
|
+
# do that for Slurm, because $HOME typically points to a NFS
|
|
12
|
+
# mounted directory, which does not work well with SQLite.
|
|
13
|
+
# https://sqlite.org/faq.html#q5
|
|
14
|
+
# Additionally, having the skypilot-runtime python venv be
|
|
15
|
+
# on an NFS makes things very slow.
|
|
16
|
+
SKY_RUNTIME_DIR = '${SKY_RUNTIME_DIR:-$HOME}'
|
|
17
|
+
# Same as above but for use within python code instead of shell commands.
|
|
18
|
+
# Example usage:
|
|
19
|
+
# os.path.join(
|
|
20
|
+
# os.path.expanduser(os.environ.get(SKY_RUNTIME_DIR_ENV_VAR_KEY, '~')),
|
|
21
|
+
# '.sky/jobs.db')
|
|
22
|
+
SKY_RUNTIME_DIR_ENV_VAR_KEY = 'SKY_RUNTIME_DIR'
|
|
23
|
+
# We keep sky_logs and sky_workdir in $HOME, because
|
|
24
|
+
# these are artifacts that users can access, and having
|
|
25
|
+
# them be in $HOME makes it more convenient.
|
|
10
26
|
SKY_LOGS_DIRECTORY = '~/sky_logs'
|
|
11
27
|
SKY_REMOTE_WORKDIR = '~/sky_workdir'
|
|
12
28
|
SKY_IGNORE_FILE = '.skyignore'
|
|
@@ -25,22 +41,23 @@ SKY_REMOTE_RAY_PORT_DICT_STR = (
|
|
|
25
41
|
f'"ray_dashboard_port":{SKY_REMOTE_RAY_DASHBOARD_PORT}}}')
|
|
26
42
|
# The file contains the ports of the Ray cluster that SkyPilot launched,
|
|
27
43
|
# i.e. the PORT_DICT_STR above.
|
|
28
|
-
SKY_REMOTE_RAY_PORT_FILE = '
|
|
44
|
+
SKY_REMOTE_RAY_PORT_FILE = '.sky/ray_port.json'
|
|
29
45
|
SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
|
|
30
46
|
SKY_REMOTE_RAY_VERSION = '2.9.3'
|
|
31
47
|
|
|
48
|
+
SKY_UNSET_PYTHONPATH = 'env -u PYTHONPATH'
|
|
32
49
|
# We store the absolute path of the python executable (/opt/conda/bin/python3)
|
|
33
50
|
# in this file, so that any future internal commands that need to use python
|
|
34
51
|
# can use this path. This is useful for the case where the user has a custom
|
|
35
52
|
# conda environment as a default environment, which is not the same as the one
|
|
36
53
|
# used for installing SkyPilot runtime (ray and skypilot).
|
|
37
|
-
SKY_PYTHON_PATH_FILE = '
|
|
38
|
-
SKY_RAY_PATH_FILE = '
|
|
54
|
+
SKY_PYTHON_PATH_FILE = f'{SKY_RUNTIME_DIR}/.sky/python_path'
|
|
55
|
+
SKY_RAY_PATH_FILE = f'{SKY_RUNTIME_DIR}/.sky/ray_path'
|
|
39
56
|
SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
|
|
40
57
|
f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
|
|
41
58
|
'which python3')
|
|
42
59
|
# Python executable, e.g., /opt/conda/bin/python3
|
|
43
|
-
SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
|
|
60
|
+
SKY_PYTHON_CMD = f'{SKY_UNSET_PYTHONPATH} $({SKY_GET_PYTHON_PATH_CMD})'
|
|
44
61
|
# Prefer SKY_UV_PIP_CMD, which is faster.
|
|
45
62
|
# TODO(cooperc): remove remaining usage (GCP TPU setup).
|
|
46
63
|
SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
|
|
@@ -52,21 +69,29 @@ SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
|
|
|
52
69
|
f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')
|
|
53
70
|
# Separate env for SkyPilot runtime dependencies.
|
|
54
71
|
SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
|
|
55
|
-
SKY_REMOTE_PYTHON_ENV: str = f'
|
|
72
|
+
SKY_REMOTE_PYTHON_ENV: str = f'{SKY_RUNTIME_DIR}/{SKY_REMOTE_PYTHON_ENV_NAME}'
|
|
56
73
|
ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
|
|
57
74
|
# uv is used for venv and pip, much faster than python implementations.
|
|
58
75
|
SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
|
|
59
|
-
|
|
76
|
+
# set UV_SYSTEM_PYTHON to false in case the
|
|
77
|
+
# user-provided docker image sets it to true.
|
|
78
|
+
# Unset PYTHONPATH in case the user-provided docker image sets it.
|
|
79
|
+
SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
|
|
80
|
+
f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
|
|
60
81
|
# This won't reinstall uv if it's already installed, so it's safe to re-run.
|
|
61
82
|
SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
|
|
62
83
|
'curl -LsSf https://astral.sh/uv/install.sh '
|
|
63
84
|
f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
|
|
64
85
|
SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
|
|
65
|
-
|
|
66
|
-
|
|
86
|
+
SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run '
|
|
87
|
+
'--no-project --no-config')
|
|
88
|
+
# Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
|
|
89
|
+
# VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
|
|
90
|
+
# not work when conda is used.
|
|
67
91
|
DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
|
|
68
92
|
'export PATH='
|
|
69
|
-
f'$(echo $PATH | sed "s|$(echo
|
|
93
|
+
f'$(echo $PATH | sed "s|$(echo {SKY_REMOTE_PYTHON_ENV})/bin:||") && '
|
|
94
|
+
'unset VIRTUAL_ENV && unset VIRTUAL_ENV_PROMPT')
|
|
70
95
|
|
|
71
96
|
# Prefix for SkyPilot environment variables
|
|
72
97
|
SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
|
|
@@ -91,14 +116,17 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
|
91
116
|
# cluster yaml is updated.
|
|
92
117
|
#
|
|
93
118
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
|
94
|
-
SKYLET_VERSION = '
|
|
119
|
+
SKYLET_VERSION = '27'
|
|
95
120
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
|
96
121
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
|
97
122
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
|
98
123
|
SKYLET_LIB_VERSION = 4
|
|
99
|
-
SKYLET_VERSION_FILE = '
|
|
124
|
+
SKYLET_VERSION_FILE = '.sky/skylet_version'
|
|
125
|
+
SKYLET_LOG_FILE = '.sky/skylet.log'
|
|
126
|
+
SKYLET_PID_FILE = '.sky/skylet_pid'
|
|
127
|
+
SKYLET_PORT_FILE = '.sky/skylet_port'
|
|
100
128
|
SKYLET_GRPC_PORT = 46590
|
|
101
|
-
SKYLET_GRPC_TIMEOUT_SECONDS =
|
|
129
|
+
SKYLET_GRPC_TIMEOUT_SECONDS = 10
|
|
102
130
|
|
|
103
131
|
# Docker default options
|
|
104
132
|
DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
|
|
@@ -150,7 +178,7 @@ CONDA_INSTALLATION_COMMANDS = (
|
|
|
150
178
|
# because for some images, conda is already installed, but not initialized.
|
|
151
179
|
# In this case, we need to initialize conda and set auto_activate_base to
|
|
152
180
|
# true.
|
|
153
|
-
'{ bash Miniconda3-Linux.sh -b; '
|
|
181
|
+
'{ bash Miniconda3-Linux.sh -b || true; '
|
|
154
182
|
'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
|
|
155
183
|
# Caller should replace {conda_auto_activate} with either true or false.
|
|
156
184
|
'conda config --set auto_activate_base {conda_auto_activate} && '
|
|
@@ -172,7 +200,7 @@ CONDA_INSTALLATION_COMMANDS = (
|
|
|
172
200
|
'fi;'
|
|
173
201
|
# Install uv for venv management and pip installation.
|
|
174
202
|
f'{SKY_UV_INSTALL_CMD};'
|
|
175
|
-
# Create a separate
|
|
203
|
+
# Create a separate python environment for SkyPilot dependencies.
|
|
176
204
|
f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
|
|
177
205
|
# Do NOT use --system-site-packages here, because if users upgrade any
|
|
178
206
|
# packages in the base env, they interfere with skypilot dependencies.
|
|
@@ -217,7 +245,9 @@ RAY_INSTALLATION_COMMANDS = (
|
|
|
217
245
|
f'{SKY_UV_PIP_CMD} list | grep "ray " | '
|
|
218
246
|
f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
|
|
219
247
|
f'|| {RAY_STATUS} || '
|
|
220
|
-
|
|
248
|
+
# The pydantic-core==2.41.3 for arm seems corrupted
|
|
249
|
+
# so we need to avoid that specific version.
|
|
250
|
+
f'{SKY_UV_PIP_CMD} install -U "ray[default]=={SKY_REMOTE_RAY_VERSION}" "pydantic-core==2.41.1"; ' # pylint: disable=line-too-long
|
|
221
251
|
# In some envs, e.g. pip does not have permission to write under /opt/conda
|
|
222
252
|
# ray package will be installed under ~/.local/bin. If the user's PATH does
|
|
223
253
|
# not include ~/.local/bin (the pip install will have the output: `WARNING:
|
|
@@ -229,9 +259,24 @@ RAY_INSTALLATION_COMMANDS = (
|
|
|
229
259
|
'export PATH=$PATH:$HOME/.local/bin; '
|
|
230
260
|
# Writes ray path to file if it does not exist or the file is empty.
|
|
231
261
|
f'[ -s {SKY_RAY_PATH_FILE} ] || '
|
|
232
|
-
f'{{ {
|
|
262
|
+
f'{{ {SKY_UV_RUN_CMD} '
|
|
233
263
|
f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
|
|
234
264
|
|
|
265
|
+
# Copy SkyPilot templates from the installed wheel to ~/sky_templates.
|
|
266
|
+
# This must run after the skypilot wheel is installed.
|
|
267
|
+
COPY_SKYPILOT_TEMPLATES_COMMANDS = (
|
|
268
|
+
f'{ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
|
|
269
|
+
f'{SKY_PYTHON_CMD} -c \''
|
|
270
|
+
'import sky_templates, shutil, os; '
|
|
271
|
+
'src = os.path.dirname(sky_templates.__file__); '
|
|
272
|
+
'dst = os.path.expanduser(\"~/sky_templates\"); '
|
|
273
|
+
'print(f\"Copying templates from {src} to {dst}...\"); '
|
|
274
|
+
'shutil.copytree(src, dst, dirs_exist_ok=True); '
|
|
275
|
+
'print(f\"Templates copied successfully\")\'; '
|
|
276
|
+
# Make scripts executable.
|
|
277
|
+
'find ~/sky_templates -type f ! -name "*.py" ! -name "*.md" '
|
|
278
|
+
'-exec chmod +x {} \\; ')
|
|
279
|
+
|
|
235
280
|
SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
|
|
236
281
|
f'{SKY_UV_INSTALL_CMD};'
|
|
237
282
|
f'{{ {SKY_UV_PIP_CMD} list | grep "skypilot " && '
|
|
@@ -322,6 +367,14 @@ FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.sky/tmp/'
|
|
|
322
367
|
# controller_utils.translate_local_file_mounts_to_two_hop().
|
|
323
368
|
FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.sky/tmp/controller'
|
|
324
369
|
|
|
370
|
+
# For passing in CPU and memory limits to the controller pod when running
|
|
371
|
+
# in k8s. Right now, we only use this for the jobs controller, but we may
|
|
372
|
+
# use this for the serve controller as well in the future.
|
|
373
|
+
# These files are written to disk by the skylet, which reads the limits from env vars
|
|
374
|
+
# passed by the backend when starting the skylet (start_skylet_on_head_node).
|
|
375
|
+
CONTROLLER_K8S_CPU_FILE = '~/.sky/_internal_k8s_pod_cpu'
|
|
376
|
+
CONTROLLER_K8S_MEMORY_FILE = '~/.sky/_internal_k8s_pod_memory'
|
|
377
|
+
|
|
325
378
|
# Used when managed jobs are created and
|
|
326
379
|
# files are synced up to the cloud.
|
|
327
380
|
FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
|
|
@@ -353,6 +406,8 @@ SERVICE_ACCOUNT_TOKEN_ENV_VAR = (
|
|
|
353
406
|
# SkyPilot environment variables
|
|
354
407
|
SKYPILOT_NUM_NODES = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_NODES'
|
|
355
408
|
SKYPILOT_NODE_IPS = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_IPS'
|
|
409
|
+
SKYPILOT_SETUP_NUM_GPUS_PER_NODE = (
|
|
410
|
+
f'{SKYPILOT_ENV_VAR_PREFIX}SETUP_NUM_GPUS_PER_NODE')
|
|
356
411
|
SKYPILOT_NUM_GPUS_PER_NODE = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_GPUS_PER_NODE'
|
|
357
412
|
SKYPILOT_NODE_RANK = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_RANK'
|
|
358
413
|
|
|
@@ -371,7 +426,9 @@ RCLONE_CACHE_REFRESH_INTERVAL = 10
|
|
|
371
426
|
OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
|
|
372
427
|
('docker', 'run_options'),
|
|
373
428
|
('nvidia_gpus', 'disable_ecc'),
|
|
429
|
+
('ssh', 'custom_metadata'),
|
|
374
430
|
('ssh', 'pod_config'),
|
|
431
|
+
('ssh', 'provision_timeout'),
|
|
375
432
|
('kubernetes', 'custom_metadata'),
|
|
376
433
|
('kubernetes', 'pod_config'),
|
|
377
434
|
('kubernetes', 'provision_timeout'),
|
|
@@ -381,13 +438,31 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
|
|
|
381
438
|
('gcp', 'enable_gvnic'),
|
|
382
439
|
('gcp', 'enable_gpu_direct'),
|
|
383
440
|
('gcp', 'placement_policy'),
|
|
441
|
+
('active_workspace',),
|
|
384
442
|
]
|
|
385
443
|
# When overriding the SkyPilot configs on the API server with the client one,
|
|
386
444
|
# we skip the following keys because they are meant to be client-side configs.
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
445
|
+
# Also, we skip the consolidation mode config as those should be only set on
|
|
446
|
+
# the API server side.
|
|
447
|
+
SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
|
|
448
|
+
('api_server',),
|
|
449
|
+
('allowed_clouds',),
|
|
450
|
+
('workspaces',),
|
|
451
|
+
('db',),
|
|
452
|
+
('daemons',),
|
|
453
|
+
# TODO(kevin,tian): Override the whole controller config once our test
|
|
454
|
+
# infrastructure supports setting dynamic server side configs.
|
|
455
|
+
# Tests that are affected:
|
|
456
|
+
# - test_managed_jobs_ha_kill_starting
|
|
457
|
+
# - test_managed_jobs_ha_kill_running
|
|
458
|
+
# - all tests that use LOW_CONTROLLER_RESOURCE_ENV or
|
|
459
|
+
# LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG (won't cause test failure,
|
|
460
|
+
# but the configs won't be applied)
|
|
461
|
+
('jobs', 'controller', 'consolidation_mode'),
|
|
462
|
+
('serve', 'controller', 'consolidation_mode'),
|
|
463
|
+
('jobs', 'controller', 'controller_logs_gc_retention_hours'),
|
|
464
|
+
('jobs', 'controller', 'task_logs_gc_retention_hours'),
|
|
465
|
+
]
|
|
391
466
|
|
|
392
467
|
# Constants for Azure blob storage
|
|
393
468
|
WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
|
|
@@ -421,6 +496,11 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
|
|
|
421
496
|
# TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
|
|
422
497
|
# Environment variable that is set to 'true' if this is a skypilot server.
|
|
423
498
|
ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
|
|
499
|
+
OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
|
|
500
|
+
IS_SKYPILOT_SERVE_CONTROLLER = 'IS_SKYPILOT_SERVE_CONTROLLER'
|
|
501
|
+
|
|
502
|
+
SERVE_OVERRIDE_CONCURRENT_LAUNCHES = (
|
|
503
|
+
f'{SKYPILOT_ENV_VAR_PREFIX}SERVE_OVERRIDE_CONCURRENT_LAUNCHES')
|
|
424
504
|
|
|
425
505
|
# Environment variable that is set to 'true' if metrics are enabled.
|
|
426
506
|
ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
|
|
@@ -436,6 +516,7 @@ ENV_VAR_DB_CONNECTION_URI = (f'{SKYPILOT_ENV_VAR_PREFIX}DB_CONNECTION_URI')
|
|
|
436
516
|
# authentication is enabled in the API server.
|
|
437
517
|
ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
|
|
438
518
|
SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
|
|
519
|
+
SKYPILOT_INGRESS_BASIC_AUTH_ENABLED = 'SKYPILOT_INGRESS_BASIC_AUTH_ENABLED'
|
|
439
520
|
ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
|
|
440
521
|
|
|
441
522
|
# Enable debug logging for requests.
|
|
@@ -447,11 +528,12 @@ SKYPILOT_DEFAULT_WORKSPACE = 'default'
|
|
|
447
528
|
# BEGIN constants used for service catalog.
|
|
448
529
|
HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long
|
|
449
530
|
HOSTED_CATALOG_DIR_URL_S3_MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs' # pylint: disable=line-too-long
|
|
450
|
-
CATALOG_SCHEMA_VERSION = '
|
|
531
|
+
CATALOG_SCHEMA_VERSION = 'v8'
|
|
451
532
|
CATALOG_DIR = '~/.sky/catalogs'
|
|
452
533
|
ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
|
|
453
534
|
'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
|
|
454
|
-
'paperspace', 'do', 'nebius', 'ssh',
|
|
535
|
+
'paperspace', 'primeintellect', 'do', 'nebius', 'ssh',
|
|
536
|
+
'hyperbolic', 'seeweb', 'shadeform')
|
|
455
537
|
# END constants used for service catalog.
|
|
456
538
|
|
|
457
539
|
# The user ID of the SkyPilot system.
|
|
@@ -503,8 +585,11 @@ DEFAULT_PRIORITY = 0
|
|
|
503
585
|
GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
|
|
504
586
|
COST_REPORT_DEFAULT_DAYS = 30
|
|
505
587
|
|
|
506
|
-
# The directory for file locks.
|
|
507
|
-
SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
|
|
508
|
-
|
|
509
588
|
ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
|
|
510
589
|
'DEBUG_LOOP_LAG_THRESHOLD_MS')
|
|
590
|
+
|
|
591
|
+
ARM64_ARCH = 'arm64'
|
|
592
|
+
X86_64_ARCH = 'x86_64'
|
|
593
|
+
|
|
594
|
+
SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (
|
|
595
|
+
f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')
|
sky/skylet/events.py
CHANGED
|
@@ -11,7 +11,8 @@ import psutil
|
|
|
11
11
|
from sky import clouds
|
|
12
12
|
from sky import sky_logging
|
|
13
13
|
from sky.backends import cloud_vm_ray_backend
|
|
14
|
-
from sky.jobs import
|
|
14
|
+
from sky.jobs import constants as managed_job_constants
|
|
15
|
+
from sky.jobs import scheduler
|
|
15
16
|
from sky.jobs import state as managed_job_state
|
|
16
17
|
from sky.jobs import utils as managed_job_utils
|
|
17
18
|
from sky.serve import serve_utils
|
|
@@ -21,6 +22,7 @@ from sky.skylet import job_lib
|
|
|
21
22
|
from sky.usage import usage_lib
|
|
22
23
|
from sky.utils import cluster_utils
|
|
23
24
|
from sky.utils import registry
|
|
25
|
+
from sky.utils import subprocess_utils
|
|
24
26
|
from sky.utils import ux_utils
|
|
25
27
|
from sky.utils import yaml_utils
|
|
26
28
|
|
|
@@ -45,6 +47,9 @@ class SkyletEvent:
|
|
|
45
47
|
EVENT_CHECKING_INTERVAL_SECONDS))
|
|
46
48
|
self._n = 0
|
|
47
49
|
|
|
50
|
+
def start(self):
|
|
51
|
+
pass
|
|
52
|
+
|
|
48
53
|
def run(self):
|
|
49
54
|
self._n = (self._n + 1) % self._event_interval
|
|
50
55
|
if self._n % self._event_interval == 0:
|
|
@@ -73,18 +78,60 @@ class ManagedJobEvent(SkyletEvent):
|
|
|
73
78
|
"""Skylet event for updating and scheduling managed jobs."""
|
|
74
79
|
EVENT_INTERVAL_SECONDS = 300
|
|
75
80
|
|
|
81
|
+
def start(self):
|
|
82
|
+
cpus_env_var = os.environ.get('SKYPILOT_POD_CPU_CORE_LIMIT')
|
|
83
|
+
if cpus_env_var is not None:
|
|
84
|
+
with open(os.path.expanduser(constants.CONTROLLER_K8S_CPU_FILE),
|
|
85
|
+
'w',
|
|
86
|
+
encoding='utf-8') as f:
|
|
87
|
+
f.write(cpus_env_var)
|
|
88
|
+
memory_env_var = os.environ.get('SKYPILOT_POD_MEMORY_GB_LIMIT')
|
|
89
|
+
if memory_env_var is not None:
|
|
90
|
+
with open(os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE),
|
|
91
|
+
'w',
|
|
92
|
+
encoding='utf-8') as f:
|
|
93
|
+
f.write(memory_env_var)
|
|
94
|
+
|
|
76
95
|
def _run(self):
|
|
96
|
+
if not os.path.exists(
|
|
97
|
+
os.path.expanduser(
|
|
98
|
+
managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE)
|
|
99
|
+
) and not managed_job_utils.is_consolidation_mode():
|
|
100
|
+
# Note: since the skylet is started before the user setup (in
|
|
101
|
+
# jobs-controller.yaml.j2) runs, it's possible that we hit this
|
|
102
|
+
# before the indicator file is written. However, since we will wait
|
|
103
|
+
# EVENT_INTERVAL_SECONDS before the first run, this should be very
|
|
104
|
+
# unlikely.
|
|
105
|
+
logger.info('No jobs controller indicator file found.')
|
|
106
|
+
all_job_ids = managed_job_state.get_all_job_ids_by_name(None)
|
|
107
|
+
if not all_job_ids:
|
|
108
|
+
logger.info('No jobs running. Stopping controllers.')
|
|
109
|
+
# TODO(cooperc): Move this to a shared function also called by
|
|
110
|
+
# sdk.api_stop(). (#7229)
|
|
111
|
+
try:
|
|
112
|
+
records = scheduler.get_controller_process_records()
|
|
113
|
+
if records is not None:
|
|
114
|
+
for record in records:
|
|
115
|
+
if managed_job_utils.controller_process_alive(
|
|
116
|
+
record, quiet=False):
|
|
117
|
+
subprocess_utils.kill_children_processes(
|
|
118
|
+
parent_pids=[record.pid], force=True)
|
|
119
|
+
os.remove(
|
|
120
|
+
os.path.expanduser(
|
|
121
|
+
scheduler.JOB_CONTROLLER_PID_PATH))
|
|
122
|
+
except Exception as e: # pylint: disable=broad-except
|
|
123
|
+
# in case we get perm issues or something is messed up, just
|
|
124
|
+
# ignore it and assume the process is dead
|
|
125
|
+
logger.error(
|
|
126
|
+
f'Error looking at job controller pid file: {e}')
|
|
127
|
+
pass
|
|
128
|
+
logger.info(f'{len(all_job_ids)} jobs running. Assuming the '
|
|
129
|
+
'indicator file hasn\'t been written yet.')
|
|
130
|
+
return
|
|
131
|
+
|
|
77
132
|
logger.info('=== Updating managed job status ===')
|
|
78
133
|
managed_job_utils.update_managed_jobs_statuses()
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
class ManagedJobSchedulingEvent(SkyletEvent):
|
|
82
|
-
"""Skylet event for scheduling managed jobs."""
|
|
83
|
-
EVENT_INTERVAL_SECONDS = 20
|
|
84
|
-
|
|
85
|
-
def _run(self):
|
|
86
|
-
logger.info('=== Scheduling next jobs ===')
|
|
87
|
-
managed_job_scheduler.maybe_schedule_next_jobs()
|
|
134
|
+
scheduler.maybe_start_controllers()
|
|
88
135
|
|
|
89
136
|
|
|
90
137
|
class ServiceUpdateEvent(SkyletEvent):
|
|
@@ -275,8 +322,15 @@ class AutostopEvent(SkyletEvent):
|
|
|
275
322
|
cluster_name_on_cloud = cluster_config['cluster_name']
|
|
276
323
|
is_cluster_multinode = cluster_config['max_workers'] > 0
|
|
277
324
|
|
|
325
|
+
# Clear AWS credentials from environment to force boto3 to use IAM
|
|
326
|
+
# role attached to the instance (lowest priority in credential chain).
|
|
327
|
+
# This allows the cluster to stop/terminate itself using its IAM role.
|
|
278
328
|
os.environ.pop('AWS_ACCESS_KEY_ID', None)
|
|
279
329
|
os.environ.pop('AWS_SECRET_ACCESS_KEY', None)
|
|
330
|
+
os.environ.pop('AWS_SESSION_TOKEN', None)
|
|
331
|
+
# Point boto3 to /dev/null to skip reading credentials from files.
|
|
332
|
+
os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
|
|
333
|
+
os.environ['AWS_CONFIG_FILE'] = '/dev/null'
|
|
280
334
|
|
|
281
335
|
# Stop the ray autoscaler to avoid scaling up, during
|
|
282
336
|
# stopping/terminating of the cluster.
|