skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/setup_files/dependencies.py
CHANGED
|
@@ -48,9 +48,18 @@ install_requires = [
|
|
|
48
48
|
# (https://github.com/yaml/pyyaml/issues/601)
|
|
49
49
|
# <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
|
|
50
50
|
'pyyaml > 3.13, != 5.4.*',
|
|
51
|
+
'ijson',
|
|
52
|
+
'orjson',
|
|
51
53
|
'requests',
|
|
54
|
+
# SkyPilot inherits from uvicorn.Server to customize the behavior of
|
|
55
|
+
# uvicorn, so we need to pin uvicorn version to avoid potential break
|
|
56
|
+
# changes.
|
|
57
|
+
# Notes for current version check:
|
|
58
|
+
# - uvicorn 0.33.0 is the latest version that supports Python 3.8
|
|
59
|
+
# - uvicorn 0.36.0 removes setup_event_loop thus breaks SkyPilot's custom
|
|
60
|
+
# behavior.
|
|
61
|
+
'uvicorn[standard] >=0.33.0, <0.36.0',
|
|
52
62
|
'fastapi',
|
|
53
|
-
'uvicorn[standard]',
|
|
54
63
|
# Some pydantic versions are not compatible with ray. Adopted from ray's
|
|
55
64
|
# setup.py:
|
|
56
65
|
# https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L254
|
|
@@ -63,6 +72,8 @@ install_requires = [
|
|
|
63
72
|
'setproctitle',
|
|
64
73
|
'sqlalchemy',
|
|
65
74
|
'psycopg2-binary',
|
|
75
|
+
'aiosqlite',
|
|
76
|
+
'asyncpg',
|
|
66
77
|
# TODO(hailong): These three dependencies should be removed after we make
|
|
67
78
|
# the client-side actually not importing them.
|
|
68
79
|
'casbin',
|
|
@@ -70,13 +81,13 @@ install_requires = [
|
|
|
70
81
|
# Required for API server metrics
|
|
71
82
|
'prometheus_client>=0.8.0',
|
|
72
83
|
'passlib',
|
|
73
|
-
'bcrypt',
|
|
84
|
+
'bcrypt==4.0.1',
|
|
74
85
|
'pyjwt',
|
|
75
86
|
'gitpython',
|
|
87
|
+
'paramiko',
|
|
76
88
|
'types-paramiko',
|
|
77
89
|
'alembic',
|
|
78
90
|
'aiohttp',
|
|
79
|
-
'aiosqlite',
|
|
80
91
|
'anyio',
|
|
81
92
|
]
|
|
82
93
|
|
|
@@ -94,6 +105,10 @@ GRPC = 'grpcio>=1.63.0'
|
|
|
94
105
|
PROTOBUF = 'protobuf>=5.26.1, < 7.0.0'
|
|
95
106
|
|
|
96
107
|
server_dependencies = [
|
|
108
|
+
# TODO: Some of these dependencies are also specified in install_requires,
|
|
109
|
+
# so they are redundant here. We should figure out if they are only needed
|
|
110
|
+
# on the server (should remove from install_requires), or if they are needed
|
|
111
|
+
# on the client (should remove from here).
|
|
97
112
|
'casbin',
|
|
98
113
|
'sqlalchemy_adapter',
|
|
99
114
|
'passlib',
|
|
@@ -103,14 +118,16 @@ server_dependencies = [
|
|
|
103
118
|
GRPC,
|
|
104
119
|
PROTOBUF,
|
|
105
120
|
'aiosqlite',
|
|
121
|
+
'greenlet',
|
|
106
122
|
]
|
|
107
123
|
|
|
108
124
|
local_ray = [
|
|
109
125
|
# Lower version of ray will cause dependency conflict for
|
|
110
126
|
# click/grpcio/protobuf.
|
|
111
|
-
#
|
|
127
|
+
# Ray 2.6.1+ resolved cluster launcher bugs
|
|
128
|
+
# and grpcio issues on Apple Silicon.
|
|
112
129
|
# https://github.com/ray-project/ray/releases/tag/ray-2.6.1
|
|
113
|
-
'ray[default] >= 2.
|
|
130
|
+
'ray[default] >= 2.6.1',
|
|
114
131
|
]
|
|
115
132
|
|
|
116
133
|
remote = [
|
|
@@ -132,11 +149,19 @@ aws_dependencies = [
|
|
|
132
149
|
'colorama < 0.4.5',
|
|
133
150
|
]
|
|
134
151
|
|
|
152
|
+
# Kubernetes 32.0.0 has an authentication bug:
|
|
153
|
+
# https://github.com/kubernetes-client/python/issues/2333
|
|
154
|
+
kubernetes_dependencies = [
|
|
155
|
+
'kubernetes>=20.0.0,!=32.0.0',
|
|
156
|
+
'websockets',
|
|
157
|
+
'python-dateutil',
|
|
158
|
+
]
|
|
159
|
+
|
|
135
160
|
# azure-cli cannot be installed normally by uv, so we need to work around it in
|
|
136
161
|
# a few places.
|
|
137
162
|
AZURE_CLI = 'azure-cli>=2.65.0'
|
|
138
163
|
|
|
139
|
-
|
|
164
|
+
cloud_dependencies: Dict[str, List[str]] = {
|
|
140
165
|
'aws': aws_dependencies,
|
|
141
166
|
# TODO(zongheng): azure-cli is huge and takes a long time to install.
|
|
142
167
|
# Tracked in: https://github.com/Azure/azure-cli/issues/7387
|
|
@@ -172,20 +197,23 @@ extras_require: Dict[str, List[str]] = {
|
|
|
172
197
|
'docker': ['docker'] + local_ray,
|
|
173
198
|
'lambda': [], # No dependencies needed for lambda
|
|
174
199
|
'cloudflare': aws_dependencies,
|
|
200
|
+
'coreweave': aws_dependencies + kubernetes_dependencies,
|
|
175
201
|
'scp': local_ray,
|
|
176
202
|
'oci': ['oci'],
|
|
177
|
-
|
|
178
|
-
'
|
|
179
|
-
'kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'
|
|
180
|
-
],
|
|
181
|
-
'ssh': ['kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'],
|
|
182
|
-
'remote': remote,
|
|
203
|
+
'kubernetes': kubernetes_dependencies,
|
|
204
|
+
'ssh': kubernetes_dependencies,
|
|
183
205
|
# For the container registry auth api. Reference:
|
|
184
206
|
# https://github.com/runpod/runpod-python/releases/tag/1.6.1
|
|
185
|
-
|
|
207
|
+
# RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
|
|
208
|
+
# stdlib provides tomllib; on lower versions we depend on tomli explicitly.
|
|
209
|
+
# Instead of installing tomli conditionally, we install it explicitly.
|
|
210
|
+
# This is because the conditional installation of tomli does not work
|
|
211
|
+
# with controller package installation code.
|
|
212
|
+
'runpod': ['runpod>=1.6.1', 'tomli'],
|
|
186
213
|
'fluidstack': [], # No dependencies needed for fluidstack
|
|
187
214
|
'cudo': ['cudo-compute>=0.1.10'],
|
|
188
215
|
'paperspace': [], # No dependencies needed for paperspace
|
|
216
|
+
'primeintellect': [], # No dependencies needed for primeintellect
|
|
189
217
|
'do': ['pydo>=0.3.0', 'azure-core>=1.24.0', 'azure-common'],
|
|
190
218
|
'vast': ['vastai-sdk>=0.1.12'],
|
|
191
219
|
'vsphere': [
|
|
@@ -198,19 +226,25 @@ extras_require: Dict[str, List[str]] = {
|
|
|
198
226
|
# 'vsphere-automation-sdk @ git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.1.0' pylint: disable=line-too-long
|
|
199
227
|
],
|
|
200
228
|
'nebius': [
|
|
201
|
-
|
|
229
|
+
# Nebius requires grpcio and protobuf, so we need to include
|
|
230
|
+
# our constraints here.
|
|
231
|
+
'nebius>=0.3.12',
|
|
232
|
+
GRPC,
|
|
233
|
+
PROTOBUF,
|
|
202
234
|
] + aws_dependencies,
|
|
203
235
|
'hyperbolic': [], # No dependencies needed for hyperbolic
|
|
204
|
-
'
|
|
236
|
+
'seeweb': ['ecsapi==0.4.0'],
|
|
237
|
+
'shadeform': [], # No dependencies needed for shadeform
|
|
238
|
+
'slurm': [], # No dependencies needed for slurm
|
|
205
239
|
}
|
|
206
240
|
|
|
207
241
|
# Calculate which clouds should be included in the [all] installation.
|
|
208
|
-
clouds_for_all = set(
|
|
209
|
-
clouds_for_all.remove('remote')
|
|
242
|
+
clouds_for_all = set(cloud_dependencies)
|
|
210
243
|
|
|
211
244
|
if sys.version_info < (3, 10):
|
|
212
245
|
# Nebius needs python3.10. If python 3.9 [all] will not install nebius
|
|
213
246
|
clouds_for_all.remove('nebius')
|
|
247
|
+
clouds_for_all.remove('seeweb')
|
|
214
248
|
|
|
215
249
|
if sys.version_info >= (3, 12):
|
|
216
250
|
# The version of ray we use does not work with >= 3.12, so avoid clouds
|
|
@@ -220,5 +254,16 @@ if sys.version_info >= (3, 12):
|
|
|
220
254
|
# TODO: Remove once https://github.com/vast-ai/vast-sdk/pull/6 is released
|
|
221
255
|
clouds_for_all.remove('vast')
|
|
222
256
|
|
|
223
|
-
|
|
224
|
-
|
|
257
|
+
cloud_extras = {
|
|
258
|
+
cloud: dependencies + server_dependencies
|
|
259
|
+
for cloud, dependencies in cloud_dependencies.items()
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
extras_require: Dict[str, List[str]] = {
|
|
263
|
+
# Include server_dependencies with each cloud.
|
|
264
|
+
**cloud_extras,
|
|
265
|
+
'all': list(set().union(*[cloud_extras[cloud] for cloud in clouds_for_all])
|
|
266
|
+
),
|
|
267
|
+
'remote': remote,
|
|
268
|
+
'server': server_dependencies,
|
|
269
|
+
}
|
sky/setup_files/setup.py
CHANGED
|
@@ -148,47 +148,47 @@ if os.path.exists(readme_filepath):
|
|
|
148
148
|
long_description = io.open(readme_filepath, 'r', encoding='utf-8').read()
|
|
149
149
|
long_description = parse_readme(long_description)
|
|
150
150
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
setuptools.setup(
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
)
|
|
151
|
+
if __name__ == '__main__':
|
|
152
|
+
atexit.register(revert_commit_hash)
|
|
153
|
+
replace_commit_hash()
|
|
154
|
+
setuptools.setup(
|
|
155
|
+
# NOTE: this affects the package.whl wheel name. When changing this (if
|
|
156
|
+
# ever), you must grep for '.whl' and change all corresponding wheel paths
|
|
157
|
+
# (templates/*.j2 and wheel_utils.py).
|
|
158
|
+
name='skypilot-nightly',
|
|
159
|
+
version=find_version(),
|
|
160
|
+
packages=setuptools.find_packages(),
|
|
161
|
+
author='SkyPilot Team',
|
|
162
|
+
license='Apache 2.0',
|
|
163
|
+
readme='README.md',
|
|
164
|
+
description='SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.',
|
|
165
|
+
long_description=long_description,
|
|
166
|
+
long_description_content_type='text/markdown',
|
|
167
|
+
setup_requires=['wheel'],
|
|
168
|
+
requires_python='>=3.7',
|
|
169
|
+
install_requires=dependencies['install_requires'],
|
|
170
|
+
extras_require=dependencies['extras_require'],
|
|
171
|
+
entry_points={
|
|
172
|
+
'console_scripts': ['sky = sky.cli:cli'],
|
|
173
|
+
},
|
|
174
|
+
include_package_data=True,
|
|
175
|
+
classifiers=[
|
|
176
|
+
'Programming Language :: Python :: 3.7',
|
|
177
|
+
'Programming Language :: Python :: 3.8',
|
|
178
|
+
'Programming Language :: Python :: 3.9',
|
|
179
|
+
'Programming Language :: Python :: 3.10',
|
|
180
|
+
'Programming Language :: Python :: 3.11',
|
|
181
|
+
'Programming Language :: Python :: 3.12',
|
|
182
|
+
'Programming Language :: Python :: 3.13',
|
|
183
|
+
'License :: OSI Approved :: Apache Software License',
|
|
184
|
+
'Operating System :: OS Independent',
|
|
185
|
+
'Topic :: Software Development :: Libraries :: Python Modules',
|
|
186
|
+
'Topic :: System :: Distributed Computing',
|
|
187
|
+
],
|
|
188
|
+
project_urls={
|
|
189
|
+
'Homepage': 'https://github.com/skypilot-org/skypilot',
|
|
190
|
+
'Issues': 'https://github.com/skypilot-org/skypilot/issues',
|
|
191
|
+
'Discussion': 'https://github.com/skypilot-org/skypilot/discussions',
|
|
192
|
+
'Documentation': 'https://docs.skypilot.co/',
|
|
193
|
+
},
|
|
194
|
+
)
|
sky/sky_logging.py
CHANGED
|
@@ -85,7 +85,7 @@ class EnvAwareHandler(rich_utils.RichSafeStreamHandler):
|
|
|
85
85
|
@level.setter
|
|
86
86
|
def level(self, level):
|
|
87
87
|
# pylint: disable=protected-access
|
|
88
|
-
self._level = logging._checkLevel(level)
|
|
88
|
+
self._level = logging._checkLevel(level) # type: ignore[attr-defined]
|
|
89
89
|
|
|
90
90
|
|
|
91
91
|
_root_logger = logging.getLogger('sky')
|
|
@@ -109,7 +109,6 @@ def _setup_logger():
|
|
|
109
109
|
global _default_handler
|
|
110
110
|
if _default_handler is None:
|
|
111
111
|
_default_handler = EnvAwareHandler(sys.stdout)
|
|
112
|
-
_default_handler.flush = sys.stdout.flush # type: ignore
|
|
113
112
|
if env_options.Options.SHOW_DEBUG_INFO.get():
|
|
114
113
|
_default_handler.setLevel(logging.DEBUG)
|
|
115
114
|
else:
|
|
@@ -129,7 +128,6 @@ def _setup_logger():
|
|
|
129
128
|
for logger_name in _SENSITIVE_LOGGER:
|
|
130
129
|
logger = logging.getLogger(logger_name)
|
|
131
130
|
handler_to_logger = EnvAwareHandler(sys.stdout, sensitive=True)
|
|
132
|
-
handler_to_logger.flush = sys.stdout.flush # type: ignore
|
|
133
131
|
logger.addHandler(handler_to_logger)
|
|
134
132
|
logger.setLevel(logging.INFO)
|
|
135
133
|
if _show_logging_prefix():
|
|
@@ -148,7 +146,8 @@ def reload_logger():
|
|
|
148
146
|
such as SKYPILOT_DEBUG.
|
|
149
147
|
"""
|
|
150
148
|
global _default_handler
|
|
151
|
-
|
|
149
|
+
if _default_handler is not None:
|
|
150
|
+
_root_logger.removeHandler(_default_handler)
|
|
152
151
|
_default_handler = None
|
|
153
152
|
_setup_logger()
|
|
154
153
|
|
|
@@ -212,12 +211,21 @@ def logging_enabled(logger: logging.Logger, level: int) -> bool:
|
|
|
212
211
|
|
|
213
212
|
|
|
214
213
|
@contextlib.contextmanager
|
|
215
|
-
def silent():
|
|
214
|
+
def silent(should_silence: bool = True):
|
|
216
215
|
"""Make all sky_logging.print() and logger.{info, warning...} silent.
|
|
217
216
|
|
|
218
217
|
We preserve the ERROR level logging, so that errors are
|
|
219
218
|
still printed.
|
|
219
|
+
|
|
220
|
+
Args:
|
|
221
|
+
should_silence: Whether to actually suppress the logging. If False, this
|
|
222
|
+
is a no-op context manager. Provided for convenience when we want to
|
|
223
|
+
suppress logging conditionally.
|
|
220
224
|
"""
|
|
225
|
+
if not should_silence:
|
|
226
|
+
yield
|
|
227
|
+
return
|
|
228
|
+
|
|
221
229
|
global print
|
|
222
230
|
previous_level = _root_logger.level
|
|
223
231
|
previous_is_silent = is_silent()
|
sky/skylet/attempt_skylet.py
CHANGED
|
@@ -1,51 +1,143 @@
|
|
|
1
1
|
"""Restarts skylet if version does not match"""
|
|
2
2
|
|
|
3
3
|
import os
|
|
4
|
+
import signal
|
|
4
5
|
import subprocess
|
|
6
|
+
from typing import List, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
import psutil
|
|
5
9
|
|
|
6
10
|
from sky.skylet import constants
|
|
11
|
+
from sky.skylet import runtime_utils
|
|
12
|
+
from sky.utils import common_utils
|
|
13
|
+
|
|
14
|
+
VERSION_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_VERSION_FILE)
|
|
15
|
+
SKYLET_LOG_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_LOG_FILE)
|
|
16
|
+
PID_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_PID_FILE)
|
|
17
|
+
PORT_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_PORT_FILE)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _is_running_skylet_process(pid: int) -> bool:
    """Report whether ``pid`` is a live process started as skylet.

    Args:
        pid: Process ID to inspect. Non-positive values are rejected
            without querying the process table.

    Returns:
        True only when the process is running and at least one of its
        command-line arguments contains ``sky.skylet.skylet``. False in
        every other case, including when the process cannot be inspected.
    """
    if pid <= 0:
        return False
    try:
        candidate = psutil.Process(pid)
        if candidate.is_running():
            # The skylet module identifier must show up in some argument.
            for argument in candidate.cmdline():
                if 'sky.skylet.skylet' in argument:
                    return True
        return False
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess,
            OSError) as e:
        print(f'Error checking if skylet process {pid} is running: {e}')
        return False
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _find_running_skylet_pids() -> List[int]:
    """Return the PIDs of the skylet processes that are currently running.

    When the PID file exists it is the only source consulted: the result is
    the recorded PID if it belongs to a running skylet, and an empty list
    otherwise (including when the file cannot be read or parsed). Without a
    PID file, processes are discovered by grepping ``ps aux`` output.

    Returns:
        A possibly empty list of process IDs.
    """
    if not os.path.exists(PID_FILE):
        # Fall back to grep-based detection for backward compatibility.
        # We grep for " -m" instead of "{constants.SKY_PYTHON_CMD} -m"
        # because we need to stay compatible with old skylets started before
        # #3326, which do not use the full path to python.
        ps_result = subprocess.run(
            'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | grep " -m"',
            shell=True,
            check=False,
            capture_output=True,
            text=True)
        found: List[int] = []
        if ps_result.returncode != 0:
            return found
        for row in ps_result.stdout.strip().split('\n'):
            columns = row.split()
            # The PID is taken from the second column; rows that are empty
            # or too short are skipped.
            if len(columns) < 2:
                continue
            try:
                found.append(int(columns[1]))
            except ValueError:
                continue
        return found

    try:
        with open(PID_FILE, 'r', encoding='utf-8') as pid_file:
            recorded_pid = int(pid_file.read().strip())
        if _is_running_skylet_process(recorded_pid):
            return [recorded_pid]
    except (OSError, ValueError, IOError) as e:
        # Don't fall back to grep-based detection: the existence of the PID
        # file implies that we are on the new version, where multiple skylet
        # processes may be running, and we don't want to accidentally kill
        # the wrong skylet(s).
        print(f'Error reading PID file {PID_FILE}: {e}')
    return []
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _check_version_match() -> Tuple[bool, Optional[str]]:
    """Compare the recorded skylet version against the current one.

    Returns:
        A ``(version_match, version)`` pair. ``version`` is the stripped
        content of the version file, or None when the file is missing or
        cannot be read. ``version_match`` is True only when ``version``
        equals ``constants.SKYLET_VERSION``.
    """
    if not os.path.exists(VERSION_FILE):
        return False, None
    try:
        with open(VERSION_FILE, 'r', encoding='utf-8') as version_file:
            recorded = version_file.read().strip()
    except (OSError, IOError):
        # An unreadable version file is treated the same as a missing one.
        return False, None
    return recorded == constants.SKYLET_VERSION, recorded
|
|
9
90
|
|
|
10
91
|
|
|
11
92
|
def restart_skylet():
    """Kill any running skylet and launch a fresh one in the background.

    Steps, in order:
      1. SIGKILL every PID reported by ``_find_running_skylet_pids`` and
         wait up to 5 seconds for each to terminate.
      2. Remove the PID file (best effort).
      3. Obtain a port from ``common_utils.find_free_port``.
      4. Start skylet under ``nohup`` with ``--port``, appending its output
         to the skylet log file and recording the background PID in the PID
         file.
      5. Write the chosen port and ``constants.SKYLET_VERSION`` to their
         respective files.

    Raises:
        subprocess.CalledProcessError: If the launch command exits non-zero.
        OSError: If the port file or the version file cannot be written.
    """
    # pylint: disable=import-outside-toplevel
    import shlex

    # Kills old skylet if it is running.
    # TODO(zhwu): make the killing graceful, e.g., use a signal to tell
    # skylet to exit, instead of directly killing it.

    # Find and kill running skylet processes
    for pid in _find_running_skylet_pids():
        try:
            os.kill(pid, signal.SIGKILL)
            # Wait until process fully terminates so its socket gets released.
            # Without this, find_free_port may race with the kernel closing the
            # socket and fail to bind to the port that's supposed to be free.
            psutil.Process(pid).wait(timeout=5)
        except (OSError, ProcessLookupError, psutil.NoSuchProcess,
                psutil.TimeoutExpired):
            # Process died between detection and kill, or timeout waiting
            pass
    # Clean up the PID file
    try:
        os.remove(PID_FILE)
    except OSError:
        pass  # Best effort cleanup

    # TODO(kevin): Handle race conditions here. Race conditions can only
    # happen on Slurm, where there could be multiple clusters running in
    # one network namespace. For other clouds, the behaviour will be that
    # it always gets port 46590 (default port).
    port = common_utils.find_free_port(constants.SKYLET_GRPC_PORT)

    # The paths are spliced into a shell command line, so quote them: a
    # runtime directory containing spaces or shell metacharacters would
    # otherwise break the redirections. SKY_PYTHON_CMD is deliberately left
    # unquoted because it is itself a shell command snippet.
    log_file_arg = shlex.quote(os.fspath(SKYLET_LOG_FILE))
    pid_file_arg = shlex.quote(os.fspath(PID_FILE))
    subprocess.run(
        # We have made sure that `attempt_skylet.py` is executed with the
        # skypilot runtime env activated, so that skylet can access the cloud
        # CLI tools.
        f'nohup {constants.SKY_PYTHON_CMD} -m sky.skylet.skylet '
        f'--port={port} '
        f'>> {log_file_arg} 2>&1 & echo $! > {pid_file_arg}',
        shell=True,
        check=True)

    with open(PORT_FILE, 'w', encoding='utf-8') as pf:
        pf.write(str(port))

    with open(VERSION_FILE, 'w', encoding='utf-8') as v_f:
        v_f.write(constants.SKYLET_VERSION)
|
|
33
135
|
|
|
34
136
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
shell=True,
|
|
38
|
-
check=False)
|
|
39
|
-
|
|
40
|
-
running = (proc.returncode == 0)
|
|
137
|
+
# Check if our skylet is running
|
|
138
|
+
running = bool(_find_running_skylet_pids())
|
|
41
139
|
|
|
42
|
-
version_match =
|
|
43
|
-
found_version = None
|
|
44
|
-
if os.path.exists(VERSION_FILE):
|
|
45
|
-
with open(VERSION_FILE, 'r', encoding='utf-8') as f:
|
|
46
|
-
found_version = f.read().strip()
|
|
47
|
-
if found_version == constants.SKYLET_VERSION:
|
|
48
|
-
version_match = True
|
|
140
|
+
version_match, found_version = _check_version_match()
|
|
49
141
|
|
|
50
142
|
version_string = (f' (found version {found_version}, new version '
|
|
51
143
|
f'{constants.SKYLET_VERSION})')
|
sky/skylet/configs.py
CHANGED
|
@@ -5,6 +5,7 @@ import pathlib
|
|
|
5
5
|
import threading
|
|
6
6
|
from typing import Callable, Optional, Union
|
|
7
7
|
|
|
8
|
+
from sky.skylet import runtime_utils
|
|
8
9
|
from sky.utils.db import db_utils
|
|
9
10
|
|
|
10
11
|
_DB_PATH = None
|
|
@@ -29,7 +30,8 @@ def init_db(func: Callable):
|
|
|
29
30
|
|
|
30
31
|
with _db_init_lock:
|
|
31
32
|
if _DB_PATH is None:
|
|
32
|
-
_DB_PATH =
|
|
33
|
+
_DB_PATH = runtime_utils.get_runtime_dir_path(
|
|
34
|
+
'.sky/skylet_config.db')
|
|
33
35
|
os.makedirs(pathlib.Path(_DB_PATH).parents[0], exist_ok=True)
|
|
34
36
|
with db_utils.safe_cursor(
|
|
35
37
|
_DB_PATH
|