skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Shadeform provisioner."""
|
|
2
|
+
|
|
3
|
+
from sky.provision.shadeform.config import bootstrap_instances
|
|
4
|
+
from sky.provision.shadeform.instance import cleanup_ports
|
|
5
|
+
from sky.provision.shadeform.instance import get_cluster_info
|
|
6
|
+
from sky.provision.shadeform.instance import open_ports
|
|
7
|
+
from sky.provision.shadeform.instance import query_instances
|
|
8
|
+
from sky.provision.shadeform.instance import run_instances
|
|
9
|
+
from sky.provision.shadeform.instance import stop_instances
|
|
10
|
+
from sky.provision.shadeform.instance import terminate_instances
|
|
11
|
+
from sky.provision.shadeform.instance import wait_instances
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Shadeform configuration bootstrapping."""
|
|
2
|
+
|
|
3
|
+
from sky.provision import common
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Return the provision config unchanged.

    Shadeform requires no provider-side bootstrapping (no security
    groups, networks, or IAM to set up), so this is a pass-through.
    ``region`` and ``cluster_name`` exist only to satisfy the common
    provisioner interface.
    """
    del region, cluster_name  # interface-mandated, unused here
    return config
|
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
"""Shadeform instance provisioning."""
|
|
2
|
+
import time
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
from sky import sky_logging
|
|
8
|
+
from sky.provision import common
|
|
9
|
+
from sky.provision.shadeform import shadeform_utils
|
|
10
|
+
from sky.utils import status_lib
|
|
11
|
+
|
|
12
|
+
# Seconds to sleep between successive status polls while waiting for
# instances to become ready (see _wait_for_instances_ready).
POLL_INTERVAL = 10
# Maximum seconds to wait for instances to reach 'active' with SSH info.
INSTANCE_READY_TIMEOUT = 3600

logger = sky_logging.init_logger(__name__)

# Status mapping from Shadeform to SkyPilot
SHADEFORM_STATUS_MAP = {
    'creating': status_lib.ClusterStatus.INIT,
    'pending_provider': status_lib.ClusterStatus.INIT,
    'pending': status_lib.ClusterStatus.INIT,
    'active': status_lib.ClusterStatus.UP,
    # NOTE(review): 'deleted' is surfaced as STOPPED rather than being
    # dropped — confirm this is intended, since stop_instances() below
    # reports stopping as unsupported.
    'deleted': status_lib.ClusterStatus.STOPPED,
}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _get_cluster_instances(cluster_name_on_cloud: str) -> Dict[str, Any]:
    """Return {instance_id: instance} for nodes belonging to this cluster.

    Membership is decided purely by instance name: only
    '<cluster>-head' and '<cluster>-worker' are considered part of the
    cluster. On any listing/parsing failure an empty dict is returned
    (best-effort, with a warning).
    """
    wanted_names = {
        f'{cluster_name_on_cloud}-head',
        f'{cluster_name_on_cloud}-worker',
    }
    try:
        listing = shadeform_utils.get_instances()
        return {
            inst['id']: inst
            for inst in listing.get('instances', [])
            if inst.get('name') in wanted_names
        }
    except (ValueError, KeyError, requests.exceptions.RequestException) as e:
        logger.warning(f'Failed to get instances: {e}')
        return {}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
    """Return the ID of the head node, or None when no head is present.

    The head is identified by its name suffix ('-head'), matching the
    naming scheme used at creation time in run_instances().
    """
    head_ids = (iid for iid, info in instances.items()
                if info.get('name', '').endswith('-head'))
    return next(head_ids, None)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _wait_for_instances_ready(cluster_name_on_cloud: str,
                              expected_count: int,
                              timeout: int = INSTANCE_READY_TIMEOUT) -> bool:
    """Poll until ``expected_count`` instances are SSH-ready, or time out.

    An instance counts as ready when its status is 'active' and both its
    'ip' and 'ssh_port' fields are populated. Returns True once enough
    instances are ready, False if the timeout elapses first.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        cluster = _get_cluster_instances(cluster_name_on_cloud)
        ready = sum(1 for inst in cluster.values()
                    if inst.get('status') == 'active' and
                    inst.get('ip') is not None and
                    inst.get('ssh_port') is not None)

        logger.info(f'Waiting for instances to be ready: '
                    f'({ready}/{expected_count})')
        if ready >= expected_count:
            return True
        time.sleep(POLL_INTERVAL)
    return False
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Run instances for the given cluster.

    Creates Shadeform instances until ``config.count`` active nodes
    exist, naming the first node '<cluster>-head' and subsequent nodes
    '<cluster>-worker'. Blocks until every instance reports 'active'
    with SSH connectivity information.

    Args:
        region: Shadeform region to launch into.
        cluster_name: Display name (unused; the on-cloud name is used).
        cluster_name_on_cloud: Name prefix for instances on the cloud.
        config: Provision config carrying count, node_config and auth.

    Returns:
        A ProvisionRecord describing the (possibly pre-existing) cluster.

    Raises:
        RuntimeError: if active instances exist but none is a head node,
            or instances fail to become ready within the timeout.
    """
    del cluster_name  # unused - we use cluster_name_on_cloud
    logger.info(f'Running instances for cluster {cluster_name_on_cloud} '
                f'in region {region}')
    logger.debug(f'DEBUG: region type={type(region)}, value={region!r}')
    logger.debug(f'DEBUG: config node_config={config.node_config}')

    # Check existing instances
    existing_instances = _get_cluster_instances(cluster_name_on_cloud)
    head_instance_id = _get_head_instance_id(existing_instances)

    # Filter active instances
    active_instances = {
        iid: inst
        for iid, inst in existing_instances.items()
        if inst.get('status') == 'active'
    }

    current_count = len(active_instances)
    target_count = config.count

    logger.info(f'Current instances: {current_count}, target: {target_count}')

    if current_count >= target_count:
        if head_instance_id is None:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has no head node')
        logger.info(f'Cluster already has {current_count} instances, '
                    f'no need to start more')
        return common.ProvisionRecord(
            provider_name='shadeform',
            cluster_name=cluster_name_on_cloud,
            region=region,
            zone=None,  # Shadeform doesn't use separate zones
            head_instance_id=head_instance_id,
            resumed_instance_ids=[],
            created_instance_ids=[])

    # The parsed instance type and SSH key are identical for every node,
    # so compute them once instead of re-deriving inside the loop.
    cloud, instance_type = _parse_shadeform_instance_type(config.node_config)
    # Get SSH key ID for authentication - this is optional and may be None
    ssh_key_id = config.authentication_config.get('ssh_key_id')

    # Create new instances
    to_create = target_count - current_count
    created_instance_ids = []

    for _ in range(to_create):
        node_type = 'head' if head_instance_id is None else 'worker'
        instance_name = f'{cluster_name_on_cloud}-{node_type}'

        create_config = {
            'cloud': cloud,
            'region': region,
            'shade_instance_type': instance_type,
            'name': instance_name,
            'ssh_key_id': ssh_key_id
        }

        try:
            logger.info(f'Creating {node_type} instance: {instance_name}')
            response = shadeform_utils.create_instance(create_config)
            instance_id = response['id']
            created_instance_ids.append(instance_id)

            if head_instance_id is None:
                head_instance_id = instance_id

            logger.info(f'Created instance {instance_id} ({node_type})')

        except Exception as e:
            logger.error(f'Failed to create instance: {e}')
            # Best-effort rollback: clean up any instances created during
            # this call before propagating the original failure.
            for iid in created_instance_ids:
                try:
                    shadeform_utils.delete_instance(iid)
                except requests.exceptions.RequestException as cleanup_e:
                    logger.warning(
                        f'Failed to cleanup instance {iid}: {cleanup_e}')
            raise

    # Wait for all instances to be ready
    logger.info('Waiting for instances to become ready...')
    if not _wait_for_instances_ready(cluster_name_on_cloud, target_count):
        raise RuntimeError('Timed out waiting for instances to be ready')

    assert head_instance_id is not None, 'head_instance_id should not be None'

    return common.ProvisionRecord(
        provider_name='shadeform',
        cluster_name=cluster_name_on_cloud,
        region=region,
        # Fix: use None here, consistent with the early-return record
        # above — Shadeform has no separate zone concept. (Previously
        # this path set zone=region while the other set zone=None.)
        zone=None,
        head_instance_id=head_instance_id,
        resumed_instance_ids=[],
        created_instance_ids=created_instance_ids)


def _parse_shadeform_instance_type(
        node_config: Dict[str, Any]) -> Tuple[str, str]:
    """Split node_config['InstanceType'] into (cloud, shade_instance_type).

    The catalog encodes instance types as '{cloud}_{instance_type}'
    (e.g. 'massedcompute_A6000_basex2'); the first underscore-delimited
    token is the underlying cloud provider. The remaining portion is
    normalized to Shadeform's naming: hyphens become underscores, a
    trailing 'B' is dropped, and 'GBx' becomes 'Gx'.
    """
    assert 'InstanceType' in node_config, \
        'InstanceType must be present in node_config'
    instance_type_full = node_config['InstanceType']
    assert (isinstance(instance_type_full, str) and
            '_' in instance_type_full), \
        f'InstanceType must be in format cloud_instance_type, got: ' \
        f'{instance_type_full}'

    instance_type_split = instance_type_full.split('_')
    assert len(instance_type_split) >= 2, \
        f'InstanceType must contain at least one underscore, got: ' \
        f'{instance_type_full}'

    cloud = instance_type_split[0]
    instance_type = '_'.join(instance_type_split[1:])

    # Shadeform uses underscores instead of hyphens
    instance_type = instance_type.replace('-', '_')
    if instance_type.endswith('B'):
        instance_type = instance_type[:-1]
    # Replace "GBx" with "Gx" (case sensitive)
    if 'GBx' in instance_type:
        instance_type = instance_type.replace('GBx', 'Gx')

    assert cloud, 'Cloud provider cannot be empty'
    assert instance_type, 'Instance type cannot be empty'
    return cloud, instance_type
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional[status_lib.ClusterStatus]) -> None:
    """No-op wait hook for the provisioner interface.

    Shadeform instances are already polled to 'active' (with SSH info)
    inside run_instances(), so there is nothing further to wait on here.
    """
    del region, cluster_name_on_cloud, state  # unused
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def stop_instances(cluster_name_on_cloud: str,
                   provider_config: Optional[Dict[str, Any]] = None,
                   worker_only: bool = False) -> None:
    """Stop instances (not supported by Shadeform).

    Shadeform exposes no stop/resume lifecycle, so this always raises;
    callers must terminate instances instead.
    """
    del cluster_name_on_cloud, provider_config, worker_only  # unused
    raise NotImplementedError(
        'Stopping instances is not supported by Shadeform')
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def terminate_instances(cluster_name_on_cloud: str,
                        provider_config: Optional[Dict[str, Any]] = None,
                        worker_only: bool = False) -> None:
    """Delete the cluster's instances via the Shadeform API.

    When worker_only is set, the head node (instance name ending in
    '-head') is kept and only worker nodes are deleted. A failed delete
    of an individual instance is logged as a warning and skipped, so one
    bad instance does not block teardown of the rest.
    """
    del provider_config  # unused
    logger.info(f'Terminating instances for cluster {cluster_name_on_cloud}')

    instances = _get_cluster_instances(cluster_name_on_cloud)
    if not instances:
        logger.info(f'No instances found for cluster {cluster_name_on_cloud}')
        return

    # Select targets: everything, or only non-head nodes for worker_only.
    targets = [
        (iid, inst)
        for iid, inst in instances.items()
        if not (worker_only and inst.get('name', '').endswith('-head'))
    ]

    for instance_id, instance in targets:
        try:
            logger.info(
                f'Terminating instance {instance_id} ({instance.get("name")})')
            shadeform_utils.delete_instance(instance_id)
        except requests.exceptions.RequestException as e:
            logger.warning(f'Failed to terminate instance {instance_id}: {e}')
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def get_cluster_info(
        region: str,
        cluster_name_on_cloud: str,
        provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
    """Build a ClusterInfo snapshot for the given cluster.

    Returns an empty ClusterInfo when no instances belong to the
    cluster. Otherwise, every instance id maps to a single-element
    InstanceInfo list, and the SSH user is resolved from the head
    instance's record (falling back to the 'shadeform' default).
    """
    del region, provider_config  # unused
    instances = _get_cluster_instances(cluster_name_on_cloud)

    if not instances:
        # Nothing is provisioned under this cluster name.
        return common.ClusterInfo(instances={},
                                  head_instance_id=None,
                                  provider_name='shadeform')

    head_instance_id = _get_head_instance_id(instances)

    # ClusterInfo expects Dict[InstanceId, List[InstanceInfo]].
    cluster_instances = {
        instance_id: [
            common.InstanceInfo(
                instance_id=instance_id,
                internal_ip=instance.get('ip', ''),
                external_ip=instance.get('ip', ''),
                ssh_port=instance.get('ssh_port', 22),
                tags={},
            )
        ] for instance_id, instance in instances.items()
    }

    ssh_user = 'shadeform'  # default
    if head_instance_id is not None:
        head_record = instances.get(head_instance_id, {})
        ssh_user = head_record.get('ssh_user', 'shadeform')

    return common.ClusterInfo(instances=cluster_instances,
                              head_instance_id=head_instance_id,
                              provider_name='shadeform',
                              ssh_user=ssh_user)
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def query_instances(
    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
    """Map each instance id to its (SkyPilot status, reason) pair.

    Provider statuses are translated through SHADEFORM_STATUS_MAP;
    anything unrecognized defaults to INIT. With non_terminated_only,
    instances whose mapped status is STOPPED are omitted. The reason
    element of each pair is always None.
    """
    del cluster_name, provider_config  # unused
    instances = _get_cluster_instances(cluster_name_on_cloud)

    if not instances:
        return {}

    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
                              Optional[str]]] = {}
    for instance_id, instance in instances.items():
        raw_status = instance.get('status', 'unknown')
        sky_status = SHADEFORM_STATUS_MAP.get(raw_status,
                                              status_lib.ClusterStatus.INIT)
        skip = (non_terminated_only and
                sky_status == status_lib.ClusterStatus.STOPPED)
        if skip:
            continue
        statuses[instance_id] = (sky_status, None)

    return statuses
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def open_ports(cluster_name_on_cloud: str,
               ports: List[str],
               provider_config: Optional[Dict[str, Any]] = None) -> None:
    """Open ports on the cluster's instances.

    Shadeform exposes no firewall/port-management API, so this always
    raises.

    Raises:
        NotImplementedError: always, with an explanatory message
            (matching the style used by stop_instances, which previously
            raised a bare NotImplementedError here).
    """
    del cluster_name_on_cloud, ports, provider_config  # unused
    raise NotImplementedError(
        'Opening ports is not supported by Shadeform')
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def cleanup_ports(cluster_name_on_cloud: str,
                  ports: List[str],
                  provider_config: Optional[Dict[str, Any]] = None) -> None:
    """Clean up previously opened ports.

    No-op: Shadeform does not support dynamic port opening, so there is
    nothing to tear down here.
    """
    del cluster_name_on_cloud, ports, provider_config  # unused
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Shadeform API utilities."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from typing import Any, Dict
|
|
5
|
+
|
|
6
|
+
from sky.adaptors import common
|
|
7
|
+
|
|
8
|
+
# Lazy import to avoid dependency on external packages
|
|
9
|
+
requests = common.LazyImport('requests')
|
|
10
|
+
|
|
11
|
+
# Shadeform API configuration
|
|
12
|
+
SHADEFORM_API_BASE = 'https://api.shadeform.ai/v1'
|
|
13
|
+
SHADEFORM_API_KEY_PATH = '~/.shadeform/api_key'
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_api_key() -> str:
    """Read the Shadeform API key from its well-known key file.

    Returns:
        The API key, stripped of surrounding whitespace.

    Raises:
        FileNotFoundError: if the key file does not exist.
        ValueError: if the key file exists but is empty.
    """
    key_file = os.path.expanduser(SHADEFORM_API_KEY_PATH)
    if not os.path.exists(key_file):
        raise FileNotFoundError(
            f'Shadeform API key not found at {key_file}. '
            'Please save your API key to this file.')

    with open(key_file, 'r', encoding='utf-8') as f:
        key = f.read().strip()
    if not key:
        raise ValueError(f'Shadeform API key is empty in {key_file}')
    return key
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def make_request(method: str, endpoint: str, **kwargs) -> Any:
    """Make an authenticated request to the Shadeform API.

    Args:
        method: HTTP method, e.g. 'GET' or 'POST'.
        endpoint: API path, joined onto SHADEFORM_API_BASE.
        **kwargs: Forwarded to requests.request (e.g. json=, params=).
            A default timeout is applied unless the caller supplies one.

    Returns:
        The decoded JSON body, or {} when the response body is empty
        (e.g. delete operations return 200 with no content).

    Raises:
        requests.exceptions.HTTPError: on non-2xx responses.
    """
    url = f'{SHADEFORM_API_BASE}/{endpoint.lstrip("/")}'
    headers = {
        'X-API-KEY': get_api_key(),
        'Content-Type': 'application/json',
    }
    # requests has no default timeout, so a network stall would hang
    # forever; apply a default while still letting callers override it.
    kwargs.setdefault('timeout', 60)

    response = requests.request(method, url, headers=headers, **kwargs)
    response.raise_for_status()

    # Some APIs (like delete) return empty responses with just 200 status
    if response.text.strip():
        return response.json()
    # Return empty dict for empty responses (e.g. delete operations)
    return {}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_instances() -> Dict[str, Any]:
    """List every instance visible to the configured API key."""
    endpoint = '/instances'
    return make_request('GET', endpoint)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_instance_info(instance_id: str) -> Dict[str, Any]:
    """Fetch the detail record for a single instance."""
    endpoint = f'/instances/{instance_id}/info'
    return make_request('GET', endpoint)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def create_instance(config: Dict[str, Any]) -> Dict[str, Any]:
    """Launch a new instance from the given request body."""
    endpoint = '/instances/create'
    return make_request('POST', endpoint, json=config)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def delete_instance(instance_id: str) -> Dict[str, Any]:
    """Delete an instance.

    The Shadeform delete endpoint replies with an empty body and a 200
    status, so make_request returns {} on success.
    """
    endpoint = f'/instances/{instance_id}/delete'
    return make_request('POST', endpoint)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def get_ssh_keys() -> Dict[str, Any]:
    """List all SSH keys registered with the account."""
    endpoint = '/sshkeys'
    return make_request('GET', endpoint)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def add_ssh_key(name: str, public_key: str) -> Dict[str, Any]:
    """Register a new SSH public key under the given name."""
    payload = {'name': name, 'public_key': public_key}
    return make_request('POST', '/sshkeys/add', json=payload)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Slurm provisioner for SkyPilot."""
|
|
2
|
+
|
|
3
|
+
from sky.provision.slurm.config import bootstrap_instances
|
|
4
|
+
from sky.provision.slurm.instance import cleanup_ports
|
|
5
|
+
from sky.provision.slurm.instance import get_cluster_info
|
|
6
|
+
from sky.provision.slurm.instance import get_command_runners
|
|
7
|
+
from sky.provision.slurm.instance import open_ports
|
|
8
|
+
from sky.provision.slurm.instance import query_instances
|
|
9
|
+
from sky.provision.slurm.instance import run_instances
|
|
10
|
+
from sky.provision.slurm.instance import stop_instances
|
|
11
|
+
from sky.provision.slurm.instance import terminate_instances
|
|
12
|
+
from sky.provision.slurm.instance import wait_instances
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Slrum-specific configuration for the provisioner."""
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
from sky.provision import common
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Return the provision config unchanged.

    Slurm requires no cloud-side bootstrapping (no security groups,
    roles, etc.), so the incoming config passes straight through.
    """
    del region, cluster_name  # unused
    return config
|