skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,119 @@
|
|
1
|
+
"""This module provides functions to generate GraphQL mutations for deploying
|
2
|
+
spot instance Pods on RunPod.
|
3
|
+
|
4
|
+
Reference:
|
5
|
+
https://github.com/runpod/runpod-python/blob/main/runpod/api/ctl_commands.py
|
6
|
+
|
7
|
+
Functions:
|
8
|
+
generate_spot_pod_deployment_mutation: Generates a GraphQL mutation string
|
9
|
+
for deploying a spot instance Pod on RunPod.
|
10
|
+
|
11
|
+
Example:
|
12
|
+
>>> mutation = generate_spot_pod_deployment_mutation(
|
13
|
+
name='test',
|
14
|
+
image_name='runpod/stack',
|
15
|
+
gpu_type_id='NVIDIA GeForce RTX 3070',
|
16
|
+
bid_per_gpu=0.3
|
17
|
+
)
|
18
|
+
"""
|
19
|
+
from typing import List, Optional
|
20
|
+
|
21
|
+
from sky.adaptors import runpod
|
22
|
+
from sky.provision.runpod.api.pods import generate_spot_pod_deployment_mutation
|
23
|
+
|
24
|
+
_INTERRUPTABLE_POD_FIELD: str = 'podRentInterruptable'
|
25
|
+
_RESPONSE_DATA_FIELD: str = 'data'
|
26
|
+
|
27
|
+
|
28
|
+
def create_spot_pod(
    name: str,
    image_name: str,
    gpu_type_id: str,
    bid_per_gpu: float,
    cloud_type: str = 'ALL',
    volume_mount_path: str = '/runpod-volume',
    gpu_count: Optional[int] = 1,
    min_memory_in_gb: Optional[int] = 1,
    min_vcpu_count: Optional[int] = 1,
    container_disk_in_gb: Optional[int] = None,
    volume_in_gb: Optional[int] = 0,
    ports: Optional[str] = None,
    start_ssh: Optional[bool] = True,
    start_jupyter: Optional[bool] = False,
    env: Optional[dict] = None,
    docker_args: Optional[str] = '',
    support_public_ip: Optional[bool] = True,
    terminate_after: Optional[str] = None,
    stop_after: Optional[str] = None,
    data_center_id: Optional[str] = None,
    country_code: Optional[str] = None,
    network_volume_id: Optional[str] = None,
    allowed_cuda_versions: Optional[List[str]] = None,
    min_download: Optional[int] = None,
    min_upload: Optional[int] = None,
    cuda_version: Optional[str] = None,
    template_id: Optional[str] = None,
    volume_key: Optional[str] = None,
) -> dict:
    """Rents an interruptable (spot) Pod on RunPod.

    Builds a ``podRentInterruptable`` GraphQL mutation with
    ``generate_spot_pod_deployment_mutation`` and submits it through the
    RunPod SDK's ``run_graphql_query``.

    Two arguments are adjusted before the mutation is built:

    * If ``network_volume_id`` is given and ``data_center_id`` is None, the
      data center is taken from the matching entry in the current user's
      ``networkVolumes``. If no entry matches, ``data_center_id`` stays
      None.
    * If both ``container_disk_in_gb`` and ``template_id`` are None,
      ``container_disk_in_gb`` defaults to 10.

    All other arguments are forwarded unchanged to
    ``generate_spot_pod_deployment_mutation``.

    Args:
        name: Name of the Pod.
        image_name: Container image to run.
        gpu_type_id: RunPod GPU type identifier, e.g.
            'NVIDIA GeForce RTX 3070'.
        bid_per_gpu: Bid price per GPU.
        cloud_type: One of 'ALL', 'COMMUNITY' or 'SECURE'.

    Returns:
        The ``podRentInterruptable`` entry of the response's ``data`` field.

    Raises:
        ValueError: If ``cloud_type`` is not one of the accepted values.

    Example:
        >>> pod = create_spot_pod(
                name='test',
                image_name='runpod/stack',
                gpu_type_id='NVIDIA GeForce RTX 3070',
                bid_per_gpu=0.3
            )
    """
    # The result is intentionally discarded; presumably this call is made
    # so that an unknown GPU type fails early -- TODO confirm against the
    # RunPod SDK.
    runpod.runpod.get_gpu(gpu_type_id)
    # refer to https://graphql-spec.runpod.io/#definition-CloudTypeEnum
    if cloud_type not in ['ALL', 'COMMUNITY', 'SECURE']:
        raise ValueError('cloud_type must be one of ALL, COMMUNITY or SECURE')

    # Derive the data center from the network volume when the caller did not
    # specify one explicitly.
    if network_volume_id and data_center_id is None:
        user_info = runpod.runpod.get_user()
        for network_volume in user_info['networkVolumes']:
            if network_volume['id'] == network_volume_id:
                data_center_id = network_volume['dataCenterId']
                break

    # Only apply the default disk size when no template is used; with a
    # template the field is simply omitted from the mutation.
    if container_disk_in_gb is None and template_id is None:
        container_disk_in_gb = 10

    mutation = generate_spot_pod_deployment_mutation(
        name=name,
        image_name=image_name,
        gpu_type_id=gpu_type_id,
        bid_per_gpu=bid_per_gpu,
        cloud_type=cloud_type,
        gpu_count=gpu_count,
        min_memory_in_gb=min_memory_in_gb,
        min_vcpu_count=min_vcpu_count,
        container_disk_in_gb=container_disk_in_gb,
        volume_in_gb=volume_in_gb,
        volume_mount_path=volume_mount_path,
        ports=ports,
        start_ssh=start_ssh,
        start_jupyter=start_jupyter,
        env=env,
        docker_args=docker_args,
        support_public_ip=support_public_ip,
        terminate_after=terminate_after,
        stop_after=stop_after,
        data_center_id=data_center_id,
        country_code=country_code,
        network_volume_id=network_volume_id,
        allowed_cuda_versions=allowed_cuda_versions,
        min_download=min_download,
        min_upload=min_upload,
        cuda_version=cuda_version,
        template_id=template_id,
        volume_key=volume_key,
    )
    response = runpod.runpod.api.graphql.run_graphql_query(mutation)
    return response[_RESPONSE_DATA_FIELD][_INTERRUPTABLE_POD_FIELD]
|
@@ -0,0 +1,142 @@
|
|
1
|
+
"""This module provides functions to generate GraphQL mutations for deploying
|
2
|
+
spot instance Pods on RunPod.
|
3
|
+
|
4
|
+
Reference:
|
5
|
+
https://github.com/runpod/runpod-python/blob/main/runpod/api/mutations/pods.py
|
6
|
+
|
7
|
+
Functions:
|
8
|
+
generate_spot_pod_deployment_mutation: Generates a GraphQL mutation string
|
9
|
+
for deploying a spot instance Pod on RunPod.
|
10
|
+
Example:
|
11
|
+
>>> mutation = generate_spot_pod_deployment_mutation(
|
12
|
+
name='test',
|
13
|
+
image_name='runpod/stack',
|
14
|
+
gpu_type_id='NVIDIA GeForce RTX 3070',
|
15
|
+
bid_per_gpu=0.3
|
16
|
+
)
|
17
|
+
"""
|
18
|
+
|
19
|
+
from typing import List, Optional
|
20
|
+
|
21
|
+
|
22
|
+
# refer to https://graphql-spec.runpod.io/#definition-PodRentInterruptableInput
|
23
|
+
def generate_spot_pod_deployment_mutation(
    name: str,
    image_name: str,
    gpu_type_id: str,
    bid_per_gpu: float,
    volume_mount_path: str,
    cloud_type: str = 'ALL',
    gpu_count: Optional[int] = None,
    min_memory_in_gb: Optional[int] = None,
    min_vcpu_count: Optional[int] = None,
    container_disk_in_gb: Optional[int] = None,
    volume_in_gb: Optional[int] = None,
    ports: Optional[str] = None,
    start_ssh: Optional[bool] = True,
    start_jupyter: Optional[bool] = False,
    env: Optional[dict] = None,
    docker_args: Optional[str] = None,
    support_public_ip: Optional[bool] = True,
    terminate_after: Optional[str] = None,
    stop_after: Optional[str] = None,
    data_center_id: Optional[str] = None,
    country_code: Optional[str] = None,
    network_volume_id: Optional[str] = None,
    allowed_cuda_versions: Optional[List[str]] = None,
    min_download: Optional[int] = None,
    min_upload: Optional[int] = None,
    cuda_version: Optional[str] = None,
    template_id: Optional[str] = None,
    volume_key: Optional[str] = None,
) -> str:
    """Builds the ``podRentInterruptable`` GraphQL mutation document.

    Field emission rules:

    * ``name``, ``imageName``, ``gpuTypeId``, ``bidPerGpu``,
      ``volumeMountPath``, ``cloudType`` and ``supportPublicIp`` are always
      emitted.
    * ``startSsh`` / ``startJupyter`` are emitted (as ``true``) only when
      the corresponding argument is truthy.
    * Every other field is emitted only when its argument is not None.
    * Spaces are stripped from ``ports``.

    String values are emitted as escaped GraphQL string literals, so a
    double quote, backslash or control character inside a value (for
    example in ``env`` or ``docker_args``) cannot terminate the literal
    early or corrupt the document. Callers must therefore pass raw,
    un-escaped values.

    Args:
        name: Name of the Pod.
        image_name: Container image to run.
        gpu_type_id: RunPod GPU type identifier.
        bid_per_gpu: Bid price per GPU; interpolated as a bare number.
        volume_mount_path: Mount path of the Pod volume.
        cloud_type: Interpolated as a bare GraphQL enum value (not quoted).
        env: Mapping rendered as a list of ``{ key: ..., value: ... }``
            objects; keys and values are converted with ``str``.

    Returns:
        The mutation document as a string.
    """
    # Function-scope import so this block does not rely on a file-level
    # `import json`.
    import json  # pylint: disable=import-outside-toplevel

    def _quote(value: object) -> str:
        # JSON string escapes (\" \\ \n \uXXXX ...) are also valid GraphQL
        # string escapes. ensure_ascii=False leaves non-ASCII characters
        # untouched, so values without special characters render exactly as
        # a plain '"{value}"' interpolation would.
        return json.dumps(str(value), ensure_ascii=False)

    # Required Fields
    input_fields = [
        f'name: {_quote(name)}',
        f'imageName: {_quote(image_name)}',
        f'gpuTypeId: {_quote(gpu_type_id)}',
        f'bidPerGpu: {bid_per_gpu}',
        f'volumeMountPath: {_quote(volume_mount_path)}',
    ]

    # Default Fields
    input_fields.append(f'cloudType: {cloud_type}')

    if start_ssh:
        input_fields.append('startSsh: true')
    if start_jupyter:
        input_fields.append('startJupyter: true')
    if support_public_ip:
        input_fields.append('supportPublicIp: true')
    else:
        input_fields.append('supportPublicIp: false')

    # Optional Fields
    if gpu_count is not None:
        input_fields.append(f'gpuCount: {gpu_count}')
    if min_memory_in_gb is not None:
        input_fields.append(f'minMemoryInGb: {min_memory_in_gb}')
    if min_vcpu_count is not None:
        input_fields.append(f'minVcpuCount: {min_vcpu_count}')
    if container_disk_in_gb is not None:
        input_fields.append(f'containerDiskInGb: {container_disk_in_gb}')
    if volume_in_gb is not None:
        input_fields.append(f'volumeInGb: {volume_in_gb}')
    if ports is not None:
        ports = ports.replace(' ', '')
        input_fields.append(f'ports: {_quote(ports)}')
    if docker_args is not None:
        input_fields.append(f'dockerArgs: {_quote(docker_args)}')
    if terminate_after is not None:
        input_fields.append(f'terminateAfter: {_quote(terminate_after)}')
    if stop_after is not None:
        input_fields.append(f'stopAfter: {_quote(stop_after)}')
    if data_center_id is not None:
        input_fields.append(f'dataCenterId: {_quote(data_center_id)}')
    if country_code is not None:
        input_fields.append(f'countryCode: {_quote(country_code)}')
    if network_volume_id is not None:
        input_fields.append(f'networkVolumeId: {_quote(network_volume_id)}')
    if allowed_cuda_versions is not None:
        allowed_cuda_versions_string = ', '.join(
            _quote(version) for version in allowed_cuda_versions)
        input_fields.append(
            f'allowedCudaVersions: [{allowed_cuda_versions_string}]')
    if min_download is not None:
        input_fields.append(f'minDownload: {min_download}')
    if min_upload is not None:
        input_fields.append(f'minUpload: {min_upload}')
    if cuda_version is not None:
        input_fields.append(f'cudaVersion: {_quote(cuda_version)}')
    if template_id is not None:
        input_fields.append(f'templateId: {_quote(template_id)}')
    if volume_key is not None:
        input_fields.append(f'volumeKey: {_quote(volume_key)}')

    if env is not None:
        env_string = ', '.join(
            f'{{ key: {_quote(key)}, value: {_quote(value)} }}'
            for key, value in env.items())
        input_fields.append(f'env: [{env_string}]')

    # Format input fields
    input_string = ', '.join(input_fields)
    return f"""
    mutation {{
      podRentInterruptable(
        input: {{
          {input_string}
        }}
      ) {{
        id
        desiredStatus
        imageName
        env
        machineId
        machine {{
          podHostId
        }}
      }}
    }}
    """
|
sky/provision/runpod/instance.py
CHANGED
@@ -3,24 +3,27 @@ import time
|
|
3
3
|
from typing import Any, Dict, List, Optional
|
4
4
|
|
5
5
|
from sky import sky_logging
|
6
|
-
from sky import status_lib
|
7
6
|
from sky.provision import common
|
8
7
|
from sky.provision.runpod import utils
|
9
8
|
from sky.utils import common_utils
|
9
|
+
from sky.utils import resources_utils
|
10
|
+
from sky.utils import status_lib
|
10
11
|
from sky.utils import ux_utils
|
11
12
|
|
12
13
|
POLL_INTERVAL = 5
|
14
|
+
QUERY_PORTS_TIMEOUT_SECONDS = 30
|
13
15
|
|
14
16
|
logger = sky_logging.init_logger(__name__)
|
15
17
|
|
16
18
|
|
17
19
|
def _filter_instances(cluster_name_on_cloud: str,
|
18
|
-
status_filters: Optional[List[str]]
|
20
|
+
status_filters: Optional[List[str]],
|
21
|
+
head_only: bool = False) -> Dict[str, Any]:
|
19
22
|
|
20
23
|
instances = utils.list_instances()
|
21
|
-
possible_names = [
|
22
|
-
|
23
|
-
|
24
|
+
possible_names = [f'{cluster_name_on_cloud}-head']
|
25
|
+
if not head_only:
|
26
|
+
possible_names.append(f'{cluster_name_on_cloud}-worker')
|
24
27
|
|
25
28
|
filtered_instances = {}
|
26
29
|
for instance_id, instance in instances.items():
|
@@ -80,10 +83,19 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
80
83
|
node_type = 'head' if head_instance_id is None else 'worker'
|
81
84
|
try:
|
82
85
|
instance_id = utils.launch(
|
83
|
-
|
86
|
+
cluster_name=cluster_name_on_cloud,
|
87
|
+
node_type=node_type,
|
84
88
|
instance_type=config.node_config['InstanceType'],
|
85
89
|
region=region,
|
86
|
-
disk_size=config.node_config['DiskSize']
|
90
|
+
disk_size=config.node_config['DiskSize'],
|
91
|
+
image_name=config.node_config['ImageId'],
|
92
|
+
ports=config.ports_to_open_on_launch,
|
93
|
+
public_key=config.node_config['PublicKey'],
|
94
|
+
preemptible=config.node_config['Preemptible'],
|
95
|
+
bid_per_gpu=config.node_config['BidPerGPU'],
|
96
|
+
docker_login_config=config.provider_config.get(
|
97
|
+
'docker_login_config'),
|
98
|
+
)
|
87
99
|
except Exception as e: # pylint: disable=broad-except
|
88
100
|
logger.warning(f'run_instances error: {e}')
|
89
101
|
raise
|
@@ -136,6 +148,8 @@ def terminate_instances(
|
|
136
148
|
"""See sky/provision/__init__.py"""
|
137
149
|
del provider_config # unused
|
138
150
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
151
|
+
template_name, registry_auth_id = utils.get_registry_auth_resources(
|
152
|
+
cluster_name_on_cloud)
|
139
153
|
for inst_id, inst in instances.items():
|
140
154
|
logger.debug(f'Terminating instance {inst_id}: {inst}')
|
141
155
|
if worker_only and inst['name'].endswith('-head'):
|
@@ -148,6 +162,10 @@ def terminate_instances(
|
|
148
162
|
f'Failed to terminate instance {inst_id}: '
|
149
163
|
f'{common_utils.format_exception(e, use_bracket=False)}'
|
150
164
|
) from e
|
165
|
+
if template_name is not None:
|
166
|
+
utils.delete_pod_template(template_name)
|
167
|
+
if registry_auth_id is not None:
|
168
|
+
utils.delete_register_auth(registry_auth_id)
|
151
169
|
|
152
170
|
|
153
171
|
def get_cluster_info(
|
@@ -205,6 +223,44 @@ def query_instances(
|
|
205
223
|
|
206
224
|
def cleanup_ports(
|
207
225
|
cluster_name_on_cloud: str,
|
226
|
+
ports: List[str],
|
208
227
|
provider_config: Optional[Dict[str, Any]] = None,
|
209
228
|
) -> None:
|
210
|
-
del cluster_name_on_cloud, provider_config
|
229
|
+
del cluster_name_on_cloud, ports, provider_config # Unused.
|
230
|
+
|
231
|
+
|
232
|
+
def query_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    head_ip: Optional[str] = None,
    provider_config: Optional[Dict[str, Any]] = None,
) -> Dict[int, List[common.Endpoint]]:
    """See sky/provision/__init__.py

    Polls the head instance once per second until every requested port
    shows up in its 'port2endpoint' mapping, or until
    QUERY_PORTS_TIMEOUT_SECONDS have elapsed.

    Returns:
        A mapping from port to a single-element list holding its
        ``common.SocketEndpoint``. The mapping is empty when no head
        instance is found, and contains only the ports that were ready when
        the wait timed out.
    """
    del head_ip, provider_config  # Unused.
    # RunPod ports sometimes take a while to be ready.
    # Use a monotonic clock so that wall-clock adjustments can neither cut
    # the wait short nor extend it.
    start_time = time.monotonic()
    ports_to_query = resources_utils.port_ranges_to_set(ports)
    while True:
        instances = _filter_instances(cluster_name_on_cloud,
                                      None,
                                      head_only=True)
        assert len(instances) <= 1
        # It is possible that the instance is terminated on console by
        # the user. In this case, the instance will not be found and we
        # should return an empty dict.
        if not instances:
            return {}
        head_inst = next(iter(instances.values()))
        ready_ports: Dict[int, List[common.Endpoint]] = {
            port: [common.SocketEndpoint(**endpoint)]
            for port, endpoint in head_inst['port2endpoint'].items()
            if port in ports_to_query
        }
        not_ready_ports = ports_to_query - set(ready_ports)
        if not not_ready_ports:
            return ready_ports
        if time.monotonic() - start_time > QUERY_PORTS_TIMEOUT_SECONDS:
            logger.warning(f'Querying ports {ports} timed out. Ports '
                           f'{not_ready_ports} are not ready.')
            return ready_ports
        time.sleep(1)
|