skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,487 @@
|
|
1
|
+
"""Payloads for the Sky API requests.
|
2
|
+
|
3
|
+
TODO(zhwu): We can consider a better way to handle the default values of the
|
4
|
+
kwargs for the payloads, otherwise, we have to keep the default values in sync
|
5
|
+
with the backend functions. The benefit of having the default values in the
|
6
|
+
payloads is that a user can find the default values in the RESTful API docs.
|
7
|
+
"""
|
8
|
+
import getpass
|
9
|
+
import json
|
10
|
+
import os
|
11
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
12
|
+
|
13
|
+
import pydantic
|
14
|
+
|
15
|
+
from sky import serve
|
16
|
+
from sky import sky_logging
|
17
|
+
from sky import skypilot_config
|
18
|
+
from sky.server import common
|
19
|
+
from sky.skylet import constants
|
20
|
+
from sky.usage import constants as usage_constants
|
21
|
+
from sky.usage import usage_lib
|
22
|
+
from sky.utils import annotations
|
23
|
+
from sky.utils import common as common_lib
|
24
|
+
from sky.utils import common_utils
|
25
|
+
from sky.utils import registry
|
26
|
+
|
27
|
+
logger = sky_logging.init_logger(__name__)
|
28
|
+
|
29
|
+
|
30
|
+
@annotations.lru_cache(scope='global')
def request_body_env_vars() -> dict:
    """Build the environment variables attached to every request body.

    The result contains every process environment variable whose name starts
    with ``constants.SKYPILOT_ENV_VAR_PREFIX``, plus the user hash, the user
    name and the usage run id. The path to the SkyPilot config file is
    removed from the result.

    Returns:
        A dict mapping environment variable names to their values.
    """
    prefix = constants.SKYPILOT_ENV_VAR_PREFIX
    forwarded = {
        name: value
        for name, value in os.environ.items()
        if name.startswith(prefix)
    }
    forwarded[constants.USER_ID_ENV_VAR] = common_utils.get_user_hash()
    forwarded[constants.USER_ENV_VAR] = os.getenv(constants.USER_ENV_VAR,
                                                  getpass.getuser())
    run_id = usage_lib.messages.usage.run_id
    forwarded[usage_constants.USAGE_RUN_ID_ENV_VAR] = run_id
    # Remove the path to config file, as the config content is included in the
    # request body and will be merged with the config on the server side.
    forwarded.pop(skypilot_config.ENV_VAR_SKYPILOT_CONFIG, None)
    return forwarded
|
45
|
+
|
46
|
+
|
47
|
+
def get_override_skypilot_config_from_client() -> Dict[str, Any]:
    """Returns the client config that is sent to the server as overrides.

    The ``api_server`` section and every key listed in
    ``constants.SKIPPED_CLIENT_OVERRIDE_KEYS`` are removed from the returned
    config; removed keys that had a value are reported in a debug log.
    """
    client_config = skypilot_config.to_dict()
    # Remove the API server config, as we should not specify the SkyPilot
    # server endpoint on the server side. This avoids the warning below.
    client_config.pop_nested(('api_server',), default_value=None)

    dropped = {}
    for key_path in constants.SKIPPED_CLIENT_OVERRIDE_KEYS:
        popped = client_config.pop_nested(key_path, default_value=None)
        if popped is None:
            continue
        dropped['.'.join(key_path)] = popped

    if dropped:
        logger.debug(f'The following keys ({json.dumps(dropped)}) '
                     'are specified in the client SkyPilot config at '
                     f'{skypilot_config.loaded_config_path()!r}. '
                     'This will be ignored. If you want to specify it, '
                     'please modify it on server side or contact your '
                     'administrator.')
    return client_config
|
66
|
+
|
67
|
+
|
68
|
+
class RequestBody(pydantic.BaseModel):
    """The request body for the SkyPilot API.

    Besides endpoint-specific fields declared by subclasses, every request
    carries the client's environment variables, entrypoint information,
    whether a remote API server is used, and the client config overrides.
    """
    env_vars: Dict[str, str] = {}
    entrypoint: str = ''
    entrypoint_command: str = ''
    using_remote_api_server: bool = False
    override_skypilot_config: Optional[Dict[str, Any]] = {}

    def __init__(self, **data):
        """Fill in the common fields that the caller did not provide.

        Defaults are computed only for missing keys. A body that already
        carries these fields (e.g. one rebuilt from a received request) does
        not re-read the local config, environment or API server settings.
        """
        if 'env_vars' not in data:
            data['env_vars'] = request_body_env_vars()
        if 'entrypoint' not in data:
            usage_lib_entrypoint = usage_lib.messages.usage.entrypoint
            if usage_lib_entrypoint is None:
                usage_lib_entrypoint = ''
            data['entrypoint'] = usage_lib_entrypoint
        if 'entrypoint_command' not in data:
            data['entrypoint_command'] = (
                common_utils.get_pretty_entrypoint_cmd())
        if 'using_remote_api_server' not in data:
            data['using_remote_api_server'] = (
                not common.is_api_server_local())
        if 'override_skypilot_config' not in data:
            data['override_skypilot_config'] = (
                get_override_skypilot_config_from_client())
        super().__init__(**data)

    def to_kwargs(self) -> Dict[str, Any]:
        """Convert the request body to a kwargs dictionary on API server.

        This converts the request body into kwargs for the underlying SkyPilot
        backend's function. The common transport fields declared on this base
        class are dropped, leaving only the endpoint-specific fields.
        """
        kwargs = self.model_dump()
        for transport_field in ('env_vars', 'entrypoint', 'entrypoint_command',
                                'using_remote_api_server',
                                'override_skypilot_config'):
            kwargs.pop(transport_field)
        return kwargs

    @property
    def user_hash(self) -> Optional[str]:
        """The user hash stored in ``env_vars``, or None if absent."""
        return self.env_vars.get(constants.USER_ID_ENV_VAR)
|
108
|
+
|
109
|
+
|
110
|
+
class CheckBody(RequestBody):
    """Payload accepted by the check endpoint."""
    # Both fields are required; `clouds` may be passed explicitly as None.
    clouds: Optional[Tuple[str, ...]]
    verbose: bool
|
114
|
+
|
115
|
+
|
116
|
+
class ValidateBody(RequestBody):
    """Payload accepted by the validate endpoint."""
    # The DAG is transported as a string.
    dag: str
|
119
|
+
|
120
|
+
|
121
|
+
class OptimizeBody(RequestBody):
    """Payload accepted by the optimize endpoint."""
    dag: str
    minimize: common_lib.OptimizeTarget = common_lib.OptimizeTarget.COST

    def to_kwargs(self) -> Dict[str, Any]:
        """Return the kwargs with ``dag`` loaded from its YAML string."""
        # Import here to avoid requirement of the whole SkyPilot dependency on
        # local clients.
        # pylint: disable=import-outside-toplevel
        from sky.utils import dag_utils

        kwargs = super().to_kwargs()
        # We should not validate the dag here, as the file mounts are not
        # processed yet, but we need to validate the resources during the
        # optimization to make sure the resources are available.
        kwargs['dag'] = dag_utils.load_chain_dag_from_yaml_str(self.dag)
        return kwargs
|
140
|
+
|
141
|
+
|
142
|
+
class LaunchBody(RequestBody):
    """Payload accepted by the launch endpoint."""
    task: str
    cluster_name: str
    retry_until_up: bool = False
    idle_minutes_to_autostop: Optional[int] = None
    dryrun: bool = False
    down: bool = False
    backend: Optional[str] = None
    optimize_target: common_lib.OptimizeTarget = common_lib.OptimizeTarget.COST
    no_setup: bool = False
    clone_disk_from: Optional[str] = None
    fast: bool = False
    # Internal only:
    # pylint: disable=invalid-name
    quiet_optimizer: bool = False
    is_launched_by_jobs_controller: bool = False
    is_launched_by_sky_serve_controller: bool = False
    disable_controller_check: bool = False

    def to_kwargs(self) -> Dict[str, Any]:
        """Return kwargs with the task, backend and internal flags resolved.

        The task string is processed by
        ``common.process_mounts_in_task_on_api_server``, the backend name is
        turned into a backend instance (or None), and the internal flags are
        re-keyed with a leading underscore.
        """
        kwargs = super().to_kwargs()
        kwargs['task'] = common.process_mounts_in_task_on_api_server(
            self.task, self.env_vars, workdir_only=False)

        backend_cls = registry.BACKEND_REGISTRY.from_str(self.backend)
        kwargs['backend'] = None if backend_cls is None else backend_cls()

        for internal_flag in ('quiet_optimizer',
                              'is_launched_by_jobs_controller',
                              'is_launched_by_sky_serve_controller',
                              'disable_controller_check'):
            kwargs[f'_{internal_flag}'] = kwargs.pop(internal_flag)
        return kwargs
|
181
|
+
|
182
|
+
|
183
|
+
class ExecBody(RequestBody):
    """Payload accepted by the exec endpoint."""
    task: str
    cluster_name: str
    dryrun: bool = False
    down: bool = False
    backend: Optional[str] = None

    def to_kwargs(self) -> Dict[str, Any]:
        """Return kwargs with the task and backend resolved.

        The task string is processed with ``workdir_only=True`` and the
        backend name is turned into a backend instance (or None).
        """
        kwargs = super().to_kwargs()
        kwargs['task'] = common.process_mounts_in_task_on_api_server(
            self.task, self.env_vars, workdir_only=True)
        backend_cls = registry.BACKEND_REGISTRY.from_str(self.backend)
        kwargs['backend'] = None if backend_cls is None else backend_cls()
        return kwargs
|
202
|
+
|
203
|
+
|
204
|
+
class StopOrDownBody(RequestBody):
    """Payload naming a cluster, with an optional purge flag."""
    cluster_name: str
    purge: bool = False
|
207
|
+
|
208
|
+
|
209
|
+
class StatusBody(RequestBody):
    """Payload accepted by the status endpoint."""
    cluster_names: Optional[List[str]] = None
    refresh: common_lib.StatusRefreshMode = common_lib.StatusRefreshMode.NONE
    # Unlike the other `all_users` flags in this module, this one defaults to
    # True.
    all_users: bool = True
|
214
|
+
|
215
|
+
|
216
|
+
class StartBody(RequestBody):
    """Payload accepted by the start endpoint."""
    cluster_name: str
    idle_minutes_to_autostop: Optional[int] = None
    retry_until_up: bool = False
    down: bool = False
    force: bool = False
|
223
|
+
|
224
|
+
|
225
|
+
class AutostopBody(RequestBody):
    """Payload accepted by the autostop endpoint."""
    cluster_name: str
    idle_minutes: int
    down: bool = False
|
230
|
+
|
231
|
+
|
232
|
+
class QueueBody(RequestBody):
    """Payload accepted by the queue endpoint."""
    cluster_name: str
    skip_finished: bool = False
    all_users: bool = False
|
237
|
+
|
238
|
+
|
239
|
+
class CancelBody(RequestBody):
    """Payload accepted by the cancel endpoint."""
    cluster_name: str
    job_ids: Optional[List[int]]
    all: bool = False
    all_users: bool = False
    # Internal only. We cannot use prefix `_` because pydantic will not
    # include it in the request body.
    try_cancel_if_cluster_is_init: bool = False

    def to_kwargs(self) -> Dict[str, Any]:
        """Return kwargs with the internal flag re-keyed with a leading `_`."""
        kwargs = super().to_kwargs()
        internal_value = kwargs.pop('try_cancel_if_cluster_is_init')
        kwargs['_try_cancel_if_cluster_is_init'] = internal_value
        return kwargs
|
254
|
+
|
255
|
+
|
256
|
+
class ClusterNameBody(RequestBody):
    """Payload that carries only a cluster name."""
    cluster_name: str
|
259
|
+
|
260
|
+
|
261
|
+
class ClusterJobBody(RequestBody):
    """Payload accepted by the cluster job endpoint."""
    cluster_name: str
    # Required, but may be passed explicitly as None.
    job_id: Optional[int]
    follow: bool = True
    tail: int = 0
|
267
|
+
|
268
|
+
|
269
|
+
class ClusterJobsBody(RequestBody):
    """Payload accepted by the cluster jobs endpoint."""
    cluster_name: str
    # Job ids are typed as strings here.
    job_ids: Optional[List[str]]
|
273
|
+
|
274
|
+
|
275
|
+
class ClusterJobsDownloadLogsBody(RequestBody):
    """Payload accepted by the cluster jobs download logs endpoint."""
    cluster_name: str
    job_ids: Optional[List[str]]
    # Defaults to the SkyPilot logs directory constant.
    local_dir: str = constants.SKY_LOGS_DIRECTORY
|
280
|
+
|
281
|
+
|
282
|
+
class DownloadBody(RequestBody):
    """Payload accepted by the download endpoint."""
    folder_paths: List[str]
|
285
|
+
|
286
|
+
|
287
|
+
class StorageBody(RequestBody):
    """Payload accepted by the storage endpoint."""
    name: str
|
290
|
+
|
291
|
+
|
292
|
+
class EndpointsBody(RequestBody):
    """Payload accepted by the endpoints endpoint."""
    cluster: str
    # The port may be given as an int or a string; None is the default.
    port: Optional[Union[int, str]] = None
|
296
|
+
|
297
|
+
|
298
|
+
class ServeEndpointBody(RequestBody):
    """Payload accepted by the serve controller endpoint."""
    # The port may be given as an int or a string; None is the default.
    port: Optional[Union[int, str]] = None
|
301
|
+
|
302
|
+
|
303
|
+
class JobStatusBody(RequestBody):
    """Payload accepted by the job status endpoint."""
    cluster_name: str
    # Required, but may be passed explicitly as None.
    job_ids: Optional[List[int]]
|
307
|
+
|
308
|
+
|
309
|
+
class JobsLaunchBody(RequestBody):
    """Payload accepted by the jobs launch endpoint."""
    task: str
    name: Optional[str]

    def to_kwargs(self) -> Dict[str, Any]:
        """Return kwargs with ``task`` processed on the API server."""
        kwargs = super().to_kwargs()
        processed_task = common.process_mounts_in_task_on_api_server(
            self.task, self.env_vars, workdir_only=False)
        kwargs['task'] = processed_task
        return kwargs
|
319
|
+
|
320
|
+
|
321
|
+
class JobsQueueBody(RequestBody):
    """Payload accepted by the jobs queue endpoint."""
    refresh: bool = False
    skip_finished: bool = False
    all_users: bool = False
|
326
|
+
|
327
|
+
|
328
|
+
class JobsCancelBody(RequestBody):
    """Payload accepted by the jobs cancel endpoint."""
    # `name` and `job_ids` are required, but may be passed explicitly as None.
    name: Optional[str]
    job_ids: Optional[List[int]]
    all: bool = False
    all_users: bool = False
|
334
|
+
|
335
|
+
|
336
|
+
class JobsLogsBody(RequestBody):
    """Payload accepted by the jobs logs endpoint."""
    name: Optional[str] = None
    job_id: Optional[int] = None
    follow: bool = True
    controller: bool = False
    refresh: bool = False
|
343
|
+
|
344
|
+
|
345
|
+
class RequestCancelBody(RequestBody):
    """Payload accepted by the API request cancellation endpoint."""
    # Kill all requests if request_ids is None.
    request_ids: Optional[List[str]] = None
    user_id: Optional[str] = None
|
350
|
+
|
351
|
+
|
352
|
+
class RequestStatusBody(pydantic.BaseModel):
    """Payload accepted by the API request status endpoint.

    This model derives from ``pydantic.BaseModel`` directly, so it does not
    carry the common ``RequestBody`` fields.
    """
    request_ids: Optional[List[str]] = None
    all_status: bool = False
|
356
|
+
|
357
|
+
|
358
|
+
class ServeUpBody(RequestBody):
    """Payload accepted by the serve up endpoint."""
    task: str
    service_name: str

    def to_kwargs(self) -> Dict[str, Any]:
        """Return kwargs with ``task`` replaced by the DAG's single task.

        Raises:
            AssertionError: If the processed DAG does not contain exactly one
                task.
        """
        kwargs = super().to_kwargs()
        service_dag = common.process_mounts_in_task_on_api_server(
            self.task, self.env_vars, workdir_only=False)
        task_count = len(service_dag.tasks)
        assert task_count == 1, ('Must only specify one task in the DAG for '
                                 'a service.', service_dag)
        kwargs['task'] = service_dag.tasks[0]
        return kwargs
|
373
|
+
|
374
|
+
|
375
|
+
class ServeUpdateBody(RequestBody):
    """Payload accepted by the serve update endpoint."""
    task: str
    service_name: str
    mode: serve.UpdateMode

    def to_kwargs(self) -> Dict[str, Any]:
        """Return kwargs with ``task`` replaced by the DAG's single task.

        Raises:
            AssertionError: If the processed DAG does not contain exactly one
                task.
        """
        kwargs = super().to_kwargs()
        service_dag = common.process_mounts_in_task_on_api_server(
            self.task, self.env_vars, workdir_only=False)
        task_count = len(service_dag.tasks)
        assert task_count == 1, ('Must only specify one task in the DAG for '
                                 'a service.', service_dag)
        kwargs['task'] = service_dag.tasks[0]
        return kwargs
|
391
|
+
|
392
|
+
|
393
|
+
class ServeDownBody(RequestBody):
    """Payload accepted by the serve down endpoint."""
    # Required; accepts one name, a list of names, or an explicit None.
    service_names: Optional[Union[str, List[str]]]
    all: bool = False
    purge: bool = False
|
398
|
+
|
399
|
+
|
400
|
+
class ServeLogsBody(RequestBody):
    """Payload accepted by the serve logs endpoint."""
    service_name: str
    # The target may be given as a string or a `serve.ServiceComponent`.
    target: Union[str, serve.ServiceComponent]
    replica_id: Optional[int] = None
    follow: bool = True
|
406
|
+
|
407
|
+
|
408
|
+
class ServeStatusBody(RequestBody):
    """Payload accepted by the serve status endpoint."""
    # Required; accepts one name, a list of names, or an explicit None.
    service_names: Optional[Union[str, List[str]]]
|
411
|
+
|
412
|
+
|
413
|
+
class RealtimeGpuAvailabilityRequestBody(RequestBody):
    """Payload accepted by the realtime GPU availability endpoint."""
    # All three fields are required, but each may be passed explicitly as None.
    context: Optional[str]
    name_filter: Optional[str]
    quantity_filter: Optional[int]
|
418
|
+
|
419
|
+
|
420
|
+
class KubernetesNodeInfoRequestBody(RequestBody):
    """Payload accepted by the kubernetes node info endpoint."""
    context: Optional[str] = None
|
423
|
+
|
424
|
+
|
425
|
+
class ListAcceleratorsBody(RequestBody):
    """Payload accepted by the list accelerators endpoint."""
    gpus_only: bool = True
    name_filter: Optional[str] = None
    region_filter: Optional[str] = None
    quantity_filter: Optional[int] = None
    # Accepts either a list of cloud names or a single name.
    clouds: Optional[Union[List[str], str]] = None
    all_regions: bool = False
    require_price: bool = True
    case_sensitive: bool = True
|
435
|
+
|
436
|
+
|
437
|
+
class ListAcceleratorCountsBody(RequestBody):
    """Payload accepted by the list accelerator counts endpoint."""
    gpus_only: bool = True
    name_filter: Optional[str] = None
    region_filter: Optional[str] = None
    quantity_filter: Optional[int] = None
    # Accepts either a list of cloud names or a single name.
    clouds: Optional[Union[List[str], str]] = None
|
444
|
+
|
445
|
+
|
446
|
+
class LocalUpBody(RequestBody):
    """Payload accepted by the local up endpoint."""
    gpus: bool = True
    ips: Optional[List[str]] = None
    ssh_user: Optional[str] = None
    ssh_key: Optional[str] = None
    cleanup: bool = False
|
453
|
+
|
454
|
+
|
455
|
+
class ServeTerminateReplicaBody(RequestBody):
    """Payload accepted by the serve terminate replica endpoint."""
    service_name: str
    replica_id: int
    purge: bool = False
|
460
|
+
|
461
|
+
|
462
|
+
class KillRequestProcessesBody(RequestBody):
    """Payload accepted by the kill request processes endpoint."""
    request_ids: List[str]
|
465
|
+
|
466
|
+
|
467
|
+
class StreamBody(pydantic.BaseModel):
    """Payload accepted by the stream endpoint.

    This model derives from ``pydantic.BaseModel`` directly, so it does not
    carry the common ``RequestBody`` fields.
    """
    request_id: Optional[str] = None
    log_path: Optional[str] = None
    tail: Optional[int] = None
    plain_logs: bool = True
|
473
|
+
|
474
|
+
|
475
|
+
class JobsDownloadLogsBody(RequestBody):
    """Payload accepted by the jobs download logs endpoint."""
    # `name` and `job_id` are required, but may be passed explicitly as None.
    name: Optional[str]
    job_id: Optional[int]
    refresh: bool = False
    controller: bool = False
    # Defaults to the SkyPilot logs directory constant.
    local_dir: str = constants.SKY_LOGS_DIRECTORY
|
482
|
+
|
483
|
+
|
484
|
+
class UploadZipFileResponse(pydantic.BaseModel):
    """Response returned by the upload zip file endpoint."""
    status: str
    missing_chunks: Optional[List[str]] = None
|
File without changes
|
@@ -0,0 +1,76 @@
|
|
1
|
+
"""Shared queues for multiprocessing."""
|
2
|
+
from multiprocessing import managers
|
3
|
+
import queue
|
4
|
+
import time
|
5
|
+
from typing import List
|
6
|
+
|
7
|
+
from sky import sky_logging
|
8
|
+
|
9
|
+
logger = sky_logging.init_logger(__name__)
|
10
|
+
|
11
|
+
# The default port used by SkyPilot API server's request queue.
|
12
|
+
# We avoid 50010, as it might be taken by HDFS.
|
13
|
+
DEFAULT_QUEUE_MANAGER_PORT = 50011
|
14
|
+
|
15
|
+
|
16
|
+
# Have to create custom manager to handle different processes connecting to the
|
17
|
+
# same manager and getting the same queues.
|
18
|
+
class QueueManager(managers.BaseManager):
    """Manager subclass on which the shared request queues are registered."""
|
20
|
+
|
21
|
+
|
22
|
+
def start_queue_manager(queue_names: List[str],
                        port: int = DEFAULT_QUEUE_MANAGER_PORT) -> None:
    """Register one queue per name and serve them forever on localhost.

    Args:
        queue_names: Names under which the queues are registered on
            ``QueueManager``.
        port: Port on which the manager server listens.

    This call blocks: it ends in ``serve_forever()``.
    """

    # Each getter is built by a factory so that it closes over its own queue.
    # A bare ``lambda: q`` created inside the loop would capture the loop
    # variable by reference, so every registered name would end up returning
    # the last queue.
    def make_getter(shared_queue):

        def getter():
            return shared_queue

        return getter

    for queue_name in queue_names:
        shared_queue: queue.Queue = queue.Queue()
        QueueManager.register(queue_name, callable=make_getter(shared_queue))

    # Start long-running manager server.
    # Manager will set socket.SO_REUSEADDR, but BSD and Linux have different
    # behaviors on this option:
    # - BSD(e.g. MacOS): * (0.0.0.0) does not conflict with other addresses on
    #   the same port
    # - Linux: on the contrary, * conflicts with any other addresses
    # So on BSD systems, binding to * while the port is already bound to
    # localhost (127.0.0.1) will succeed, but the Manager won't actually be able
    # to accept connections on localhost.
    # For portability, we use the loopback address instead of *.
    queue_manager = QueueManager(address=('localhost', port),
                                 authkey=b'skypilot')
    queue_manager.get_server().serve_forever()
|
49
|
+
|
50
|
+
|
51
|
+
def get_queue(queue_name: str,
              port: int = DEFAULT_QUEUE_MANAGER_PORT) -> queue.Queue:
    """Connect to the queue manager on localhost and return the named queue.

    Args:
        queue_name: Name the queue was registered under.
        port: Port of the queue manager to connect to.

    Returns:
        The object the manager hands back for ``queue_name``.
    """
    QueueManager.register(queue_name)
    client = QueueManager(address=('localhost', port), authkey=b'skypilot')
    client.connect()
    queue_factory = getattr(client, queue_name)
    return queue_factory()
|
57
|
+
|
58
|
+
|
59
|
+
def wait_for_queues_to_be_ready(queue_names: List[str],
                                port: int = DEFAULT_QUEUE_MANAGER_PORT) -> None:
    """Wait for the queues to be ready after queue manager is just started.

    Every queue in ``queue_names`` is checked in order, retrying while the
    connection is refused. All queues share a single deadline measured from
    the start of this call. The caller's list is not modified.

    Args:
        queue_names: Names of the queues that must be reachable.
        port: Port of the queue manager to connect to.

    Raises:
        RuntimeError: If a queue is still refusing connections after the
            maximum wait time.
    """
    initial_time = time.time()
    max_wait_time = 5
    # Iterate over a snapshot so the caller's list is left untouched.
    for queue_name in list(queue_names):
        while True:
            try:
                get_queue(queue_name, port)
                break
            except ConnectionRefusedError as e:
                logger.info(f'Waiting for request queue, named {queue_name!r}, '
                            f'to be ready...')
                time.sleep(0.2)
                if time.time() - initial_time > max_wait_time:
                    raise RuntimeError(
                        f'Request queue, named {queue_name!r}, '
                        f'is not ready after {max_wait_time} seconds.') from e
|