skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
This release of skypilot-nightly has been flagged as potentially problematic.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/server/requests/serializers/decoders.py CHANGED

@@ -2,7 +2,7 @@
 import base64
 import pickle
 import typing
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, Union

 from sky import jobs as managed_jobs
 from sky import models
@@ -56,10 +56,10 @@ def decode_status(
         clusters = return_value
     response = []
     for cluster in clusters:
-        cluster['handle'] = decode_and_unpickle(cluster['handle'])
+        # handle may not always be present in the response.
+        if 'handle' in cluster and cluster['handle'] is not None:
+            cluster['handle'] = decode_and_unpickle(cluster['handle'])
         cluster['status'] = status_lib.ClusterStatus(cluster['status'])
-        cluster['storage_mounts_metadata'] = decode_and_unpickle(
-            cluster['storage_mounts_metadata'])
         if 'is_managed' not in cluster:
             cluster['is_managed'] = False
         response.append(responses.StatusResponse.model_validate(cluster))
@@ -116,22 +116,35 @@ def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:


 @register_decoders('jobs.queue_v2')
-def decode_jobs_queue_v2(
+def decode_jobs_queue_v2(
+    return_value
+) -> Union[Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int],
+           List[responses.ManagedJobRecord]]:
     """Decode jobs queue response.

-    Supports legacy list, or a dict {jobs, total
-
+    Supports legacy list, or a dict {jobs, total, total_no_filter,
+    status_counts}.
+
+    - Returns either list[job] or tuple(list[job], total, status_counts,
+      total_no_filter)
     """
-    # Case 1: dict shape {jobs, total}
-    if isinstance(return_value, dict)
+    # Case 1: dict shape {jobs, total, total_no_filter, status_counts}
+    if isinstance(return_value, dict):
         jobs = return_value.get('jobs', [])
+        total = return_value.get('total', len(jobs))
+        total_no_filter = return_value.get('total_no_filter', total)
+        status_counts = return_value.get('status_counts', {})
+        for job in jobs:
+            job['status'] = managed_jobs.ManagedJobStatus(job['status'])
+        jobs = [responses.ManagedJobRecord(**job) for job in jobs]
+        return jobs, total, status_counts, total_no_filter
     else:
         # Case 2: legacy list
         jobs = return_value
-
-
-
-
+        for job in jobs:
+            job['status'] = managed_jobs.ManagedJobStatus(job['status'])
+        jobs = [responses.ManagedJobRecord(**job) for job in jobs]
+        return jobs


 def _decode_serve_status(
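The decode_jobs_queue_v2 change above makes the client tolerate both wire formats during an upgrade window. Below is a minimal standalone sketch of that dual-shape pattern, not SkyPilot's actual code: JobRecord is a hypothetical stand-in for responses.ManagedJobRecord, and the real decoder additionally converts each job's status string into a ManagedJobStatus enum.

from dataclasses import dataclass
from typing import Any, Dict, List, Tuple, Union


@dataclass
class JobRecord:  # hypothetical stand-in for responses.ManagedJobRecord
    job_id: int
    status: str


def decode_jobs_queue(
    payload: Union[List[dict], Dict[str, Any]]
) -> Union[List[JobRecord], Tuple[List[JobRecord], int, Dict[str, int], int]]:
    if isinstance(payload, dict):
        # New envelope: {jobs, total, total_no_filter, status_counts}.
        jobs = [JobRecord(**job) for job in payload.get('jobs', [])]
        total = payload.get('total', len(jobs))
        total_no_filter = payload.get('total_no_filter', total)
        status_counts = payload.get('status_counts', {})
        return jobs, total, status_counts, total_no_filter
    # Legacy shape: a bare list of job dicts from an older server.
    return [JobRecord(**job) for job in payload]


# Old servers return a bare list; new servers return the dict envelope.
print(decode_jobs_queue([{'job_id': 1, 'status': 'RUNNING'}]))
print(decode_jobs_queue({'jobs': [{'job_id': 1, 'status': 'RUNNING'}],
                         'total': 12, 'status_counts': {'RUNNING': 5}}))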
sky/server/requests/serializers/encoders.py CHANGED

@@ -8,6 +8,8 @@ import pickle
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union

+from sky import models
+from sky.catalog import common
 from sky.schemas.api import responses
 from sky.server import constants as server_constants
 from sky.utils import serialize_utils
@@ -15,7 +17,6 @@ from sky.utils import serialize_utils
 if typing.TYPE_CHECKING:
     from sky import backends
     from sky import clouds
-    from sky import models
     from sky.provision.kubernetes import utils as kubernetes_utils

 handlers: Dict[str, Any] = {}
@@ -60,13 +61,23 @@ def encode_status(
         clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
     response = []
     for cluster in clusters:
-        response_cluster = cluster.model_dump()
+        response_cluster = cluster.model_dump(exclude_none=True)
+        # These default setting is needed because last_use and status_updated_at
+        # used to be not optional.
+        # TODO(syang): remove this after v0.10.7 or v0.11.0
+        if 'last_use' not in response_cluster:
+            response_cluster['last_use'] = ''
+        if 'status_updated_at' not in response_cluster:
+            response_cluster['status_updated_at'] = 0
         response_cluster['status'] = cluster['status'].value
         handle = serialize_utils.prepare_handle_for_backwards_compatibility(
             cluster['handle'])
         response_cluster['handle'] = pickle_and_encode(handle)
+        # TODO (syang) We still need to return this field for backwards
+        # compatibility.
+        # Remove this field at or after v0.10.7 or v0.11.0
         response_cluster['storage_mounts_metadata'] = pickle_and_encode(
-            cluster['storage_mounts_metadata'])
+            None)  # Always returns None.
         response.append(response_cluster)
     return response

@@ -121,7 +132,7 @@ encode_status_kubernetes(
         encoded_cluster = dataclasses.asdict(cluster)
         encoded_cluster['status'] = encoded_cluster['status'].value
         encoded_unmanaged_clusters.append(encoded_cluster)
-    all_jobs = [job.model_dump() for job in all_jobs]
+    all_jobs = [job.model_dump(by_alias=True) for job in all_jobs]
     return encoded_all_clusters, encoded_unmanaged_clusters, all_jobs, context


@@ -148,12 +159,13 @@ encode_jobs_queue_v2(
     else:
         jobs = jobs_or_tuple
         total = None
-    for job in jobs:
+    jobs_dict = [job.model_dump(by_alias=True) for job in jobs]
+    for job in jobs_dict:
         job['status'] = job['status'].value
     if total is None:
-        return jobs
+        return jobs_dict
     return {
-        'jobs': jobs,
+        'jobs': jobs_dict,
         'total': total,
         'total_no_filter': total_no_filter,
         'status_counts': status_counts
@@ -205,10 +217,11 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:
 @register_encoder('storage_ls')
 def encode_storage_ls(
         return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
-    for storage_info in return_value:
+    response_list = [storage_info.model_dump() for storage_info in return_value]
+    for storage_info in response_list:
         storage_info['status'] = storage_info['status'].value
         storage_info['store'] = [store.value for store in storage_info['store']]
-    return return_value
+    return response_list


 @register_encoder('volume_list')
@@ -218,11 +231,11 @@ encode_volume_list(


 @register_encoder('job_status')
-def encode_job_status(return_value: Dict[int, Any]) -> Dict[int, str]:
+def encode_job_status(return_value: Dict[int, Any]) -> Dict[str, str]:
     for job_id in return_value.keys():
         if return_value[job_id] is not None:
             return_value[job_id] = return_value[job_id].value
-    return return_value
+    return {str(k): v for k, v in return_value.items()}


 @register_encoder('kubernetes_node_info')
@@ -234,3 +247,35 @@ encode_kubernetes_node_info(
 @register_encoder('endpoints')
 def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
     return {str(k): v for k, v in return_value.items()}
+
+
+@register_encoder('realtime_kubernetes_gpu_availability')
+def encode_realtime_gpu_availability(
+    return_value: List[Tuple[str,
+                             List[Any]]]) -> List[Tuple[str, List[List[Any]]]]:
+    # Convert RealtimeGpuAvailability namedtuples to lists
+    # for JSON serialization.
+    encoded = []
+    for context, gpu_list in return_value:
+        converted_gpu_list = []
+        for gpu in gpu_list:
+            assert isinstance(gpu, models.RealtimeGpuAvailability), (
+                f'Expected RealtimeGpuAvailability, got {type(gpu)}')
+            converted_gpu_list.append(list(gpu))
+        encoded.append((context, converted_gpu_list))
+    return encoded
+
+
+@register_encoder('list_accelerators')
+def encode_list_accelerators(
+        return_value: Dict[str, List[Any]]) -> Dict[str, Any]:
+    encoded: Dict[str, Any] = {}
+    for accelerator_name, instances in return_value.items():
+        # Convert InstanceTypeInfo namedtuples to lists for JSON serialization.
+        converted_instances: List[Any] = []
+        for instance in instances:
+            assert isinstance(instance, common.InstanceTypeInfo), (
+                f'Expected InstanceTypeInfo, got {type(instance)}')
+            converted_instances.append(list(instance))
+        encoded[accelerator_name] = converted_instances
+    return encoded
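The two new encoders exist because RealtimeGpuAvailability and InstanceTypeInfo are namedtuples that the JSON layer should receive as plain lists. The following standalone sketch shows the same conversion; GpuAvailability is a hypothetical stand-in for models.RealtimeGpuAvailability, with made-up fields.

import collections
import json
from typing import List, Tuple

# Hypothetical stand-in for models.RealtimeGpuAvailability.
GpuAvailability = collections.namedtuple(
    'GpuAvailability', ['gpu', 'counts', 'capacity', 'available'])


def encode_availability(
        rows: List[Tuple[str, List[GpuAvailability]]]) -> str:
    # A namedtuple already serializes as a JSON array, but converting
    # explicitly (as the new encoders do, behind an isinstance assert) makes
    # the wire format a deliberate contract rather than an accident of
    # tuple subclassing.
    encoded = [(context, [list(gpu) for gpu in gpu_list])
               for context, gpu_list in rows]
    return json.dumps(encoded)


print(encode_availability(
    [('gke-cluster', [GpuAvailability('H100', [1, 2, 4, 8], 16, 4)])]))
# -> [["gke-cluster", [["H100", [1, 2, 4, 8], 16, 4]]]]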
sky/server/requests/threads.py ADDED

@@ -0,0 +1,106 @@
+"""Request execution threads management."""
+
+import concurrent.futures
+import threading
+from typing import Callable, Set
+
+from sky import exceptions
+from sky import sky_logging
+from sky.utils import atomic
+
+logger = sky_logging.init_logger(__name__)
+
+
+class OnDemandThreadExecutor(concurrent.futures.Executor):
+    """An executor that creates a new thread for each task and destroys it
+    after the task is completed.
+
+    Note(dev):
+    We raise an error instead of queuing the request if the limit is reached, so
+    that:
+    1. the request might be handled by other processes that have idle workers
+       upon retry;
+    2. if not, then users can be clearly hinted that they need to scale the API
+       server to support higher concurrency.
+    So this executor is only suitable for carefully selected cases where the
+    error can be properly handled by caller. To make this executor general, we
+    need to support configuring the queuing behavior (exception or queueing).
+    """
+
+    def __init__(self, name: str, max_workers: int):
+        self.name: str = name
+        self.max_workers: int = max_workers
+        self.running: atomic.AtomicInt = atomic.AtomicInt(0)
+        self._shutdown: bool = False
+        self._shutdown_lock: threading.Lock = threading.Lock()
+        self._threads: Set[threading.Thread] = set()
+        self._threads_lock: threading.Lock = threading.Lock()
+
+    def _cleanup_thread(self, thread: threading.Thread):
+        with self._threads_lock:
+            self._threads.discard(thread)
+
+    def _task_wrapper(self, fn: Callable, fut: concurrent.futures.Future, /,
+                      *args, **kwargs):
+        try:
+            result = fn(*args, **kwargs)
+            fut.set_result(result)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f'Executor [{self.name}] error executing {fn}: {e}')
+            fut.set_exception(e)
+        finally:
+            self.running.decrement()
+            self._cleanup_thread(threading.current_thread())
+
+    def check_available(self, borrow: bool = False) -> int:
+        """Check if there are available workers.
+
+        Args:
+            borrow: If True, the caller borrow a worker from the executor.
+                The caller is responsible for returning the worker to the
+                executor after the task is completed.
+        """
+        count = self.running.increment()
+        if count > self.max_workers:
+            self.running.decrement()
+            raise exceptions.ConcurrentWorkerExhaustedError(
+                f'Maximum concurrent workers {self.max_workers} of threads '
+                f'executor [{self.name}] reached')
+        if not borrow:
+            self.running.decrement()
+        return count
+
+    def submit(self, fn, /, *args, **kwargs):
+        with self._shutdown_lock:
+            if self._shutdown:
+                raise RuntimeError(
+                    'Cannot submit task after executor is shutdown')
+        count = self.check_available(borrow=True)
+        fut: concurrent.futures.Future = concurrent.futures.Future()
+        # Name is assigned for debugging purpose, duplication is fine
+        thread = threading.Thread(target=self._task_wrapper,
+                                  name=f'{self.name}-{count}',
+                                  args=(fn, fut, *args),
+                                  kwargs=kwargs,
+                                  daemon=True)
+        with self._threads_lock:
+            self._threads.add(thread)
+        try:
+            thread.start()
+        except Exception as e:
+            self.running.decrement()
+            self._cleanup_thread(thread)
+            fut.set_exception(e)
+            raise
+        assert thread.ident is not None, 'Thread should be started'
+        return fut
+
+    def shutdown(self, wait=True):
+        with self._shutdown_lock:
+            self._shutdown = True
+        if not wait:
+            return
+        with self._threads_lock:
+            threads = list(self._threads)
+        for t in threads:
+            t.join()
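A hypothetical usage sketch of the executor above, assuming the wheel from this diff is installed (the module path and ConcurrentWorkerExhaustedError both come from the added file): when the number of running tasks exceeds max_workers, submit fails fast instead of queuing, so the caller can retry against another server process or report the capacity limit.

import time

from sky import exceptions
from sky.server.requests import threads

executor = threads.OnDemandThreadExecutor(name='demo', max_workers=2)
futures = []
for _ in range(3):
    try:
        futures.append(executor.submit(time.sleep, 0.5))
    except exceptions.ConcurrentWorkerExhaustedError as e:
        # The third submit exceeds max_workers while the first two sleeps
        # are still running; a real caller would retry or surface the error.
        print(f'Rejected: {e}')
for fut in futures:
    fut.result()
executor.shutdown()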
sky/server/rest.py CHANGED

@@ -178,14 +178,16 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
     Notes(dev):
     """

+    def _readable_error_msg(message: str) -> str:
+        return (f'{colorama.Fore.YELLOW}API server is temporarily '
+                f'unavailable: {message}.\nRetrying...'
+                f'{colorama.Style.RESET_ALL}')
+
     def decorator(func: F) -> F:

         @functools.wraps(func)
         def wrapper(*args, **kwargs) -> Any:
-
-                f'{colorama.Fore.YELLOW}API server is temporarily unavailable: '
-                'upgrade in progress. Waiting to resume...'
-                f'{colorama.Style.RESET_ALL}')
+
             backoff = common_utils.Backoff(
                 initial_backoff=initial_backoff,
                 max_backoff_factor=max_backoff_factor)
@@ -203,7 +205,8 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
                     # stop the status spinner before retrying func() to
                     # avoid the status spinner get stuck if the func() runs
                     # for a long time without update status, e.g. sky logs.
-                    with rich_utils.client_status(
+                    with rich_utils.client_status(
+                            _readable_error_msg(e.message)):
                         if time.time() - start_time > max_wait_seconds:
                             # pylint: disable=line-too-long
                             raise exceptions.ServerTemporarilyUnavailableError(
@@ -224,14 +227,67 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,


 def handle_server_unavailable(response: 'requests.Response') -> None:
-
-
-
-
-
-
-
-
+    """Handle 503 (Service Unavailable) error
+
+    The client get 503 error in the following cases:
+    1. The reverse proxy cannot find any ready backend endpoints to serve the
+       request, e.g. when there is and rolling-update.
+    2. The skypilot API server has temporary resource issue, e.g. when the
+       cucurrency of the handling process is exhausted.
+
+    We expect the caller (CLI or SDK) retry on these cases and show clear wait
+    message to the user to let user decide whether keep waiting or abort the
+    request.
+    """
+    if response.status_code != 503:
+        return
+
+    # error_msg = 'SkyPilot API server is temporarily unavailable. '
+    error_msg = ''
+    try:
+        response_data = response.json()
+        if 'detail' in response_data:
+            error_msg = response_data['detail']
+    except Exception:  # pylint: disable=broad-except
+        if response.text:
+            error_msg = response.text
+
+    with ux_utils.print_exception_no_traceback():
+        raise exceptions.ServerTemporarilyUnavailableError(error_msg)
+
+
+async def handle_server_unavailable_async(
+        response: 'aiohttp.ClientResponse') -> None:
+    """Async version: Handle 503 (Service Unavailable) error
+
+    The client get 503 error in the following cases:
+    1. The reverse proxy cannot find any ready backend endpoints to serve the
+       request, e.g. when there is and rolling-update.
+    2. The skypilot API server has temporary resource issue, e.g. when the
+       cucurrency of the handling process is exhausted.
+
+    We expect the caller (CLI or SDK) retry on these cases and show clear wait
+    message to the user to let user decide whether keep waiting or abort the
+    request.
+    """
+    if response.status != 503:
+        return
+
+    error_msg = ''
+    try:
+        response_data = await response.json()
+        if 'detail' in response_data:
+            error_msg = response_data['detail']
+    except Exception:  # pylint: disable=broad-except
+        try:
+            text = await response.text()
+            if text:
+                error_msg = text
+        except Exception:  # pylint: disable=broad-except
+            pass
+
+    with ux_utils.print_exception_no_traceback():
+        raise exceptions.ServerTemporarilyUnavailableError(error_msg)


 @_retry_on_server_unavailable()
@@ -310,11 +366,7 @@ async def request_without_retry_async(session: 'aiohttp.ClientSession',
     response = await session.request(method, url, **kwargs)

     # Handle server unavailability (503 status) - same as sync version
-
-        with ux_utils.print_exception_no_traceback():
-            raise exceptions.ServerTemporarilyUnavailableError(
-                'SkyPilot API server is temporarily unavailable. '
-                'Please try again later.')
+    await handle_server_unavailable_async(response)

     # Set remote API version and version from headers - same as sync version
     remote_api_version = response.headers.get(constants.API_VERSION_HEADER)