skypilot-nightly 1.0.0.dev20250519__py3-none-any.whl → 1.0.0.dev20250521__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +56 -37
- sky/check.py +3 -3
- sky/cli.py +89 -16
- sky/client/cli.py +89 -16
- sky/client/sdk.py +20 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +6 -0
- sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +6 -0
- sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +1 -0
- sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +1 -0
- sky/dashboard/out/_next/static/chunks/{678-206dddca808e6d16.js → 582-683f4f27b81996dc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +1 -0
- sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +3 -0
- sky/dashboard/out/_next/static/hvWzC5E6Q4CcKzXcWbgig/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/execution.py +1 -1
- sky/jobs/server/core.py +1 -1
- sky/jobs/utils.py +38 -7
- sky/optimizer.py +36 -29
- sky/provision/kubernetes/instance.py +6 -0
- sky/provision/provisioner.py +16 -7
- sky/resources.py +60 -15
- sky/serve/serve_utils.py +5 -13
- sky/server/common.py +14 -5
- sky/server/requests/payloads.py +3 -3
- sky/utils/cli_utils/status_utils.py +95 -56
- sky/utils/common_utils.py +35 -2
- sky/utils/infra_utils.py +175 -0
- sky/utils/resources_utils.py +41 -21
- sky/utils/schemas.py +65 -5
- {skypilot_nightly-1.0.0.dev20250519.dist-info → skypilot_nightly-1.0.0.dev20250521.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250519.dist-info → skypilot_nightly-1.0.0.dev20250521.dist-info}/RECORD +51 -48
- {skypilot_nightly-1.0.0.dev20250519.dist-info → skypilot_nightly-1.0.0.dev20250521.dist-info}/WHEEL +1 -1
- sky/dashboard/out/_next/static/EZ3zXDgkK3s9_F2lRAluJ/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/845-0ca6f2c1ba667c3b.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- /sky/dashboard/out/_next/static/{EZ3zXDgkK3s9_F2lRAluJ → hvWzC5E6Q4CcKzXcWbgig}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250519.dist-info → skypilot_nightly-1.0.0.dev20250521.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250519.dist-info → skypilot_nightly-1.0.0.dev20250521.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250519.dist-info → skypilot_nightly-1.0.0.dev20250521.dist-info}/top_level.txt +0 -0
sky/optimizer.py
CHANGED
@@ -167,7 +167,7 @@ class Optimizer:
|
|
167
167
|
|
168
168
|
def make_dummy(name):
|
169
169
|
dummy = task_lib.Task(name)
|
170
|
-
dummy.set_resources({DummyResources(DummyCloud()
|
170
|
+
dummy.set_resources({DummyResources(cloud=DummyCloud())})
|
171
171
|
dummy.set_time_estimator(lambda _: 0)
|
172
172
|
return dummy
|
173
173
|
|
@@ -321,10 +321,10 @@ class Optimizer:
|
|
321
321
|
estimated_runtime = 1 * 3600
|
322
322
|
else:
|
323
323
|
# We assume the time estimator takes in a partial resource
|
324
|
-
# Resources('V100')
|
324
|
+
# Resources(accelerators='V100')
|
325
325
|
# and treats their launchable versions
|
326
|
-
# Resources(
|
327
|
-
# Resources(
|
326
|
+
# Resources(infra='aws', instance_type='p3.2xlarge'),
|
327
|
+
# Resources(infra='gcp', accelerators='V100'),
|
328
328
|
# ...
|
329
329
|
# as having the same run time.
|
330
330
|
# FIXME(zongheng): take 'num_nodes' as an arg/into
|
@@ -772,6 +772,15 @@ class Optimizer:
|
|
772
772
|
f'{colorama.Style.BRIGHT}Estimated total cost: '
|
773
773
|
f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
|
774
774
|
|
775
|
+
def _instance_type_str(resources: 'resources_lib.Resources') -> str:
|
776
|
+
instance_type = resources.instance_type
|
777
|
+
assert instance_type is not None, 'Instance type must be specified'
|
778
|
+
if isinstance(resources.cloud, clouds.Kubernetes):
|
779
|
+
instance_type = '-'
|
780
|
+
if resources.use_spot:
|
781
|
+
instance_type = ''
|
782
|
+
return instance_type
|
783
|
+
|
775
784
|
def _get_resources_element_list(
|
776
785
|
resources: 'resources_lib.Resources') -> List[str]:
|
777
786
|
accelerators = resources.get_accelerators_str()
|
@@ -794,22 +803,20 @@ class Optimizer:
|
|
794
803
|
vcpus = format_number(vcpus_)
|
795
804
|
mem = format_number(mem_)
|
796
805
|
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
region_or_zone = resources.zone
|
806
|
+
# Format infra as CLOUD (REGION/ZONE)
|
807
|
+
infra = resources.infra.formatted_str()
|
808
|
+
|
801
809
|
return [
|
802
|
-
|
803
|
-
resources
|
810
|
+
infra,
|
811
|
+
_instance_type_str(resources) + spot,
|
804
812
|
vcpus,
|
805
813
|
mem,
|
806
814
|
str(accelerators),
|
807
|
-
str(region_or_zone),
|
808
815
|
]
|
809
816
|
|
810
817
|
Row = collections.namedtuple('Row', [
|
811
|
-
'
|
812
|
-
'
|
818
|
+
'infra', 'instance', 'vcpus', 'mem', 'accelerators', 'cost_str',
|
819
|
+
'chosen_str'
|
813
820
|
])
|
814
821
|
|
815
822
|
def _get_resources_named_tuple(resources: 'resources_lib.Resources',
|
@@ -833,18 +840,15 @@ class Optimizer:
|
|
833
840
|
vcpus = format_number(vcpus_)
|
834
841
|
mem = format_number(mem_)
|
835
842
|
|
836
|
-
|
837
|
-
region_or_zone = resources.region
|
838
|
-
else:
|
839
|
-
region_or_zone = resources.zone
|
843
|
+
infra = resources.infra.formatted_str()
|
840
844
|
|
841
845
|
chosen_str = ''
|
842
846
|
if chosen:
|
843
847
|
chosen_str = (colorama.Fore.GREEN + ' ' + '\u2714' +
|
844
848
|
colorama.Style.RESET_ALL)
|
845
|
-
row = Row(
|
846
|
-
|
847
|
-
chosen_str)
|
849
|
+
row = Row(infra,
|
850
|
+
_instance_type_str(resources) + spot, vcpus, mem,
|
851
|
+
str(accelerators), cost_str, chosen_str)
|
848
852
|
|
849
853
|
return row
|
850
854
|
|
@@ -862,10 +866,7 @@ class Optimizer:
|
|
862
866
|
return json.dumps(resource_key_dict, sort_keys=True)
|
863
867
|
|
864
868
|
# Print the list of resources that the optimizer considered.
|
865
|
-
resource_fields = [
|
866
|
-
'CLOUD', 'INSTANCE', 'vCPUs', 'Mem(GB)', 'ACCELERATORS',
|
867
|
-
'REGION/ZONE'
|
868
|
-
]
|
869
|
+
resource_fields = ['INFRA', 'INSTANCE', 'vCPUs', 'Mem(GB)', 'GPUS']
|
869
870
|
if len(ordered_best_plan) > 1:
|
870
871
|
best_plan_rows = []
|
871
872
|
for t, r in ordered_best_plan.items():
|
@@ -993,13 +994,19 @@ class Optimizer:
|
|
993
994
|
if len(candidate_list) > 1:
|
994
995
|
is_multi_instances = True
|
995
996
|
instance_list = [
|
996
|
-
res.instance_type
|
997
|
+
res.instance_type
|
998
|
+
for res in candidate_list
|
999
|
+
if res.instance_type is not None
|
997
1000
|
]
|
1001
|
+
candidate_str = resources_utils.format_resource(
|
1002
|
+
candidate_list[0], simplify=True)
|
1003
|
+
|
998
1004
|
logger.info(
|
999
|
-
f'Multiple {cloud} instances
|
1000
|
-
f'{acc_name}:{int(acc_count)}. '
|
1001
|
-
f'The cheapest {
|
1002
|
-
f'among
|
1005
|
+
f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
|
1006
|
+
f'satisfy {acc_name}:{int(acc_count)}. '
|
1007
|
+
f'The cheapest {candidate_str} is considered '
|
1008
|
+
f'among: {", ".join(instance_list)}.'
|
1009
|
+
f'{colorama.Style.RESET_ALL}')
|
1003
1010
|
if is_multi_instances:
|
1004
1011
|
logger.info(
|
1005
1012
|
f'To list more details, run: sky show-gpus {acc_name}\n')
|
@@ -67,6 +67,9 @@ def is_high_availability_cluster_by_kubectl(
|
|
67
67
|
namespace: Optional[str] = None) -> bool:
|
68
68
|
"""Check if a cluster is a high availability controller by calling
|
69
69
|
`kubectl get deployment`.
|
70
|
+
|
71
|
+
The deployment must have the label `skypilot-cluster-name` set to
|
72
|
+
`cluster_name`.
|
70
73
|
"""
|
71
74
|
try:
|
72
75
|
deployment_list = kubernetes.apps_api(
|
@@ -896,6 +899,9 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
896
899
|
TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
|
897
900
|
template_pod_spec['metadata'] = pod_spec_copy['metadata']
|
898
901
|
template_pod_spec['spec'].update(pod_spec_copy['spec'])
|
902
|
+
# Propagate the labels to the deployment for identification.
|
903
|
+
deployment_spec['metadata']['labels'] = pod_spec_copy['metadata'][
|
904
|
+
'labels']
|
899
905
|
try:
|
900
906
|
return kubernetes.apps_api(
|
901
907
|
context).create_namespaced_deployment(
|
sky/provision/provisioner.py
CHANGED
@@ -17,6 +17,7 @@ from sky import clouds
|
|
17
17
|
from sky import exceptions
|
18
18
|
from sky import provision
|
19
19
|
from sky import sky_logging
|
20
|
+
from sky import skypilot_config
|
20
21
|
from sky.adaptors import aws
|
21
22
|
from sky.backends import backend_utils
|
22
23
|
from sky.provision import common as provision_common
|
@@ -228,9 +229,9 @@ def _ssh_probe_command(ip: str,
|
|
228
229
|
ssh_port: int,
|
229
230
|
ssh_user: str,
|
230
231
|
ssh_private_key: str,
|
232
|
+
ssh_probe_timeout: int,
|
231
233
|
ssh_proxy_command: Optional[str] = None) -> List[str]:
|
232
|
-
# NOTE: Ray uses 'uptime' command
|
233
|
-
# setting here.
|
234
|
+
# NOTE: Ray uses 'uptime' command, we use the same setting here.
|
234
235
|
command = [
|
235
236
|
'ssh',
|
236
237
|
'-T',
|
@@ -244,7 +245,7 @@ def _ssh_probe_command(ip: str,
|
|
244
245
|
'-o',
|
245
246
|
'PasswordAuthentication=no',
|
246
247
|
'-o',
|
247
|
-
'ConnectTimeout=
|
248
|
+
f'ConnectTimeout={ssh_probe_timeout}s',
|
248
249
|
'-o',
|
249
250
|
f'UserKnownHostsFile={os.devnull}',
|
250
251
|
'-o',
|
@@ -277,6 +278,7 @@ def _wait_ssh_connection_direct(ip: str,
|
|
277
278
|
ssh_port: int,
|
278
279
|
ssh_user: str,
|
279
280
|
ssh_private_key: str,
|
281
|
+
ssh_probe_timeout: int,
|
280
282
|
ssh_control_name: Optional[str] = None,
|
281
283
|
ssh_proxy_command: Optional[str] = None,
|
282
284
|
**kwargs) -> Tuple[bool, str]:
|
@@ -305,6 +307,7 @@ def _wait_ssh_connection_direct(ip: str,
|
|
305
307
|
if success:
|
306
308
|
return _wait_ssh_connection_indirect(ip, ssh_port, ssh_user,
|
307
309
|
ssh_private_key,
|
310
|
+
ssh_probe_timeout,
|
308
311
|
ssh_control_name,
|
309
312
|
ssh_proxy_command)
|
310
313
|
except socket.timeout: # this is the most expected exception
|
@@ -312,7 +315,7 @@ def _wait_ssh_connection_direct(ip: str,
|
|
312
315
|
except Exception as e: # pylint: disable=broad-except
|
313
316
|
stderr = f'Error: {common_utils.format_exception(e)}'
|
314
317
|
command = _ssh_probe_command(ip, ssh_port, ssh_user, ssh_private_key,
|
315
|
-
ssh_proxy_command)
|
318
|
+
ssh_probe_timeout, ssh_proxy_command)
|
316
319
|
logger.debug(f'Waiting for SSH to {ip}. Try: '
|
317
320
|
f'{_shlex_join(command)}. '
|
318
321
|
f'{stderr}')
|
@@ -323,6 +326,7 @@ def _wait_ssh_connection_indirect(ip: str,
|
|
323
326
|
ssh_port: int,
|
324
327
|
ssh_user: str,
|
325
328
|
ssh_private_key: str,
|
329
|
+
ssh_probe_timeout: int,
|
326
330
|
ssh_control_name: Optional[str] = None,
|
327
331
|
ssh_proxy_command: Optional[str] = None,
|
328
332
|
**kwargs) -> Tuple[bool, str]:
|
@@ -333,14 +337,14 @@ def _wait_ssh_connection_indirect(ip: str,
|
|
333
337
|
"""
|
334
338
|
del ssh_control_name, kwargs # unused
|
335
339
|
command = _ssh_probe_command(ip, ssh_port, ssh_user, ssh_private_key,
|
336
|
-
ssh_proxy_command)
|
340
|
+
ssh_probe_timeout, ssh_proxy_command)
|
337
341
|
message = f'Waiting for SSH using command: {_shlex_join(command)}'
|
338
342
|
logger.debug(message)
|
339
343
|
try:
|
340
344
|
proc = subprocess.run(command,
|
341
345
|
shell=False,
|
342
346
|
check=False,
|
343
|
-
timeout=
|
347
|
+
timeout=ssh_probe_timeout,
|
344
348
|
stdout=subprocess.DEVNULL,
|
345
349
|
stderr=subprocess.PIPE)
|
346
350
|
if proc.returncode != 0:
|
@@ -383,8 +387,13 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
|
|
383
387
|
def _retry_ssh_thread(ip_ssh_port: Tuple[str, int]):
|
384
388
|
ip, ssh_port = ip_ssh_port
|
385
389
|
success = False
|
390
|
+
ssh_probe_timeout = skypilot_config.get_nested(
|
391
|
+
('provision', 'ssh_timeout'), 10)
|
386
392
|
while not success:
|
387
|
-
success, stderr = waiter(ip,
|
393
|
+
success, stderr = waiter(ip,
|
394
|
+
ssh_port,
|
395
|
+
**ssh_credentials,
|
396
|
+
ssh_probe_timeout=ssh_probe_timeout)
|
388
397
|
if not success and time.time() - start > timeout:
|
389
398
|
with ux_utils.print_exception_no_traceback():
|
390
399
|
raise RuntimeError(
|
sky/resources.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
|
|
6
6
|
|
7
7
|
import colorama
|
8
8
|
|
9
|
+
import sky
|
9
10
|
from sky import check as sky_check
|
10
11
|
from sky import clouds
|
11
12
|
from sky import exceptions
|
@@ -20,6 +21,7 @@ from sky.utils import accelerator_registry
|
|
20
21
|
from sky.utils import annotations
|
21
22
|
from sky.utils import common_utils
|
22
23
|
from sky.utils import config_utils
|
24
|
+
from sky.utils import infra_utils
|
23
25
|
from sky.utils import log_utils
|
24
26
|
from sky.utils import registry
|
25
27
|
from sky.utils import resources_utils
|
@@ -106,6 +108,7 @@ class Resources:
|
|
106
108
|
memory: Union[None, int, float, str] = None,
|
107
109
|
accelerators: Union[None, str, Dict[str, Union[int, float]]] = None,
|
108
110
|
accelerator_args: Optional[Dict[str, str]] = None,
|
111
|
+
infra: Optional[str] = None,
|
109
112
|
use_spot: Optional[bool] = None,
|
110
113
|
job_recovery: Optional[Union[Dict[str, Optional[Union[str, int]]],
|
111
114
|
str]] = None,
|
@@ -134,9 +137,9 @@ class Resources:
|
|
134
137
|
.. code-block:: python
|
135
138
|
|
136
139
|
# Fully specified cloud and instance type (is_launchable() is True).
|
137
|
-
sky.Resources(
|
138
|
-
sky.Resources(
|
139
|
-
sky.Resources(
|
140
|
+
sky.Resources(infra='aws', instance_type='p3.2xlarge')
|
141
|
+
sky.Resources(infra='k8s/my-cluster-ctx', accelerators='V100')
|
142
|
+
sky.Resources(infra='gcp/us-central1', accelerators='V100')
|
140
143
|
|
141
144
|
# Specifying required resources; the system decides the
|
142
145
|
# cloud/instance type. The below are equivalent:
|
@@ -145,8 +148,9 @@ class Resources:
|
|
145
148
|
sky.Resources(accelerators={'V100': 1})
|
146
149
|
sky.Resources(cpus='2+', memory='16+', accelerators='V100')
|
147
150
|
|
151
|
+
|
148
152
|
Args:
|
149
|
-
cloud: the cloud to use.
|
153
|
+
cloud: the cloud to use. Deprecated. Use `infra` instead.
|
150
154
|
instance_type: the instance type to use.
|
151
155
|
cpus: the number of CPUs required for the task.
|
152
156
|
If a str, must be a string of the form ``'2'`` or ``'2+'``, where
|
@@ -160,6 +164,11 @@ class Resources:
|
|
160
164
|
dict of the form ``{'V100': 2}`` or ``{'tpu-v2-8': 1}``.
|
161
165
|
accelerator_args: accelerator-specific arguments. For example,
|
162
166
|
``{'tpu_vm': True, 'runtime_version': 'tpu-vm-base'}`` for TPUs.
|
167
|
+
infra: a string specifying the infrastructure to use, in the format
|
168
|
+
of "cloud/region" or "cloud/region/zone". For example,
|
169
|
+
`aws/us-east-1` or `k8s/my-cluster-ctx`. This is an alternative to
|
170
|
+
specifying cloud, region, and zone separately. If provided, it
|
171
|
+
takes precedence over cloud, region, and zone parameters.
|
163
172
|
use_spot: whether to use spot instances. If None, defaults to
|
164
173
|
False.
|
165
174
|
job_recovery: the job recovery strategy to use for the managed
|
@@ -172,8 +181,8 @@ class Resources:
|
|
172
181
|
- max_restarts_on_errors: the max number of restarts on user code
|
173
182
|
errors.
|
174
183
|
|
175
|
-
region: the region to use.
|
176
|
-
zone: the zone to use.
|
184
|
+
region: the region to use. Deprecated. Use `infra` instead.
|
185
|
+
zone: the zone to use. Deprecated. Use `infra` instead.
|
177
186
|
image_id: the image ID to use. If a str, must be a string
|
178
187
|
of the image id from the cloud, such as AWS:
|
179
188
|
``'ami-1234567890abcdef0'``, GCP:
|
@@ -218,6 +227,25 @@ class Resources:
|
|
218
227
|
exceptions.NoCloudAccessError: if no public cloud is enabled.
|
219
228
|
"""
|
220
229
|
self._version = self._VERSION
|
230
|
+
|
231
|
+
if infra is not None and (cloud is not None or region is not None or
|
232
|
+
zone is not None):
|
233
|
+
with ux_utils.print_exception_no_traceback():
|
234
|
+
raise ValueError('Cannot specify both `infra` and `cloud`, '
|
235
|
+
'`region`, or `zone` parameters. '
|
236
|
+
f'Got: infra={infra}, cloud={cloud}, '
|
237
|
+
f'region={region}, zone={zone}')
|
238
|
+
|
239
|
+
# Infra is user facing, and cloud, region, zone in parameters are for
|
240
|
+
# backward compatibility. Internally, we keep using cloud, region, zone
|
241
|
+
# for simplicity.
|
242
|
+
if infra is not None:
|
243
|
+
infra_info = infra_utils.InfraInfo.from_str(infra)
|
244
|
+
# Infra takes precedence over individually specified parameters
|
245
|
+
cloud = sky.CLOUD_REGISTRY.from_str(infra_info.cloud)
|
246
|
+
region = infra_info.region
|
247
|
+
zone = infra_info.zone
|
248
|
+
|
221
249
|
self._cloud = cloud
|
222
250
|
self._region: Optional[str] = region
|
223
251
|
self._zone: Optional[str] = zone
|
@@ -431,6 +459,11 @@ class Resources:
|
|
431
459
|
repr_str += f'{region_str}{zone_str}'
|
432
460
|
return repr_str
|
433
461
|
|
462
|
+
@property
|
463
|
+
def infra(self) -> infra_utils.InfraInfo:
|
464
|
+
cloud = str(self.cloud) if self.cloud is not None else None
|
465
|
+
return infra_utils.InfraInfo(cloud, self.region, self.zone)
|
466
|
+
|
434
467
|
@property
|
435
468
|
def cloud(self) -> Optional[clouds.Cloud]:
|
436
469
|
return self._cloud
|
@@ -486,9 +519,9 @@ class Resources:
|
|
486
519
|
def accelerators(self) -> Optional[Dict[str, Union[int, float]]]:
|
487
520
|
"""Returns the accelerators field directly or by inferring.
|
488
521
|
|
489
|
-
For example, Resources(
|
490
|
-
set to None, but this function will infer {'V100': 1}
|
491
|
-
type.
|
522
|
+
For example, Resources(infra='aws', instance_type='p3.2xlarge') has its
|
523
|
+
accelerators field set to None, but this function will infer {'V100': 1}
|
524
|
+
from the instance type.
|
492
525
|
"""
|
493
526
|
if self._accelerators is not None:
|
494
527
|
return self._accelerators
|
@@ -1450,6 +1483,7 @@ class Resources:
|
|
1450
1483
|
ports=override.pop('ports', self.ports),
|
1451
1484
|
labels=override.pop('labels', self.labels),
|
1452
1485
|
autostop=override.pop('autostop', current_autostop_config),
|
1486
|
+
infra=override.pop('infra', None),
|
1453
1487
|
_docker_login_config=override.pop('_docker_login_config',
|
1454
1488
|
self._docker_login_config),
|
1455
1489
|
_docker_username_for_runpod=override.pop(
|
@@ -1621,9 +1655,21 @@ class Resources:
|
|
1621
1655
|
@classmethod
|
1622
1656
|
def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':
|
1623
1657
|
|
1624
|
-
resources_fields = {}
|
1658
|
+
resources_fields: Dict[str, Any] = {}
|
1659
|
+
|
1660
|
+
# Extract infra field if present
|
1661
|
+
infra = config.pop('infra', None)
|
1662
|
+
resources_fields['infra'] = infra
|
1663
|
+
|
1664
|
+
# Keep backward compatibility with cloud, region, zone
|
1665
|
+
# Note: if both `infra` and any of `cloud`, `region`, `zone` are
|
1666
|
+
# specified, it will raise an error during the Resources.__init__
|
1667
|
+
# validation.
|
1625
1668
|
resources_fields['cloud'] = registry.CLOUD_REGISTRY.from_str(
|
1626
1669
|
config.pop('cloud', None))
|
1670
|
+
resources_fields['region'] = config.pop('region', None)
|
1671
|
+
resources_fields['zone'] = config.pop('zone', None)
|
1672
|
+
|
1627
1673
|
resources_fields['instance_type'] = config.pop('instance_type', None)
|
1628
1674
|
resources_fields['cpus'] = config.pop('cpus', None)
|
1629
1675
|
resources_fields['memory'] = config.pop('memory', None)
|
@@ -1641,8 +1687,6 @@ class Resources:
|
|
1641
1687
|
# exclusive by the schema validation.
|
1642
1688
|
resources_fields['job_recovery'] = config.pop('job_recovery', None)
|
1643
1689
|
resources_fields['disk_size'] = config.pop('disk_size', None)
|
1644
|
-
resources_fields['region'] = config.pop('region', None)
|
1645
|
-
resources_fields['zone'] = config.pop('zone', None)
|
1646
1690
|
resources_fields['image_id'] = config.pop('image_id', None)
|
1647
1691
|
resources_fields['disk_tier'] = config.pop('disk_tier', None)
|
1648
1692
|
resources_fields['ports'] = config.pop('ports', None)
|
@@ -1679,7 +1723,10 @@ class Resources:
|
|
1679
1723
|
if value is not None and value != 'None':
|
1680
1724
|
config[key] = value
|
1681
1725
|
|
1682
|
-
|
1726
|
+
# Construct infra field if cloud is set
|
1727
|
+
infra = self.infra.to_str()
|
1728
|
+
add_if_not_none('infra', infra)
|
1729
|
+
|
1683
1730
|
add_if_not_none('instance_type', self.instance_type)
|
1684
1731
|
add_if_not_none('cpus', self._cpus)
|
1685
1732
|
add_if_not_none('memory', self.memory)
|
@@ -1690,8 +1737,6 @@ class Resources:
|
|
1690
1737
|
add_if_not_none('use_spot', self.use_spot)
|
1691
1738
|
add_if_not_none('job_recovery', self.job_recovery)
|
1692
1739
|
add_if_not_none('disk_size', self.disk_size)
|
1693
|
-
add_if_not_none('region', self.region)
|
1694
|
-
add_if_not_none('zone', self.zone)
|
1695
1740
|
add_if_not_none('image_id', self.image_id)
|
1696
1741
|
if self.disk_tier is not None:
|
1697
1742
|
config['disk_tier'] = self.disk_tier.value
|
sky/serve/serve_utils.py
CHANGED
@@ -1027,11 +1027,9 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
|
|
1027
1027
|
return 'No existing replicas.'
|
1028
1028
|
|
1029
1029
|
replica_columns = [
|
1030
|
-
'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT', 'LAUNCHED', '
|
1031
|
-
'
|
1030
|
+
'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT', 'LAUNCHED', 'INFRA',
|
1031
|
+
'RESOURCES', 'STATUS'
|
1032
1032
|
]
|
1033
|
-
if show_all:
|
1034
|
-
replica_columns.append('ZONE')
|
1035
1033
|
replica_table = log_utils.create_table(replica_columns)
|
1036
1034
|
|
1037
1035
|
truncate_hint = ''
|
@@ -1047,21 +1045,17 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
|
|
1047
1045
|
version = (record['version'] if 'version' in record else '-')
|
1048
1046
|
replica_endpoint = endpoint if endpoint else '-'
|
1049
1047
|
launched_at = log_utils.readable_time_duration(record['launched_at'])
|
1048
|
+
infra = '-'
|
1050
1049
|
resources_str = '-'
|
1051
1050
|
replica_status = record['status']
|
1052
1051
|
status_str = replica_status.colored_str()
|
1053
|
-
region = '-'
|
1054
|
-
zone = '-'
|
1055
1052
|
|
1056
1053
|
replica_handle: Optional['backends.CloudVmRayResourceHandle'] = record[
|
1057
1054
|
'handle']
|
1058
1055
|
if replica_handle is not None:
|
1056
|
+
infra = replica_handle.launched_resources.infra.formatted_str()
|
1059
1057
|
resources_str = resources_utils.get_readable_resources_repr(
|
1060
1058
|
replica_handle, simplify=not show_all)
|
1061
|
-
if replica_handle.launched_resources.region is not None:
|
1062
|
-
region = replica_handle.launched_resources.region
|
1063
|
-
if replica_handle.launched_resources.zone is not None:
|
1064
|
-
zone = replica_handle.launched_resources.zone
|
1065
1059
|
|
1066
1060
|
replica_values = [
|
1067
1061
|
service_name,
|
@@ -1069,12 +1063,10 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
|
|
1069
1063
|
version,
|
1070
1064
|
replica_endpoint,
|
1071
1065
|
launched_at,
|
1066
|
+
infra,
|
1072
1067
|
resources_str,
|
1073
1068
|
status_str,
|
1074
|
-
region,
|
1075
1069
|
]
|
1076
|
-
if show_all:
|
1077
|
-
replica_values.append(zone)
|
1078
1070
|
replica_table.add_row(replica_values)
|
1079
1071
|
|
1080
1072
|
return f'{replica_table}{truncate_hint}'
|
sky/server/common.py
CHANGED
@@ -297,7 +297,6 @@ def _start_api_server(deploy: bool = False,
|
|
297
297
|
|
298
298
|
log_path = os.path.expanduser(constants.API_SERVER_LOGS)
|
299
299
|
os.makedirs(os.path.dirname(log_path), exist_ok=True)
|
300
|
-
cmd = f'{" ".join(args)} > {log_path} 2>&1 < /dev/null'
|
301
300
|
|
302
301
|
# Start the API server process in the background and don't wait for it.
|
303
302
|
# If this is called from a CLI invocation, we need
|
@@ -305,10 +304,20 @@ def _start_api_server(deploy: bool = False,
|
|
305
304
|
# the API server.
|
306
305
|
server_env = os.environ.copy()
|
307
306
|
server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
307
|
+
with open(log_path, 'w', encoding='utf-8') as log_file:
|
308
|
+
# Because the log file is opened using a with statement, it may seem
|
309
|
+
# that the file will be closed when the with statement is exited
|
310
|
+
# causing the child process to be unable to write to the log file.
|
311
|
+
# However, Popen makes the file descriptor inheritable which means
|
312
|
+
# the child process will inherit its own copy of the fd,
|
313
|
+
# independent of the parent's fd table which enables the child
|
314
|
+
# process to continue writing to the log file.
|
315
|
+
proc = subprocess.Popen(args,
|
316
|
+
stdout=log_file,
|
317
|
+
stderr=subprocess.STDOUT,
|
318
|
+
stdin=subprocess.DEVNULL,
|
319
|
+
start_new_session=True,
|
320
|
+
env=server_env)
|
312
321
|
|
313
322
|
start_time = time.time()
|
314
323
|
while True:
|
sky/server/requests/payloads.py
CHANGED
@@ -443,9 +443,9 @@ class ServeStatusBody(RequestBody):
|
|
443
443
|
|
444
444
|
class RealtimeGpuAvailabilityRequestBody(RequestBody):
|
445
445
|
"""The request body for the realtime GPU availability endpoint."""
|
446
|
-
context: Optional[str]
|
447
|
-
name_filter: Optional[str]
|
448
|
-
quantity_filter: Optional[int]
|
446
|
+
context: Optional[str] = None
|
447
|
+
name_filter: Optional[str] = None
|
448
|
+
quantity_filter: Optional[int] = None
|
449
449
|
|
450
450
|
|
451
451
|
class KubernetesNodeInfoRequestBody(RequestBody):
|