skypilot-nightly 1.0.0.dev20250514__py3-none-any.whl → 1.0.0.dev20250515__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend.py +3 -2
- sky/backends/backend_utils.py +19 -17
- sky/backends/cloud_vm_ray_backend.py +30 -11
- sky/clouds/aws.py +11 -9
- sky/clouds/azure.py +16 -13
- sky/clouds/cloud.py +4 -3
- sky/clouds/cudo.py +3 -2
- sky/clouds/do.py +3 -2
- sky/clouds/fluidstack.py +3 -3
- sky/clouds/gcp.py +1 -1
- sky/clouds/ibm.py +12 -10
- sky/clouds/kubernetes.py +3 -2
- sky/clouds/lambda_cloud.py +6 -6
- sky/clouds/nebius.py +6 -5
- sky/clouds/oci.py +9 -7
- sky/clouds/paperspace.py +3 -2
- sky/clouds/runpod.py +9 -9
- sky/clouds/scp.py +5 -3
- sky/clouds/vast.py +8 -7
- sky/clouds/vsphere.py +4 -2
- sky/core.py +18 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/pages/index-6b0d9e5031b70c58.js +1 -0
- sky/dashboard/out/_next/static/{tdxxQrPV6NW90a983oHXe → jFI0Y-uJZ_XDK5IGJpKFU}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/execution.py +33 -0
- sky/global_user_state.py +2 -0
- sky/jobs/recovery_strategy.py +4 -1
- sky/jobs/server/core.py +6 -12
- sky/optimizer.py +19 -13
- sky/provision/kubernetes/utils.py +26 -1
- sky/resources.py +203 -44
- sky/serve/server/core.py +0 -5
- sky/serve/spot_placer.py +3 -0
- sky/server/requests/executor.py +114 -22
- sky/server/requests/requests.py +15 -0
- sky/server/server.py +63 -20
- sky/server/uvicorn.py +12 -2
- sky/sky_logging.py +40 -2
- sky/skylet/log_lib.py +60 -11
- sky/skylet/log_lib.pyi +5 -0
- sky/task.py +8 -6
- sky/utils/cli_utils/status_utils.py +6 -5
- sky/utils/command_runner.py +3 -0
- sky/utils/context.py +264 -0
- sky/utils/context_utils.py +172 -0
- sky/utils/controller_utils.py +39 -43
- sky/utils/dag_utils.py +4 -2
- sky/utils/resources_utils.py +3 -0
- sky/utils/rich_utils.py +81 -37
- sky/utils/schemas.py +33 -24
- sky/utils/subprocess_utils.py +8 -2
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/RECORD +65 -63
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/WHEEL +1 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- /sky/dashboard/out/_next/static/{tdxxQrPV6NW90a983oHXe → jFI0Y-uJZ_XDK5IGJpKFU}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py
CHANGED
@@ -172,9 +172,6 @@ def launch(
|
|
172
172
|
controller_resources = controller_utils.get_controller_resources(
|
173
173
|
controller=controller,
|
174
174
|
task_resources=sum([list(t.resources) for t in dag.tasks], []))
|
175
|
-
controller_idle_minutes_to_autostop, controller_down = (
|
176
|
-
controller_utils.get_controller_autostop_config(
|
177
|
-
controller=controller))
|
178
175
|
|
179
176
|
vars_to_fill = {
|
180
177
|
'remote_user_yaml_path': remote_user_yaml_path,
|
@@ -216,15 +213,12 @@ def launch(
|
|
216
213
|
# Launch with the api server's user hash, so that sky status does not
|
217
214
|
# show the owner of the controller as whatever user launched it first.
|
218
215
|
with common.with_server_user_hash():
|
219
|
-
return execution.launch(
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
retry_until_up=True,
|
226
|
-
fast=True,
|
227
|
-
_disable_controller_check=True)
|
216
|
+
return execution.launch(task=controller_task,
|
217
|
+
cluster_name=controller_name,
|
218
|
+
stream_logs=stream_logs,
|
219
|
+
retry_until_up=True,
|
220
|
+
fast=True,
|
221
|
+
_disable_controller_check=True)
|
228
222
|
|
229
223
|
|
230
224
|
def queue_from_kubernetes_pod(
|
sky/optimizer.py
CHANGED
@@ -671,7 +671,7 @@ class Optimizer:
|
|
671
671
|
plan: Dict[task_lib.Task, resources_lib.Resources],
|
672
672
|
) -> float:
|
673
673
|
"""Estimates the total cost of running the DAG by the plan."""
|
674
|
-
total_cost = 0
|
674
|
+
total_cost = 0.
|
675
675
|
for node in topo_order:
|
676
676
|
resources = plan[node]
|
677
677
|
if node.time_estimator_func is None:
|
@@ -777,10 +777,13 @@ class Optimizer:
|
|
777
777
|
accelerators = resources.get_accelerators_str()
|
778
778
|
spot = resources.get_spot_str()
|
779
779
|
cloud = resources.cloud
|
780
|
-
|
780
|
+
assert cloud is not None, 'Cloud must be specified'
|
781
|
+
assert (resources.instance_type is not None), \
|
782
|
+
'Instance type must be specified'
|
783
|
+
vcpus_, mem_ = cloud.get_vcpus_mem_from_instance_type(
|
781
784
|
resources.instance_type)
|
782
785
|
|
783
|
-
def format_number(x):
|
786
|
+
def format_number(x: Optional[float]) -> str:
|
784
787
|
if x is None:
|
785
788
|
return '-'
|
786
789
|
elif x.is_integer():
|
@@ -788,8 +791,8 @@ class Optimizer:
|
|
788
791
|
else:
|
789
792
|
return f'{x:.1f}'
|
790
793
|
|
791
|
-
vcpus = format_number(
|
792
|
-
mem = format_number(
|
794
|
+
vcpus = format_number(vcpus_)
|
795
|
+
mem = format_number(mem_)
|
793
796
|
|
794
797
|
if resources.zone is None:
|
795
798
|
region_or_zone = resources.region
|
@@ -814,11 +817,12 @@ class Optimizer:
|
|
814
817
|
|
815
818
|
accelerators = resources.get_accelerators_str()
|
816
819
|
spot = resources.get_spot_str()
|
820
|
+
resources = resources.assert_launchable()
|
817
821
|
cloud = resources.cloud
|
818
|
-
|
822
|
+
vcpus_, mem_ = cloud.get_vcpus_mem_from_instance_type(
|
819
823
|
resources.instance_type)
|
820
824
|
|
821
|
-
def format_number(x):
|
825
|
+
def format_number(x: Optional[float]) -> str:
|
822
826
|
if x is None:
|
823
827
|
return '-'
|
824
828
|
elif x.is_integer():
|
@@ -826,8 +830,8 @@ class Optimizer:
|
|
826
830
|
else:
|
827
831
|
return f'{x:.1f}'
|
828
832
|
|
829
|
-
vcpus = format_number(
|
830
|
-
mem = format_number(
|
833
|
+
vcpus = format_number(vcpus_)
|
834
|
+
mem = format_number(mem_)
|
831
835
|
|
832
836
|
if resources.zone is None:
|
833
837
|
region_or_zone = resources.region
|
@@ -1195,10 +1199,12 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
|
|
1195
1199
|
all_clouds_specified.add(cloud_str)
|
1196
1200
|
|
1197
1201
|
# Explicitly check again to update the enabled cloud list.
|
1198
|
-
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1202
|
+
clouds_to_check_again = list(clouds_need_recheck -
|
1203
|
+
global_disabled_clouds)
|
1204
|
+
if len(clouds_to_check_again) > 0:
|
1205
|
+
sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
|
1206
|
+
quiet=True,
|
1207
|
+
clouds=clouds_to_check_again)
|
1202
1208
|
enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
|
1203
1209
|
capability=sky_cloud.CloudCapability.COMPUTE,
|
1204
1210
|
raise_if_no_cloud_access=True)
|
@@ -1736,9 +1736,16 @@ class KubernetesInstanceType:
|
|
1736
1736
|
@staticmethod
|
1737
1737
|
def is_valid_instance_type(name: str) -> bool:
|
1738
1738
|
"""Returns whether the given name is a valid instance type."""
|
1739
|
+
# Before https://github.com/skypilot-org/skypilot/pull/4756,
|
1740
|
+
# the accelerators are appended with format "--{a}{type}",
|
1741
|
+
# e.g. "4CPU--16GB--1V100".
|
1742
|
+
# Check both patterns to keep backward compatibility.
|
1743
|
+
# TODO(romilb): Backward compatibility, remove after 0.11.0.
|
1744
|
+
prev_pattern = re.compile(
|
1745
|
+
r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--\d+\S+)?$')
|
1739
1746
|
pattern = re.compile(
|
1740
1747
|
r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
|
1741
|
-
return bool(pattern.match(name))
|
1748
|
+
return bool(pattern.match(name)) or bool(prev_pattern.match(name))
|
1742
1749
|
|
1743
1750
|
@classmethod
|
1744
1751
|
def _parse_instance_type(
|
@@ -1755,6 +1762,11 @@ class KubernetesInstanceType:
|
|
1755
1762
|
r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$' # pylint: disable=line-too-long
|
1756
1763
|
)
|
1757
1764
|
match = pattern.match(name)
|
1765
|
+
# TODO(romilb): Backward compatibility, remove after 0.11.0.
|
1766
|
+
prev_pattern = re.compile(
|
1767
|
+
r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_count>\d+)(?P<accelerator_type>\S+))?$' # pylint: disable=line-too-long
|
1768
|
+
)
|
1769
|
+
prev_match = prev_pattern.match(name)
|
1758
1770
|
if match:
|
1759
1771
|
cpus = float(match.group('cpus'))
|
1760
1772
|
memory = float(match.group('memory'))
|
@@ -1769,6 +1781,19 @@ class KubernetesInstanceType:
|
|
1769
1781
|
accelerator_count = None
|
1770
1782
|
accelerator_type = None
|
1771
1783
|
return cpus, memory, accelerator_count, accelerator_type
|
1784
|
+
# TODO(romilb): Backward compatibility, remove after 0.11.0.
|
1785
|
+
elif prev_match:
|
1786
|
+
cpus = float(prev_match.group('cpus'))
|
1787
|
+
memory = float(prev_match.group('memory'))
|
1788
|
+
accelerator_count = prev_match.group('accelerator_count')
|
1789
|
+
accelerator_type = prev_match.group('accelerator_type')
|
1790
|
+
if accelerator_count:
|
1791
|
+
accelerator_count = int(accelerator_count)
|
1792
|
+
accelerator_type = str(accelerator_type).replace('_', ' ')
|
1793
|
+
else:
|
1794
|
+
accelerator_count = None
|
1795
|
+
accelerator_type = None
|
1796
|
+
return cpus, memory, accelerator_count, accelerator_type
|
1772
1797
|
else:
|
1773
1798
|
raise ValueError(f'Invalid instance name: {name}')
|
1774
1799
|
|