skypilot-nightly 1.0.0.dev20250514__py3-none-any.whl → 1.0.0.dev20250516__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +3 -2
  3. sky/backends/backend_utils.py +19 -17
  4. sky/backends/cloud_vm_ray_backend.py +30 -11
  5. sky/clouds/aws.py +11 -9
  6. sky/clouds/azure.py +16 -13
  7. sky/clouds/cloud.py +4 -3
  8. sky/clouds/cudo.py +3 -2
  9. sky/clouds/do.py +3 -2
  10. sky/clouds/fluidstack.py +3 -3
  11. sky/clouds/gcp.py +1 -1
  12. sky/clouds/ibm.py +12 -10
  13. sky/clouds/kubernetes.py +3 -2
  14. sky/clouds/lambda_cloud.py +6 -6
  15. sky/clouds/nebius.py +6 -5
  16. sky/clouds/oci.py +9 -7
  17. sky/clouds/paperspace.py +3 -2
  18. sky/clouds/runpod.py +9 -9
  19. sky/clouds/scp.py +5 -3
  20. sky/clouds/vast.py +8 -7
  21. sky/clouds/vsphere.py +4 -2
  22. sky/core.py +18 -12
  23. sky/dashboard/out/404.html +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/index-6b0d9e5031b70c58.js +1 -0
  25. sky/dashboard/out/_next/static/{tdxxQrPV6NW90a983oHXe → y1yf6Xc0zwam5fFluIyUm}/_buildManifest.js +1 -1
  26. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  27. sky/dashboard/out/clusters/[cluster].html +1 -1
  28. sky/dashboard/out/clusters.html +1 -1
  29. sky/dashboard/out/index.html +1 -1
  30. sky/dashboard/out/jobs/[job].html +1 -1
  31. sky/dashboard/out/jobs.html +1 -1
  32. sky/execution.py +33 -0
  33. sky/global_user_state.py +2 -0
  34. sky/jobs/recovery_strategy.py +4 -1
  35. sky/jobs/server/core.py +6 -12
  36. sky/optimizer.py +19 -13
  37. sky/provision/kubernetes/utils.py +26 -1
  38. sky/resources.py +203 -44
  39. sky/serve/server/core.py +0 -5
  40. sky/serve/spot_placer.py +3 -0
  41. sky/server/requests/executor.py +114 -22
  42. sky/server/requests/requests.py +15 -0
  43. sky/server/server.py +63 -20
  44. sky/server/uvicorn.py +12 -2
  45. sky/setup_files/dependencies.py +4 -1
  46. sky/sky_logging.py +40 -2
  47. sky/skylet/log_lib.py +60 -11
  48. sky/skylet/log_lib.pyi +5 -0
  49. sky/task.py +8 -6
  50. sky/utils/cli_utils/status_utils.py +6 -5
  51. sky/utils/command_runner.py +3 -0
  52. sky/utils/context.py +264 -0
  53. sky/utils/context_utils.py +172 -0
  54. sky/utils/controller_utils.py +39 -43
  55. sky/utils/dag_utils.py +4 -2
  56. sky/utils/resources_utils.py +3 -0
  57. sky/utils/rich_utils.py +81 -37
  58. sky/utils/schemas.py +33 -24
  59. sky/utils/subprocess_utils.py +8 -2
  60. {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250516.dist-info}/METADATA +2 -2
  61. {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250516.dist-info}/RECORD +66 -64
  62. {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250516.dist-info}/WHEEL +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  64. sky/dashboard/out/_next/static/{tdxxQrPV6NW90a983oHXe → y1yf6Xc0zwam5fFluIyUm}/_ssgManifest.js +0 -0
  65. {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250516.dist-info}/entry_points.txt +0 -0
  66. {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250516.dist-info}/licenses/LICENSE +0 -0
  67. {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250516.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py CHANGED
@@ -172,9 +172,6 @@ def launch(
172
172
  controller_resources = controller_utils.get_controller_resources(
173
173
  controller=controller,
174
174
  task_resources=sum([list(t.resources) for t in dag.tasks], []))
175
- controller_idle_minutes_to_autostop, controller_down = (
176
- controller_utils.get_controller_autostop_config(
177
- controller=controller))
178
175
 
179
176
  vars_to_fill = {
180
177
  'remote_user_yaml_path': remote_user_yaml_path,
@@ -216,15 +213,12 @@ def launch(
216
213
  # Launch with the api server's user hash, so that sky status does not
217
214
  # show the owner of the controller as whatever user launched it first.
218
215
  with common.with_server_user_hash():
219
- return execution.launch(
220
- task=controller_task,
221
- cluster_name=controller_name,
222
- stream_logs=stream_logs,
223
- idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
224
- down=controller_down,
225
- retry_until_up=True,
226
- fast=True,
227
- _disable_controller_check=True)
216
+ return execution.launch(task=controller_task,
217
+ cluster_name=controller_name,
218
+ stream_logs=stream_logs,
219
+ retry_until_up=True,
220
+ fast=True,
221
+ _disable_controller_check=True)
228
222
 
229
223
 
230
224
  def queue_from_kubernetes_pod(
sky/optimizer.py CHANGED
@@ -671,7 +671,7 @@ class Optimizer:
671
671
  plan: Dict[task_lib.Task, resources_lib.Resources],
672
672
  ) -> float:
673
673
  """Estimates the total cost of running the DAG by the plan."""
674
- total_cost = 0
674
+ total_cost = 0.
675
675
  for node in topo_order:
676
676
  resources = plan[node]
677
677
  if node.time_estimator_func is None:
@@ -777,10 +777,13 @@ class Optimizer:
777
777
  accelerators = resources.get_accelerators_str()
778
778
  spot = resources.get_spot_str()
779
779
  cloud = resources.cloud
780
- vcpus, mem = cloud.get_vcpus_mem_from_instance_type(
780
+ assert cloud is not None, 'Cloud must be specified'
781
+ assert (resources.instance_type is not None), \
782
+ 'Instance type must be specified'
783
+ vcpus_, mem_ = cloud.get_vcpus_mem_from_instance_type(
781
784
  resources.instance_type)
782
785
 
783
- def format_number(x):
786
+ def format_number(x: Optional[float]) -> str:
784
787
  if x is None:
785
788
  return '-'
786
789
  elif x.is_integer():
@@ -788,8 +791,8 @@ class Optimizer:
788
791
  else:
789
792
  return f'{x:.1f}'
790
793
 
791
- vcpus = format_number(vcpus)
792
- mem = format_number(mem)
794
+ vcpus = format_number(vcpus_)
795
+ mem = format_number(mem_)
793
796
 
794
797
  if resources.zone is None:
795
798
  region_or_zone = resources.region
@@ -814,11 +817,12 @@ class Optimizer:
814
817
 
815
818
  accelerators = resources.get_accelerators_str()
816
819
  spot = resources.get_spot_str()
820
+ resources = resources.assert_launchable()
817
821
  cloud = resources.cloud
818
- vcpus, mem = cloud.get_vcpus_mem_from_instance_type(
822
+ vcpus_, mem_ = cloud.get_vcpus_mem_from_instance_type(
819
823
  resources.instance_type)
820
824
 
821
- def format_number(x):
825
+ def format_number(x: Optional[float]) -> str:
822
826
  if x is None:
823
827
  return '-'
824
828
  elif x.is_integer():
@@ -826,8 +830,8 @@ class Optimizer:
826
830
  else:
827
831
  return f'{x:.1f}'
828
832
 
829
- vcpus = format_number(vcpus)
830
- mem = format_number(mem)
833
+ vcpus = format_number(vcpus_)
834
+ mem = format_number(mem_)
831
835
 
832
836
  if resources.zone is None:
833
837
  region_or_zone = resources.region
@@ -1195,10 +1199,12 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
1195
1199
  all_clouds_specified.add(cloud_str)
1196
1200
 
1197
1201
  # Explicitly check again to update the enabled cloud list.
1198
- sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
1199
- quiet=True,
1200
- clouds=list(clouds_need_recheck -
1201
- global_disabled_clouds))
1202
+ clouds_to_check_again = list(clouds_need_recheck -
1203
+ global_disabled_clouds)
1204
+ if len(clouds_to_check_again) > 0:
1205
+ sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
1206
+ quiet=True,
1207
+ clouds=clouds_to_check_again)
1202
1208
  enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
1203
1209
  capability=sky_cloud.CloudCapability.COMPUTE,
1204
1210
  raise_if_no_cloud_access=True)
sky/provision/kubernetes/utils.py CHANGED
@@ -1736,9 +1736,16 @@ class KubernetesInstanceType:
1736
1736
  @staticmethod
1737
1737
  def is_valid_instance_type(name: str) -> bool:
1738
1738
  """Returns whether the given name is a valid instance type."""
1739
+ # Before https://github.com/skypilot-org/skypilot/pull/4756,
1740
+ # the accelerators are appended with format "--{a}{type}",
1741
+ # e.g. "4CPU--16GB--1V100".
1742
+ # Check both patterns to keep backward compatibility.
1743
+ # TODO(romilb): Backward compatibility, remove after 0.11.0.
1744
+ prev_pattern = re.compile(
1745
+ r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--\d+\S+)?$')
1739
1746
  pattern = re.compile(
1740
1747
  r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
1741
- return bool(pattern.match(name))
1748
+ return bool(pattern.match(name)) or bool(prev_pattern.match(name))
1742
1749
 
1743
1750
  @classmethod
1744
1751
  def _parse_instance_type(
@@ -1755,6 +1762,11 @@ class KubernetesInstanceType:
1755
1762
  r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$' # pylint: disable=line-too-long
1756
1763
  )
1757
1764
  match = pattern.match(name)
1765
+ # TODO(romilb): Backward compatibility, remove after 0.11.0.
1766
+ prev_pattern = re.compile(
1767
+ r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_count>\d+)(?P<accelerator_type>\S+))?$' # pylint: disable=line-too-long
1768
+ )
1769
+ prev_match = prev_pattern.match(name)
1758
1770
  if match:
1759
1771
  cpus = float(match.group('cpus'))
1760
1772
  memory = float(match.group('memory'))
@@ -1769,6 +1781,19 @@ class KubernetesInstanceType:
1769
1781
  accelerator_count = None
1770
1782
  accelerator_type = None
1771
1783
  return cpus, memory, accelerator_count, accelerator_type
1784
+ # TODO(romilb): Backward compatibility, remove after 0.11.0.
1785
+ elif prev_match:
1786
+ cpus = float(prev_match.group('cpus'))
1787
+ memory = float(prev_match.group('memory'))
1788
+ accelerator_count = prev_match.group('accelerator_count')
1789
+ accelerator_type = prev_match.group('accelerator_type')
1790
+ if accelerator_count:
1791
+ accelerator_count = int(accelerator_count)
1792
+ accelerator_type = str(accelerator_type).replace('_', ' ')
1793
+ else:
1794
+ accelerator_count = None
1795
+ accelerator_type = None
1796
+ return cpus, memory, accelerator_count, accelerator_type
1772
1797
  else:
1773
1798
  raise ValueError(f'Invalid instance name: {name}')
1774
1799