skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250617__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. sky/__init__.py +2 -4
  2. sky/backends/cloud_vm_ray_backend.py +43 -60
  3. sky/cli.py +55 -637
  4. sky/client/cli.py +55 -637
  5. sky/clouds/kubernetes.py +3 -0
  6. sky/clouds/scp.py +7 -26
  7. sky/clouds/utils/scp_utils.py +177 -124
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
  10. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
  11. sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_buildManifest.js +1 -1
  12. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  13. sky/dashboard/out/clusters/[cluster].html +1 -1
  14. sky/dashboard/out/clusters.html +1 -1
  15. sky/dashboard/out/config.html +1 -1
  16. sky/dashboard/out/index.html +1 -1
  17. sky/dashboard/out/infra/[context].html +1 -1
  18. sky/dashboard/out/infra.html +1 -1
  19. sky/dashboard/out/jobs/[job].html +1 -1
  20. sky/dashboard/out/jobs.html +1 -1
  21. sky/dashboard/out/users.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/jobs/controller.py +98 -31
  26. sky/jobs/scheduler.py +37 -29
  27. sky/jobs/server/core.py +36 -3
  28. sky/jobs/state.py +69 -9
  29. sky/jobs/utils.py +11 -0
  30. sky/provision/__init__.py +1 -0
  31. sky/provision/scp/__init__.py +15 -0
  32. sky/provision/scp/config.py +93 -0
  33. sky/provision/scp/instance.py +528 -0
  34. sky/resources.py +164 -29
  35. sky/skylet/constants.py +39 -0
  36. sky/skylet/job_lib.py +8 -0
  37. sky/task.py +171 -21
  38. sky/templates/kubernetes-ray.yml.j2 +51 -4
  39. sky/templates/scp-ray.yml.j2 +3 -50
  40. sky/users/permission.py +19 -36
  41. sky/utils/command_runner.py +1 -1
  42. sky/utils/common_utils.py +16 -14
  43. sky/utils/context.py +1 -1
  44. sky/utils/controller_utils.py +12 -3
  45. sky/utils/dag_utils.py +17 -4
  46. sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
  47. sky/utils/schemas.py +43 -5
  48. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/METADATA +1 -1
  49. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/RECORD +54 -57
  50. sky/benchmark/__init__.py +0 -0
  51. sky/benchmark/benchmark_state.py +0 -295
  52. sky/benchmark/benchmark_utils.py +0 -641
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
  55. sky/skylet/providers/scp/__init__.py +0 -2
  56. sky/skylet/providers/scp/config.py +0 -149
  57. sky/skylet/providers/scp/node_provider.py +0 -578
  58. /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_ssgManifest.js +0 -0
  59. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/WHEEL +0 -0
  60. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/entry_points.txt +0 -0
  61. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/licenses/LICENSE +0 -0
  62. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/top_level.txt +0 -0
sky/resources.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """Resources: compute requirements of Tasks."""
2
2
  import dataclasses
3
+ import math
3
4
  import textwrap
4
5
  import typing
5
6
  from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
@@ -60,7 +61,7 @@ class AutostopConfig:
60
61
 
61
62
  @classmethod
62
63
  def from_yaml_config(
63
- cls, config: Union[bool, int, Dict[str, Any], None]
64
+ cls, config: Union[bool, int, str, Dict[str, Any], None]
64
65
  ) -> Optional['AutostopConfig']:
65
66
  if isinstance(config, bool):
66
67
  if config:
@@ -71,6 +72,11 @@ class AutostopConfig:
71
72
  if isinstance(config, int):
72
73
  return cls(idle_minutes=config, down=False, enabled=True)
73
74
 
75
+ if isinstance(config, str):
76
+ return cls(idle_minutes=parse_time_minutes(config),
77
+ down=False,
78
+ enabled=True)
79
+
74
80
  if isinstance(config, dict):
75
81
  # If we have a dict, autostop is enabled. (Only way to disable is
76
82
  # with `false`, a bool.)
@@ -101,7 +107,7 @@ class Resources:
101
107
  """
102
108
  # If any fields changed, increment the version. For backward compatibility,
103
109
  # modify the __setstate__ method to handle the old version.
104
- _VERSION = 26
110
+ _VERSION = 27
105
111
 
106
112
  def __init__(
107
113
  self,
@@ -118,12 +124,13 @@ class Resources:
118
124
  region: Optional[str] = None,
119
125
  zone: Optional[str] = None,
120
126
  image_id: Union[Dict[Optional[str], str], str, None] = None,
121
- disk_size: Optional[int] = None,
127
+ disk_size: Optional[Union[str, int]] = None,
122
128
  disk_tier: Optional[Union[str, resources_utils.DiskTier]] = None,
123
129
  network_tier: Optional[Union[str, resources_utils.NetworkTier]] = None,
124
130
  ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
125
131
  labels: Optional[Dict[str, str]] = None,
126
- autostop: Union[bool, int, Dict[str, Any], None] = None,
132
+ autostop: Union[bool, int, str, Dict[str, Any], None] = None,
133
+ priority: Optional[int] = None,
127
134
  volumes: Optional[List[Dict[str, Any]]] = None,
128
135
  # Internal use only.
129
136
  # pylint: disable=invalid-name
@@ -217,6 +224,9 @@ class Resources:
217
224
  not supported and will be ignored.
218
225
  autostop: the autostop configuration to use. For launched resources,
219
226
  may or may not correspond to the actual current autostop config.
227
+ priority: the priority for this resource configuration. Must be an
228
+ integer from 0 to 1000, where higher values indicate higher priority.
229
+ If None, no priority is set.
220
230
  volumes: the volumes to mount on the instance.
221
231
  _docker_login_config: the docker configuration to use. This includes
222
232
  the docker username, password, and registry server. If None, skip
@@ -279,11 +289,7 @@ class Resources:
279
289
  self._job_recovery = job_recovery
280
290
 
281
291
  if disk_size is not None:
282
- if round(disk_size) != disk_size:
283
- with ux_utils.print_exception_no_traceback():
284
- raise ValueError(
285
- f'OS disk size must be an integer. Got: {disk_size}.')
286
- self._disk_size = int(disk_size)
292
+ self._disk_size = int(parse_memory_resource(disk_size, 'disk_size'))
287
293
  else:
288
294
  self._disk_size = _DEFAULT_DISK_SIZE_GB
289
295
 
@@ -357,10 +363,14 @@ class Resources:
357
363
  self._cluster_config_overrides = _cluster_config_overrides
358
364
  self._cached_repr: Optional[str] = None
359
365
 
366
+ # Initialize _priority before calling the setter
367
+ self._priority: Optional[int] = None
368
+
360
369
  self._set_cpus(cpus)
361
370
  self._set_memory(memory)
362
371
  self._set_accelerators(accelerators, accelerator_args)
363
372
  self._set_autostop_config(autostop)
373
+ self._set_priority(priority)
364
374
  self._set_volumes(volumes)
365
375
 
366
376
  def validate(self):
@@ -617,6 +627,14 @@ class Resources:
617
627
  """
618
628
  return self._autostop_config
619
629
 
630
+ @property
631
+ def priority(self) -> Optional[int]:
632
+ """The priority for this resource configuration.
633
+
634
+ Higher values indicate higher priority. Valid range is 0-1000.
635
+ """
636
+ return self._priority
637
+
620
638
  @property
621
639
  def is_image_managed(self) -> Optional[bool]:
622
640
  return self._is_image_managed
@@ -689,25 +707,27 @@ class Resources:
689
707
  self._memory = None
690
708
  return
691
709
 
692
- self._memory = str(memory)
693
- if isinstance(memory, str):
694
- if memory.endswith(('+', 'x')):
695
- # 'x' is used internally for make sure our resources used by
696
- # jobs controller (memory: 3x) to have enough memory based on
697
- # the vCPUs.
698
- num_memory_gb = memory[:-1]
699
- else:
700
- num_memory_gb = memory
701
-
702
- try:
703
- memory_gb = float(num_memory_gb)
704
- except ValueError:
705
- with ux_utils.print_exception_no_traceback():
706
- raise ValueError(
707
- f'The "memory" field should be either a number or '
708
- f'a string "<number>+". Found: {memory!r}') from None
710
+ memory = parse_memory_resource(str(memory),
711
+ 'memory',
712
+ ret_type=float,
713
+ allow_plus=True,
714
+ allow_x=True)
715
+ self._memory = memory
716
+ if memory.endswith(('+', 'x')):
717
+ # 'x' is used internally for make sure our resources used by
718
+ # jobs controller (memory: 3x) to have enough memory based on
719
+ # the vCPUs.
720
+ num_memory_gb = memory[:-1]
709
721
  else:
710
- memory_gb = float(memory)
722
+ num_memory_gb = memory
723
+
724
+ try:
725
+ memory_gb = float(num_memory_gb)
726
+ except ValueError:
727
+ with ux_utils.print_exception_no_traceback():
728
+ raise ValueError(
729
+ f'The "memory" field should be either a number or '
730
+ f'a string "<number>+". Found: {memory!r}') from None
711
731
 
712
732
  if memory_gb <= 0:
713
733
  with ux_utils.print_exception_no_traceback():
@@ -796,10 +816,24 @@ class Resources:
796
816
 
797
817
  def _set_autostop_config(
798
818
  self,
799
- autostop: Union[bool, int, Dict[str, Any], None],
819
+ autostop: Union[bool, int, str, Dict[str, Any], None],
800
820
  ) -> None:
801
821
  self._autostop_config = AutostopConfig.from_yaml_config(autostop)
802
822
 
823
+ def _set_priority(self, priority: Optional[int]) -> None:
824
+ """Sets the priority for this resource configuration.
825
+
826
+ Args:
827
+ priority: Priority value from 0 to 1000, where higher values
828
+ indicate higher priority. If None, no priority is set.
829
+ """
830
+ if priority is not None:
831
+ if not 0 <= priority <= 1000:
832
+ with ux_utils.print_exception_no_traceback():
833
+ raise ValueError(f'Priority must be between 0 and 1000. '
834
+ f'Found: {priority}')
835
+ self._priority = priority
836
+
803
837
  def _set_volumes(
804
838
  self,
805
839
  volumes: Optional[List[Dict[str, Any]]],
@@ -852,6 +886,7 @@ class Resources:
852
886
  else:
853
887
  volume['attach_mode'] = read_write_mode
854
888
  if volume['storage_type'] == network_type:
889
+ # TODO(luca): add units to this disk_size as well
855
890
  if ('disk_size' in volume and
856
891
  round(volume['disk_size']) != volume['disk_size']):
857
892
  with ux_utils.print_exception_no_traceback():
@@ -1716,6 +1751,7 @@ class Resources:
1716
1751
  ports=override.pop('ports', self.ports),
1717
1752
  labels=override.pop('labels', self.labels),
1718
1753
  autostop=override.pop('autostop', current_autostop_config),
1754
+ priority=override.pop('priority', self.priority),
1719
1755
  volumes=override.pop('volumes', self.volumes),
1720
1756
  infra=override.pop('infra', None),
1721
1757
  _docker_login_config=override.pop('_docker_login_config',
@@ -1936,6 +1972,7 @@ class Resources:
1936
1972
  resources_fields['ports'] = config.pop('ports', None)
1937
1973
  resources_fields['labels'] = config.pop('labels', None)
1938
1974
  resources_fields['autostop'] = config.pop('autostop', None)
1975
+ resources_fields['priority'] = config.pop('priority', None)
1939
1976
  resources_fields['volumes'] = config.pop('volumes', None)
1940
1977
  resources_fields['_docker_login_config'] = config.pop(
1941
1978
  '_docker_login_config', None)
@@ -1955,7 +1992,9 @@ class Resources:
1955
1992
  resources_fields['accelerator_args'] = dict(
1956
1993
  resources_fields['accelerator_args'])
1957
1994
  if resources_fields['disk_size'] is not None:
1958
- resources_fields['disk_size'] = int(resources_fields['disk_size'])
1995
+ # although it will end up being an int, we don't know at this point
1996
+ # if it has units or not, so we store it as a string
1997
+ resources_fields['disk_size'] = str(resources_fields['disk_size'])
1959
1998
 
1960
1999
  assert not config, f'Invalid resource args: {config.keys()}'
1961
2000
  return Resources(**resources_fields)
@@ -2006,6 +2045,7 @@ class Resources:
2006
2045
  config['volumes'] = volumes
2007
2046
  if self._autostop_config is not None:
2008
2047
  config['autostop'] = self._autostop_config.to_yaml_config()
2048
+ add_if_not_none('priority', self.priority)
2009
2049
  if self._docker_login_config is not None:
2010
2050
  config['_docker_login_config'] = dataclasses.asdict(
2011
2051
  self._docker_login_config)
@@ -2174,6 +2214,9 @@ class Resources:
2174
2214
  if version < 26:
2175
2215
  self._network_tier = state.get('_network_tier', None)
2176
2216
 
2217
+ if version < 27:
2218
+ self._priority = None
2219
+
2177
2220
  self.__dict__.update(state)
2178
2221
 
2179
2222
 
@@ -2219,3 +2262,95 @@ def _maybe_add_docker_prefix_to_image_id(
2219
2262
  for k, v in image_id_dict.items():
2220
2263
  if not v.startswith('docker:'):
2221
2264
  image_id_dict[k] = f'docker:{v}'
2265
+
2266
+
2267
def parse_time_minutes(time: str) -> int:
    """Convert a time string to a whole number of minutes.

    A bare non-negative integer (e.g. '30') is taken to already be in
    minutes. Otherwise the string must be an integer followed by one of the
    unit suffixes in ``constants.TIME_UNITS``, matched case-insensitively
    (e.g. '30m', '2h', '1d'). Fractional results are rounded up, so '90s'
    yields 2.

    Args:
        time: Time string with an optional unit suffix.

    Returns:
        The duration in minutes, rounded up to the next whole minute.

    Raises:
        ValueError: If the string is not an integer with an optional known
            unit suffix, or if the duration is negative.
    """
    time_str = str(time)

    if time_str.isdecimal():
        # We assume it is already in minutes to maintain backwards
        # compatibility.
        return int(time_str)

    time_str = time_str.lower()
    for unit, multiplier in constants.TIME_UNITS.items():
        if not time_str.endswith(unit):
            continue
        try:
            value = int(time_str[:-len(unit)])
        except ValueError:
            # What precedes this suffix is not a number; a longer suffix
            # may still match.
            continue
        if value < 0:
            # The bare-number path above rejects a leading '-', so a
            # negative duration with a unit must be rejected as well.
            break
        return math.ceil(value * multiplier)

    raise ValueError(f'Invalid time format: {time}')
2293
+
2294
+
2295
def parse_memory_resource(resource_qty_str: Union[str, int, float],
                          field_name: str,
                          ret_type: type = int,
                          unit: str = 'g',
                          allow_plus: bool = False,
                          allow_x: bool = False,
                          allow_rounding: bool = False) -> str:
    """Returns memory size in chosen units given a resource quantity string.

    A quantity without a unit suffix is assumed to already be in ``unit`` and
    is returned as written. A quantity with a suffix from
    ``constants.MEMORY_SIZE_UNITS`` (matched case-insensitively) is converted
    to ``unit``.

    Args:
        resource_qty_str: Resource quantity, e.g. 16, '16', '16gb', '16+'.
        field_name: Name of the field being parsed; only used in the error
            message.
        ret_type: Numeric type (int or float) used to parse the number and
            to format the converted result.
        unit: Unit to convert to; must be a key of
            ``constants.MEMORY_SIZE_UNITS``.
        allow_plus: Whether to allow a trailing '+'.
        allow_x: Whether to allow a trailing 'x' (only used internally for
            the jobs controller).
        allow_rounding: Whether a converted value that ``ret_type`` cannot
            represent exactly may be coerced with ``ret_type(...)`` (for int
            this truncates) instead of being rejected.

    Returns:
        The quantity as a string, followed by the '+' and/or 'x' suffix if
        one was present on the input.

    Raises:
        ValueError: If the quantity is malformed, uses a disallowed suffix,
            is not finite, or cannot be represented exactly in ``unit`` while
            ``allow_rounding`` is False.
    """
    assert unit in constants.MEMORY_SIZE_UNITS, f'Invalid unit: {unit}'

    error_msg = (f'"{field_name}" field should be a <int><b|k|m|g|t|p><+?>,'
                 f' got {resource_qty_str}')

    resource_str = str(resource_qty_str)
    if (ret_type is int and isinstance(resource_qty_str, float) and
            resource_qty_str.is_integer()):
        # str(50.0) is '50.0', which int() rejects; an integral float is
        # still a valid whole quantity.
        resource_str = str(int(resource_qty_str))

    # Handle plus and x suffixes, x is only used internally for jobs controller
    plus = ''
    if resource_str.endswith('+'):
        if not allow_plus:
            raise ValueError(error_msg)
        resource_str = resource_str[:-1]
        plus = '+'

    x = ''
    if resource_str.endswith('x'):
        if not allow_x:
            raise ValueError(error_msg)
        resource_str = resource_str[:-1]
        x = 'x'

    try:
        bare_value = ret_type(resource_str)
    except ValueError:
        # Not a plain number; fall through to the unit-suffix handling.
        pass
    else:
        # float() accepts 'nan' and 'inf', which are not usable quantities.
        if isinstance(bare_value, float) and not math.isfinite(bare_value):
            raise ValueError(error_msg)
        # We assume it is already in the wanted units to maintain backwards
        # compatibility
        return f'{resource_str}{plus}{x}'

    resource_str = resource_str.lower()
    target_multiplier = constants.MEMORY_SIZE_UNITS[unit]
    for mem_unit, multiplier in constants.MEMORY_SIZE_UNITS.items():
        if not resource_str.endswith(mem_unit):
            continue
        try:
            value = ret_type(resource_str[:-len(mem_unit)])
        except ValueError:
            # e.g. '16gb' also ends with 'b', but '16g' is not a number;
            # keep looking for the suffix that leaves a parsable number.
            continue
        converted = value * multiplier / target_multiplier
        if not math.isfinite(converted):
            raise ValueError(error_msg)
        if not allow_rounding and ret_type(converted) != converted:
            raise ValueError(error_msg)
        return f'{ret_type(converted)}{plus}{x}'

    raise ValueError(error_msg)
sky/skylet/constants.py CHANGED
@@ -396,6 +396,10 @@ ROLE_ASSIGNMENT_FAILURE_ERROR_MSG = (
396
396
  # persistent through PVC. See kubernetes-ray.yml.j2.
397
397
  PERSISTENT_SETUP_SCRIPT_PATH = '~/.sky/.controller_recovery_setup_commands.sh'
398
398
  PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
399
+ # Signal file to indicate that the controller is recovering from a failure.
400
+ # See sky/jobs/utils.py::update_managed_jobs_statuses for more details.
401
+ PERSISTENT_RUN_RESTARTING_SIGNAL_FILE = (
402
+ '~/.sky/.controller_recovery_restarting_signal')
399
403
 
400
404
  # The placeholder for the local skypilot config path in file mounts for
401
405
  # controllers.
@@ -421,3 +425,38 @@ ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
421
425
 
422
426
  # The user ID of the SkyPilot system.
423
427
  SKYPILOT_SYSTEM_USER_ID = 'skypilot-system'
428
+
429
# Resources constants

# Accepted time-unit suffixes mapped to the number of minutes in one unit.
TIME_UNITS = {
    's': 1 / 60,
    'sec': 1 / 60,
    'm': 1,
    'min': 1,
    'h': 60,
    'hr': 60,
    'd': 24 * 60,
    'day': 24 * 60,
}

# Pattern string for a time value: digits followed by an optional unit suffix
# from TIME_UNITS.
# NOTE(review): the trailing '/i' is not Python `re` syntax; if this string is
# passed to `re` verbatim, '$/i' can never match. Presumably the consumer
# strips the '/i' and matches case-insensitively -- confirm.
TIME_PATTERN: str = (
    f'^[0-9]+({"|".join([unit.lower() for unit in TIME_UNITS])})?$/i')

# Accepted memory-size suffixes mapped to the number of bytes in one unit
# (powers of 1024).
MEMORY_SIZE_UNITS = {
    'b': 1,
    'k': 2**10,
    'kb': 2**10,
    'm': 2**20,
    'mb': 2**20,
    'g': 2**30,
    'gb': 2**30,
    't': 2**40,
    'tb': 2**40,
    'p': 2**50,
    'pb': 2**50,
}

# Pattern string for a memory size: digits followed by an optional unit suffix
# from MEMORY_SIZE_UNITS. The same '/i' caveat as TIME_PATTERN applies.
MEMORY_SIZE_PATTERN = (
    '^[0-9]+('
    f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}'
    ')?$/i')
# Variant built by dropping the trailing '$/i' (the last three characters) and
# appending '+?$/i'.
# NOTE(review): the appended '+' is unescaped, so a regex engine reads '+?' as
# a quantifier stacked on the preceding ')?' rather than as an optional
# literal '+'. Presumably '\+?' was intended -- confirm against the consumer.
MEMORY_SIZE_PLUS_PATTERN = f'{MEMORY_SIZE_PATTERN[:-3]}+?$/i'
sky/skylet/job_lib.py CHANGED
@@ -758,6 +758,14 @@ def fail_all_jobs_in_progress() -> None:
758
758
 
759
759
 
760
760
  def update_status() -> None:
761
+ # This signal file suggests that the controller is recovering from a
762
+ # failure. See sky/jobs/utils.py::update_managed_jobs_statuses for more
763
+ # details. When recovering, we should not update the job status to failed
764
+ # driver as they will be recovered later.
765
+ if os.path.exists(
766
+ os.path.expanduser(
767
+ constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
768
+ return
761
769
  # This will be called periodically by the skylet to update the status
762
770
  # of the jobs in the database, to avoid stale job status.
763
771
  nonterminal_jobs = _get_jobs(user_hash=None,