skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (81)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +452 -53
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
  11. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  12. sky/dashboard/out/clusters/[cluster].html +1 -1
  13. sky/dashboard/out/clusters.html +1 -1
  14. sky/dashboard/out/config.html +1 -1
  15. sky/dashboard/out/index.html +1 -1
  16. sky/dashboard/out/infra/[context].html +1 -1
  17. sky/dashboard/out/infra.html +1 -1
  18. sky/dashboard/out/jobs/[job].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/data/data_utils.py +21 -1
  26. sky/data/storage.py +12 -0
  27. sky/jobs/__init__.py +3 -0
  28. sky/jobs/client/sdk.py +80 -3
  29. sky/jobs/controller.py +76 -25
  30. sky/jobs/recovery_strategy.py +80 -34
  31. sky/jobs/scheduler.py +68 -20
  32. sky/jobs/server/core.py +228 -136
  33. sky/jobs/server/server.py +40 -0
  34. sky/jobs/state.py +129 -24
  35. sky/jobs/utils.py +109 -51
  36. sky/provision/nebius/constants.py +3 -0
  37. sky/provision/runpod/utils.py +27 -12
  38. sky/py.typed +0 -0
  39. sky/resources.py +16 -12
  40. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  41. sky/serve/autoscalers.py +8 -0
  42. sky/serve/client/impl.py +188 -0
  43. sky/serve/client/sdk.py +12 -82
  44. sky/serve/constants.py +5 -1
  45. sky/serve/controller.py +5 -0
  46. sky/serve/replica_managers.py +112 -37
  47. sky/serve/serve_state.py +16 -6
  48. sky/serve/serve_utils.py +274 -77
  49. sky/serve/server/core.py +8 -525
  50. sky/serve/server/impl.py +709 -0
  51. sky/serve/service.py +13 -9
  52. sky/serve/service_spec.py +74 -4
  53. sky/server/constants.py +1 -1
  54. sky/server/daemons.py +164 -0
  55. sky/server/requests/payloads.py +33 -0
  56. sky/server/requests/requests.py +2 -107
  57. sky/server/requests/serializers/decoders.py +12 -3
  58. sky/server/requests/serializers/encoders.py +13 -2
  59. sky/server/server.py +2 -1
  60. sky/server/uvicorn.py +2 -1
  61. sky/sky_logging.py +30 -0
  62. sky/skylet/constants.py +2 -1
  63. sky/skylet/events.py +9 -0
  64. sky/skypilot_config.py +24 -21
  65. sky/task.py +41 -11
  66. sky/templates/jobs-controller.yaml.j2 +3 -0
  67. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  68. sky/users/server.py +1 -1
  69. sky/utils/command_runner.py +4 -2
  70. sky/utils/controller_utils.py +14 -10
  71. sky/utils/dag_utils.py +4 -2
  72. sky/utils/db/migration_utils.py +2 -4
  73. sky/utils/schemas.py +47 -19
  74. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
  75. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
  76. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
  77. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
  78. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
  79. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
  80. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
  81. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
sky/skylet/constants.py CHANGED
@@ -382,7 +382,8 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
382
382
  # we skip the following keys because they are meant to be client-side configs.
383
383
  SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('api_server',),
384
384
  ('allowed_clouds',),
385
- ('workspaces',), ('db',)]
385
+ ('workspaces',), ('db',),
386
+ ('daemons',)]
386
387
 
387
388
  # Constants for Azure blob storage
388
389
  WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
sky/skylet/events.py CHANGED
@@ -75,7 +75,16 @@ class ManagedJobEvent(SkyletEvent):
75
75
  EVENT_INTERVAL_SECONDS = 300
76
76
 
77
77
  def _run(self):
78
+ logger.info('=== Updating managed job status ===')
78
79
  managed_job_utils.update_managed_jobs_statuses()
80
+
81
+
82
+ class ManagedJobSchedulingEvent(SkyletEvent):
83
+ """Skylet event for scheduling managed jobs."""
84
+ EVENT_INTERVAL_SECONDS = 20
85
+
86
+ def _run(self):
87
+ logger.info('=== Scheduling next jobs ===')
79
88
  managed_job_scheduler.maybe_schedule_next_jobs()
80
89
 
81
90
 
sky/skypilot_config.py CHANGED
@@ -495,6 +495,12 @@ def parse_and_validate_config_file(config_path: str) -> config_utils.Config:
495
495
  try:
496
496
  config_dict = common_utils.read_yaml(config_path)
497
497
  config = config_utils.Config.from_dict(config_dict)
498
+ # pop the db url from the config, and set it to the env var.
499
+ # this is to avoid db url (considered a sensitive value)
500
+ # being printed with the rest of the config.
501
+ db_url = config.pop_nested(('db',), None)
502
+ if db_url:
503
+ os.environ[constants.ENV_VAR_DB_CONNECTION_URI] = db_url
498
504
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
499
505
  logger.debug(f'Config loaded from {config_path}:\n'
500
506
  f'{common_utils.dump_yaml_str(dict(config))}')
@@ -556,21 +562,16 @@ def _reload_config_as_server() -> None:
556
562
  _set_loaded_config_path(None)
557
563
 
558
564
  server_config_path = _resolve_server_config_path()
559
- db_url_from_env = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
560
565
  server_config = _get_config_from_path(server_config_path)
561
- if db_url_from_env:
562
- server_config.set_nested(('db',), db_url_from_env)
563
-
564
- if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
565
- logger.debug(f'server config: \n'
566
- f'{common_utils.dump_yaml_str(dict(server_config))}')
567
-
568
- db_url = server_config.get_nested(('db',), None)
569
- if db_url and len(server_config.keys()) > 1:
570
- raise ValueError(
571
- 'if db config is specified, no other config is allowed')
566
+ # Get the db url from the env var. _get_config_from_path should have moved
567
+ # the db url specified in config file to the env var.
568
+ db_url = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
572
569
 
573
570
  if db_url:
571
+ if len(server_config.keys()) > 1:
572
+ raise ValueError(
573
+ 'If db config is specified, no other config is allowed')
574
+ logger.debug('retrieving config from database')
574
575
  with _DB_USE_LOCK:
575
576
  sqlalchemy_engine = sqlalchemy.create_engine(db_url,
576
577
  poolclass=NullPool)
@@ -591,14 +592,13 @@ def _reload_config_as_server() -> None:
591
592
 
592
593
  db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
593
594
  if db_config:
594
- if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
595
- logger.debug(
596
- f'Config loaded from db:\n'
597
- f'{common_utils.dump_yaml_str(dict(db_config))}')
598
595
  server_config = overlay_skypilot_config(server_config,
599
596
  db_config)
600
597
  # Close the engine to avoid connection leaks
601
598
  sqlalchemy_engine.dispose()
599
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
600
+ logger.debug(f'server config: \n'
601
+ f'{common_utils.dump_yaml_str(dict(server_config))}')
602
602
  _set_loaded_config(server_config)
603
603
  _set_loaded_config_path(server_config_path)
604
604
 
@@ -681,6 +681,10 @@ def override_skypilot_config(
681
681
 
682
682
  disallowed_diff_keys = []
683
683
  for key in constants.SKIPPED_CLIENT_OVERRIDE_KEYS:
684
+ if key == ('db',):
685
+ # since db key is popped out of server config, the key is expected
686
+ # to be different between client and server.
687
+ continue
684
688
  value = override_configs.pop_nested(key, default_value=None)
685
689
  if (value is not None and
686
690
  value != original_config.get_nested(key, default_value=None)):
@@ -855,11 +859,11 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
855
859
 
856
860
  db_updated = False
857
861
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
858
- existing_db_url = get_nested(('db',), None)
862
+ existing_db_url = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
863
+ new_db_url = config.pop_nested(('db',), None)
864
+ if new_db_url and new_db_url != existing_db_url:
865
+ raise ValueError('Cannot change db url while server is running')
859
866
  if existing_db_url:
860
- new_db_url = config.get_nested(('db',), None)
861
- if new_db_url and new_db_url != existing_db_url:
862
- raise ValueError('Cannot change db url while server is running')
863
867
  with _DB_USE_LOCK:
864
868
  sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
865
869
  poolclass=NullPool)
@@ -869,7 +873,6 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
869
873
  def _set_config_yaml_to_db(key: str,
870
874
  config: config_utils.Config):
871
875
  assert sqlalchemy_engine is not None
872
- config.pop_nested(('db',), None)
873
876
  config_str = common_utils.dump_yaml_str(dict(config))
874
877
  with orm.Session(sqlalchemy_engine) as session:
875
878
  if (sqlalchemy_engine.dialect.name ==
sky/task.py CHANGED
@@ -256,6 +256,7 @@ class Task:
256
256
  file_mounts_mapping: Optional[Dict[str, str]] = None,
257
257
  volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
258
258
  metadata: Optional[Dict[str, Any]] = None,
259
+ _user_specified_yaml: Optional[str] = None,
259
260
  ):
260
261
  """Initializes a Task.
261
262
 
@@ -381,6 +382,8 @@ class Task:
381
382
  if dag is not None:
382
383
  dag.add(self)
383
384
 
385
+ self._user_specified_yaml = _user_specified_yaml
386
+
384
387
  def validate(self,
385
388
  skip_file_mounts: bool = False,
386
389
  skip_workdir: bool = False):
@@ -525,6 +528,8 @@ class Task:
525
528
  env_overrides: Optional[List[Tuple[str, str]]] = None,
526
529
  secrets_overrides: Optional[List[Tuple[str, str]]] = None,
527
530
  ) -> 'Task':
531
+ user_specified_yaml = config.pop('_user_specified_yaml',
532
+ common_utils.dump_yaml_str(config))
528
533
  # More robust handling for 'envs': explicitly convert keys and values to
529
534
  # str, since users may pass '123' as keys/values which will get parsed
530
535
  # as int causing validate_schema() to fail.
@@ -590,19 +595,23 @@ class Task:
590
595
 
591
596
  # Fill in any Task.envs into file_mounts (src/dst paths, storage
592
597
  # name/source).
598
+ env_vars = config.get('envs', {})
599
+ secrets = config.get('secrets', {})
600
+ env_and_secrets = env_vars.copy()
601
+ env_and_secrets.update(secrets)
593
602
  if config.get('file_mounts') is not None:
594
603
  config['file_mounts'] = _fill_in_env_vars(config['file_mounts'],
595
- config.get('envs', {}))
604
+ env_and_secrets)
596
605
 
597
606
  # Fill in any Task.envs into service (e.g. MODEL_NAME).
598
607
  if config.get('service') is not None:
599
608
  config['service'] = _fill_in_env_vars(config['service'],
600
- config.get('envs', {}))
609
+ env_and_secrets)
601
610
 
602
611
  # Fill in any Task.envs into workdir
603
612
  if config.get('workdir') is not None:
604
613
  config['workdir'] = _fill_in_env_vars(config['workdir'],
605
- config.get('envs', {}))
614
+ env_and_secrets)
606
615
 
607
616
  task = Task(
608
617
  config.pop('name', None),
@@ -616,6 +625,7 @@ class Task:
616
625
  file_mounts_mapping=config.pop('file_mounts_mapping', None),
617
626
  volumes=config.pop('volumes', None),
618
627
  metadata=config.pop('_metadata', None),
628
+ _user_specified_yaml=user_specified_yaml,
619
629
  )
620
630
 
621
631
  # Create lists to store storage objects inlined in file_mounts.
@@ -736,9 +746,19 @@ class Task:
736
746
  task.set_resources(sky.Resources.from_yaml_config(resources_config))
737
747
 
738
748
  service = config.pop('service', None)
749
+ pool = config.pop('pool', None)
750
+ if service is not None and pool is not None:
751
+ with ux_utils.print_exception_no_traceback():
752
+ raise ValueError(
753
+ 'Cannot set both service and pool in the same task.')
754
+
739
755
  if service is not None:
740
756
  service = service_spec.SkyServiceSpec.from_yaml_config(service)
741
- task.set_service(service)
757
+ task.set_service(service)
758
+ elif pool is not None:
759
+ pool['pool'] = True
760
+ pool = service_spec.SkyServiceSpec.from_yaml_config(pool)
761
+ task.set_service(pool)
742
762
 
743
763
  volume_mounts = config.pop('volume_mounts', None)
744
764
  if volume_mounts is not None:
@@ -773,7 +793,8 @@ class Task:
773
793
  # TODO(zongheng): use
774
794
  # https://github.com/yaml/pyyaml/issues/165#issuecomment-430074049
775
795
  # to raise errors on duplicate keys.
776
- config = yaml.safe_load(f)
796
+ user_specified_yaml = f.read()
797
+ config = yaml.safe_load(user_specified_yaml)
777
798
 
778
799
  if isinstance(config, str):
779
800
  with ux_utils.print_exception_no_traceback():
@@ -782,6 +803,7 @@ class Task:
782
803
 
783
804
  if config is None:
784
805
  config = {}
806
+ config['_user_specified_yaml'] = user_specified_yaml
785
807
  return Task.from_yaml_config(config)
786
808
 
787
809
  def resolve_and_validate_volumes(self) -> None:
@@ -1537,11 +1559,22 @@ class Task:
1537
1559
  d[k] = v
1538
1560
  return d
1539
1561
 
1540
- def to_yaml_config(self, redact_secrets: bool = False) -> Dict[str, Any]:
1562
+ def to_yaml_config(self,
1563
+ use_user_specified_yaml: bool = False) -> Dict[str, Any]:
1541
1564
  """Returns a yaml-style dict representation of the task.
1542
1565
 
1543
1566
  INTERNAL: this method is internal-facing.
1544
1567
  """
1568
+ if use_user_specified_yaml:
1569
+ if self._user_specified_yaml is None:
1570
+ return self._to_yaml_config(redact_secrets=True)
1571
+ config = yaml.safe_load(self._user_specified_yaml)
1572
+ if config.get('secrets') is not None:
1573
+ config['secrets'] = {k: '<redacted>' for k in config['secrets']}
1574
+ return config
1575
+ return self._to_yaml_config()
1576
+
1577
+ def _to_yaml_config(self, redact_secrets: bool = False) -> Dict[str, Any]:
1545
1578
  config = {}
1546
1579
 
1547
1580
  def add_if_not_none(key, value, no_empty: bool = False):
@@ -1586,13 +1619,9 @@ class Task:
1586
1619
  # Add envs without redaction
1587
1620
  add_if_not_none('envs', self.envs, no_empty=True)
1588
1621
 
1589
- # Add secrets with redaction if requested
1590
1622
  secrets = self.secrets
1591
1623
  if secrets and redact_secrets:
1592
- secrets = {
1593
- k: '<redacted>' if isinstance(v, str) else v
1594
- for k, v in secrets.items()
1595
- }
1624
+ secrets = {k: '<redacted>' for k in secrets}
1596
1625
  add_if_not_none('secrets', secrets, no_empty=True)
1597
1626
 
1598
1627
  add_if_not_none('file_mounts', {})
@@ -1615,6 +1644,7 @@ class Task:
1615
1644
  ]
1616
1645
  # we manually check if its empty to not clog up the generated yaml
1617
1646
  add_if_not_none('_metadata', self._metadata if self._metadata else None)
1647
+ add_if_not_none('_user_specified_yaml', self._user_specified_yaml)
1618
1648
  return config
1619
1649
 
1620
1650
  def get_required_cloud_features(
sky/templates/jobs-controller.yaml.j2 CHANGED
@@ -57,6 +57,9 @@ run: |
57
57
  --job-id $SKYPILOT_INTERNAL_JOB_ID \
58
58
  {%- endif %}
59
59
  --env-file {{remote_env_file_path}} \
60
+ {%- if pool is not none %}
61
+ --pool {{pool}} \
62
+ {%- endif %}
60
63
  --priority {{priority}}
61
64
 
62
65
 
sky/templates/sky-serve-controller.yaml.j2 CHANGED
@@ -45,13 +45,29 @@ file_mounts:
45
45
  run: |
46
46
  # Activate the Python environment, so that cloud SDKs can be found in the
47
47
  # PATH.
48
+ {%- if consolidation_mode_job_id is none %}
48
49
  {{ sky_activate_python_env }}
50
+ {%- endif %}
49
51
  # Start sky serve service.
50
- python -u -m sky.serve.service \
52
+ {%- if consolidation_mode_job_id is not none %}
53
+ {{sky_python_cmd}} \
54
+ {%- else %}
55
+ python \
56
+ {%- endif %}
57
+ -u -m sky.serve.service \
51
58
  --service-name {{service_name}} \
52
59
  --task-yaml {{remote_task_yaml_path}} \
60
+ {%- if consolidation_mode_job_id is not none %}
61
+ --job-id {{consolidation_mode_job_id}} \
62
+ {%- else %}
53
63
  --job-id $SKYPILOT_INTERNAL_JOB_ID \
54
- >> {{controller_log_file}} 2>&1
64
+ {%- endif %}
65
+ >> {{controller_log_file}} 2>&1 \
66
+ {%- if consolidation_mode_job_id is not none %}
67
+ &
68
+ {%- endif %}
69
+ # For consolidation mode, we need to run the service in the background so
70
+ # that it can immediately return in serve.core.up().
55
71
 
56
72
  envs:
57
73
  {%- for env_name, env_value in controller_envs.items() %}
sky/users/server.py CHANGED
@@ -414,7 +414,7 @@ async def get_service_account_tokens(
414
414
 
415
415
  def _generate_service_account_user_id() -> str:
416
416
  """Generate a unique user ID for a service account."""
417
- random_suffix = secrets.token_hex(16) # 16 character hex string
417
+ random_suffix = secrets.token_hex(8) # 16 character hex string
418
418
  service_account_id = (f'sa-{random_suffix}')
419
419
  return service_account_id
420
420
 
sky/utils/command_runner.py CHANGED
@@ -201,6 +201,7 @@ class CommandRunner:
201
201
  separate_stderr: bool,
202
202
  skip_num_lines: int,
203
203
  source_bashrc: bool = False,
204
+ use_login: bool = True,
204
205
  ) -> str:
205
206
  """Returns the command to run."""
206
207
  if isinstance(cmd, list):
@@ -211,7 +212,7 @@ class CommandRunner:
211
212
  '/bin/bash',
212
213
  '--login',
213
214
  '-c',
214
- ]
215
+ ] if use_login else ['/bin/bash', '-c']
215
216
  if source_bashrc:
216
217
  command += [
217
218
  # Need this `-i` option to make sure `source ~/.bashrc` work.
@@ -1124,7 +1125,8 @@ class LocalProcessCommandRunner(CommandRunner):
1124
1125
  process_stream,
1125
1126
  separate_stderr,
1126
1127
  skip_num_lines=skip_num_lines,
1127
- source_bashrc=source_bashrc)
1128
+ source_bashrc=source_bashrc,
1129
+ use_login=False)
1128
1130
 
1129
1131
  log_dir = os.path.expanduser(os.path.dirname(log_path))
1130
1132
  os.makedirs(log_dir, exist_ok=True)
sky/utils/controller_utils.py CHANGED
@@ -5,7 +5,7 @@ import enum
5
5
  import os
6
6
  import tempfile
7
7
  import typing
8
- from typing import Any, Dict, Iterable, List, Optional, Set
8
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Set
9
9
  import uuid
10
10
 
11
11
  import colorama
@@ -64,7 +64,7 @@ class _ControllerSpec:
64
64
  controller_type: str
65
65
  name: str
66
66
  cluster_name: str
67
- in_progress_hint: str
67
+ in_progress_hint: Callable[[bool], str]
68
68
  decline_cancel_hint: str
69
69
  _decline_down_when_failed_to_fetch_status_hint: str
70
70
  decline_down_for_dirty_controller_hint: str
@@ -94,9 +94,9 @@ class Controllers(enum.Enum):
94
94
  controller_type='jobs',
95
95
  name='managed jobs controller',
96
96
  cluster_name=common.JOB_CONTROLLER_NAME,
97
- in_progress_hint=(
98
- '* {job_info}To see all managed jobs: '
99
- f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
97
+ in_progress_hint=lambda _:
98
+ ('* {job_info}To see all managed jobs: '
99
+ f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
100
100
  decline_cancel_hint=(
101
101
  'Cancelling the jobs controller\'s jobs is not allowed.\nTo cancel '
102
102
  f'managed jobs, use: {colorama.Style.BRIGHT}sky jobs cancel '
@@ -126,8 +126,11 @@ class Controllers(enum.Enum):
126
126
  name='serve controller',
127
127
  cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
128
128
  in_progress_hint=(
129
- f'* To see detailed service status: {colorama.Style.BRIGHT}'
130
- f'sky serve status -v{colorama.Style.RESET_ALL}'),
129
+ lambda pool:
130
+ (f'* To see detailed pool status: {colorama.Style.BRIGHT}'
131
+ f'sky jobs pool status -v{colorama.Style.RESET_ALL}') if pool else
132
+ (f'* To see detailed service status: {colorama.Style.BRIGHT}'
133
+ f'sky serve status -v{colorama.Style.RESET_ALL}')),
131
134
  decline_cancel_hint=(
132
135
  'Cancelling the sky serve controller\'s jobs is not allowed.'),
133
136
  _decline_down_when_failed_to_fetch_status_hint=(
@@ -391,10 +394,11 @@ def check_cluster_name_not_controller(
391
394
 
392
395
 
393
396
  # Internal only:
394
- def download_and_stream_latest_job_log(
397
+ def download_and_stream_job_log(
395
398
  backend: 'cloud_vm_ray_backend.CloudVmRayBackend',
396
399
  handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
397
- local_dir: str) -> Optional[str]:
400
+ local_dir: str,
401
+ job_ids: Optional[List[str]] = None) -> Optional[str]:
398
402
  """Downloads and streams the latest job log.
399
403
 
400
404
  This function is only used by jobs controller and sky serve controller.
@@ -412,7 +416,7 @@ def download_and_stream_latest_job_log(
412
416
  # multi-node cluster is preempted, and we recover the managed job
413
417
  # on the existing cluster, which leads to a larger job_id. Those
414
418
  # job_ids all represent the same logical managed job.
415
- job_ids=None,
419
+ job_ids=job_ids,
416
420
  local_dir=local_dir)
417
421
  except Exception as e: # pylint: disable=broad-except
418
422
  # We want to avoid crashing the controller. sync_down_logs() is pretty
sky/utils/dag_utils.py CHANGED
@@ -148,7 +148,7 @@ def load_chain_dag_from_yaml_str(
148
148
 
149
149
 
150
150
  def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag,
151
- redact_secrets: bool = False) -> str:
151
+ use_user_specified_yaml: bool = False) -> str:
152
152
  """Dumps a chain DAG to a YAML string.
153
153
 
154
154
  Args:
@@ -161,7 +161,9 @@ def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag,
161
161
  assert dag.is_chain(), dag
162
162
  configs = [{'name': dag.name}]
163
163
  for task in dag.tasks:
164
- configs.append(task.to_yaml_config(redact_secrets=redact_secrets))
164
+ configs.append(
165
+ task.to_yaml_config(
166
+ use_user_specified_yaml=use_user_specified_yaml))
165
167
  return common_utils.dump_yaml_str(configs)
166
168
 
167
169
 
sky/utils/db/migration_utils.py CHANGED
@@ -12,7 +12,6 @@ import filelock
12
12
  import sqlalchemy
13
13
 
14
14
  from sky import sky_logging
15
- from sky import skypilot_config
16
15
  from sky.skylet import constants
17
16
 
18
17
  logger = sky_logging.init_logger(__name__)
@@ -24,16 +23,15 @@ GLOBAL_USER_STATE_VERSION = '001'
24
23
  GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
25
24
 
26
25
  SPOT_JOBS_DB_NAME = 'spot_jobs_db'
27
- SPOT_JOBS_VERSION = '001'
26
+ SPOT_JOBS_VERSION = '002'
28
27
  SPOT_JOBS_LOCK_PATH = '~/.sky/locks/.spot_jobs_db.lock'
29
28
 
30
29
 
31
30
  def get_engine(db_name: str):
32
31
  conn_string = None
33
32
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
34
- conn_string = skypilot_config.get_nested(('db',), None)
33
+ conn_string = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
35
34
  if conn_string:
36
- logger.debug(f'using db URI from {conn_string}')
37
35
  engine = sqlalchemy.create_engine(conn_string,
38
36
  poolclass=sqlalchemy.NullPool)
39
37
  else:
sky/utils/schemas.py CHANGED
@@ -605,7 +605,6 @@ def get_service_schema():
605
605
  return {
606
606
  '$schema': 'https://json-schema.org/draft/2020-12/schema',
607
607
  'type': 'object',
608
- 'required': ['readiness_probe'],
609
608
  'additionalProperties': False,
610
609
  'properties': {
611
610
  'readiness_probe': {
@@ -641,6 +640,9 @@ def get_service_schema():
641
640
  }
642
641
  }]
643
642
  },
643
+ 'pool': {
644
+ 'type': 'boolean',
645
+ },
644
646
  'replica_policy': {
645
647
  'type': 'object',
646
648
  'required': ['min_replicas'],
@@ -688,6 +690,9 @@ def get_service_schema():
688
690
  'replicas': {
689
691
  'type': 'integer',
690
692
  },
693
+ 'workers': {
694
+ 'type': 'integer',
695
+ },
691
696
  'load_balancing_policy': {
692
697
  'type': 'string',
693
698
  'case_insensitive_enum': list(
@@ -825,6 +830,9 @@ def get_task_schema():
825
830
  'service': {
826
831
  'type': 'object',
827
832
  },
833
+ 'pool': {
834
+ 'type': 'object',
835
+ },
828
836
  'setup': {
829
837
  'type': 'string',
830
838
  },
@@ -1128,6 +1136,7 @@ _CONTEXT_CONFIG_SCHEMA_KUBERNETES = {
1128
1136
 
1129
1137
  def get_config_schema():
1130
1138
  # pylint: disable=import-outside-toplevel
1139
+ from sky.server import daemons
1131
1140
 
1132
1141
  resources_schema = {
1133
1142
  k: v
@@ -1137,21 +1146,7 @@ def get_config_schema():
1137
1146
  }
1138
1147
  resources_schema['properties'].pop('ports')
1139
1148
 
1140
- def _get_controller_schema(add_consolidation_mode: bool = False):
1141
- controller_properties = {
1142
- 'resources': resources_schema,
1143
- 'high_availability': {
1144
- 'type': 'boolean',
1145
- 'default': False,
1146
- },
1147
- 'autostop': _AUTOSTOP_SCHEMA,
1148
- }
1149
- if add_consolidation_mode:
1150
- controller_properties['consolidation_mode'] = {
1151
- 'type': 'boolean',
1152
- 'default': False,
1153
- }
1154
-
1149
+ def _get_controller_schema():
1155
1150
  return {
1156
1151
  'type': 'object',
1157
1152
  'required': [],
@@ -1161,7 +1156,18 @@ def get_config_schema():
1161
1156
  'type': 'object',
1162
1157
  'required': [],
1163
1158
  'additionalProperties': False,
1164
- 'properties': controller_properties,
1159
+ 'properties': {
1160
+ 'resources': resources_schema,
1161
+ 'high_availability': {
1162
+ 'type': 'boolean',
1163
+ 'default': False,
1164
+ },
1165
+ 'autostop': _AUTOSTOP_SCHEMA,
1166
+ 'consolidation_mode': {
1167
+ 'type': 'boolean',
1168
+ 'default': False,
1169
+ }
1170
+ },
1165
1171
  },
1166
1172
  'bucket': {
1167
1173
  'type': 'string',
@@ -1474,6 +1480,27 @@ def get_config_schema():
1474
1480
  }
1475
1481
  }
1476
1482
 
1483
+ daemon_config = {
1484
+ 'type': 'object',
1485
+ 'required': [],
1486
+ 'properties': {
1487
+ 'log_level': {
1488
+ 'type': 'string',
1489
+ 'case_insensitive_enum': ['DEBUG', 'INFO', 'WARNING'],
1490
+ },
1491
+ }
1492
+ }
1493
+
1494
+ daemon_schema = {
1495
+ 'type': 'object',
1496
+ 'required': [],
1497
+ 'additionalProperties': False,
1498
+ 'properties': {}
1499
+ }
1500
+
1501
+ for daemon in daemons.INTERNAL_REQUEST_DAEMONS:
1502
+ daemon_schema['properties'][daemon.id] = daemon_config
1503
+
1477
1504
  api_server = {
1478
1505
  'type': 'object',
1479
1506
  'required': [],
@@ -1707,8 +1734,8 @@ def get_config_schema():
1707
1734
  'db': {
1708
1735
  'type': 'string',
1709
1736
  },
1710
- 'jobs': _get_controller_schema(add_consolidation_mode=True),
1711
- 'serve': _get_controller_schema(add_consolidation_mode=False),
1737
+ 'jobs': _get_controller_schema(),
1738
+ 'serve': _get_controller_schema(),
1712
1739
  'allowed_clouds': allowed_clouds,
1713
1740
  'admin_policy': admin_policy_schema,
1714
1741
  'docker': docker_configs,
@@ -1719,6 +1746,7 @@ def get_config_schema():
1719
1746
  'provision': provision_configs,
1720
1747
  'rbac': rbac_schema,
1721
1748
  'logs': logs_schema,
1749
+ 'daemons': daemon_schema,
1722
1750
  **cloud_configs,
1723
1751
  },
1724
1752
  }
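{skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@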
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250730
3
+ Version: 1.0.0.dev20250801
4
4
  Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0