skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +452 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +21 -1
- sky/data/storage.py +12 -0
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +129 -24
- sky/jobs/utils.py +109 -51
- sky/provision/nebius/constants.py +3 -0
- sky/provision/runpod/utils.py +27 -12
- sky/py.typed +0 -0
- sky/resources.py +16 -12
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/daemons.py +164 -0
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +2 -107
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/server/server.py +2 -1
- sky/server/uvicorn.py +2 -1
- sky/sky_logging.py +30 -0
- sky/skylet/constants.py +2 -1
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +47 -19
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
sky/skylet/constants.py
CHANGED
|
@@ -382,7 +382,8 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
|
|
|
382
382
|
# we skip the following keys because they are meant to be client-side configs.
|
|
383
383
|
SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('api_server',),
|
|
384
384
|
('allowed_clouds',),
|
|
385
|
-
('workspaces',), ('db',)
|
|
385
|
+
('workspaces',), ('db',),
|
|
386
|
+
('daemons',)]
|
|
386
387
|
|
|
387
388
|
# Constants for Azure blob storage
|
|
388
389
|
WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
|
sky/skylet/events.py
CHANGED
|
@@ -75,7 +75,16 @@ class ManagedJobEvent(SkyletEvent):
|
|
|
75
75
|
EVENT_INTERVAL_SECONDS = 300
|
|
76
76
|
|
|
77
77
|
def _run(self):
|
|
78
|
+
logger.info('=== Updating managed job status ===')
|
|
78
79
|
managed_job_utils.update_managed_jobs_statuses()
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class ManagedJobSchedulingEvent(SkyletEvent):
|
|
83
|
+
"""Skylet event for scheduling managed jobs."""
|
|
84
|
+
EVENT_INTERVAL_SECONDS = 20
|
|
85
|
+
|
|
86
|
+
def _run(self):
|
|
87
|
+
logger.info('=== Scheduling next jobs ===')
|
|
79
88
|
managed_job_scheduler.maybe_schedule_next_jobs()
|
|
80
89
|
|
|
81
90
|
|
sky/skypilot_config.py
CHANGED
|
@@ -495,6 +495,12 @@ def parse_and_validate_config_file(config_path: str) -> config_utils.Config:
|
|
|
495
495
|
try:
|
|
496
496
|
config_dict = common_utils.read_yaml(config_path)
|
|
497
497
|
config = config_utils.Config.from_dict(config_dict)
|
|
498
|
+
# pop the db url from the config, and set it to the env var.
|
|
499
|
+
# this is to avoid db url (considered a sensitive value)
|
|
500
|
+
# being printed with the rest of the config.
|
|
501
|
+
db_url = config.pop_nested(('db',), None)
|
|
502
|
+
if db_url:
|
|
503
|
+
os.environ[constants.ENV_VAR_DB_CONNECTION_URI] = db_url
|
|
498
504
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
499
505
|
logger.debug(f'Config loaded from {config_path}:\n'
|
|
500
506
|
f'{common_utils.dump_yaml_str(dict(config))}')
|
|
@@ -556,21 +562,16 @@ def _reload_config_as_server() -> None:
|
|
|
556
562
|
_set_loaded_config_path(None)
|
|
557
563
|
|
|
558
564
|
server_config_path = _resolve_server_config_path()
|
|
559
|
-
db_url_from_env = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
|
|
560
565
|
server_config = _get_config_from_path(server_config_path)
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
565
|
-
logger.debug(f'server config: \n'
|
|
566
|
-
f'{common_utils.dump_yaml_str(dict(server_config))}')
|
|
567
|
-
|
|
568
|
-
db_url = server_config.get_nested(('db',), None)
|
|
569
|
-
if db_url and len(server_config.keys()) > 1:
|
|
570
|
-
raise ValueError(
|
|
571
|
-
'if db config is specified, no other config is allowed')
|
|
566
|
+
# Get the db url from the env var. _get_config_from_path should have moved
|
|
567
|
+
# the db url specified in config file to the env var.
|
|
568
|
+
db_url = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
|
|
572
569
|
|
|
573
570
|
if db_url:
|
|
571
|
+
if len(server_config.keys()) > 1:
|
|
572
|
+
raise ValueError(
|
|
573
|
+
'If db config is specified, no other config is allowed')
|
|
574
|
+
logger.debug('retrieving config from database')
|
|
574
575
|
with _DB_USE_LOCK:
|
|
575
576
|
sqlalchemy_engine = sqlalchemy.create_engine(db_url,
|
|
576
577
|
poolclass=NullPool)
|
|
@@ -591,14 +592,13 @@ def _reload_config_as_server() -> None:
|
|
|
591
592
|
|
|
592
593
|
db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
|
|
593
594
|
if db_config:
|
|
594
|
-
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
595
|
-
logger.debug(
|
|
596
|
-
f'Config loaded from db:\n'
|
|
597
|
-
f'{common_utils.dump_yaml_str(dict(db_config))}')
|
|
598
595
|
server_config = overlay_skypilot_config(server_config,
|
|
599
596
|
db_config)
|
|
600
597
|
# Close the engine to avoid connection leaks
|
|
601
598
|
sqlalchemy_engine.dispose()
|
|
599
|
+
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
600
|
+
logger.debug(f'server config: \n'
|
|
601
|
+
f'{common_utils.dump_yaml_str(dict(server_config))}')
|
|
602
602
|
_set_loaded_config(server_config)
|
|
603
603
|
_set_loaded_config_path(server_config_path)
|
|
604
604
|
|
|
@@ -681,6 +681,10 @@ def override_skypilot_config(
|
|
|
681
681
|
|
|
682
682
|
disallowed_diff_keys = []
|
|
683
683
|
for key in constants.SKIPPED_CLIENT_OVERRIDE_KEYS:
|
|
684
|
+
if key == ('db',):
|
|
685
|
+
# since db key is popped out of server config, the key is expected
|
|
686
|
+
# to be different between client and server.
|
|
687
|
+
continue
|
|
684
688
|
value = override_configs.pop_nested(key, default_value=None)
|
|
685
689
|
if (value is not None and
|
|
686
690
|
value != original_config.get_nested(key, default_value=None)):
|
|
@@ -855,11 +859,11 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
|
|
|
855
859
|
|
|
856
860
|
db_updated = False
|
|
857
861
|
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
|
858
|
-
existing_db_url =
|
|
862
|
+
existing_db_url = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
|
|
863
|
+
new_db_url = config.pop_nested(('db',), None)
|
|
864
|
+
if new_db_url and new_db_url != existing_db_url:
|
|
865
|
+
raise ValueError('Cannot change db url while server is running')
|
|
859
866
|
if existing_db_url:
|
|
860
|
-
new_db_url = config.get_nested(('db',), None)
|
|
861
|
-
if new_db_url and new_db_url != existing_db_url:
|
|
862
|
-
raise ValueError('Cannot change db url while server is running')
|
|
863
867
|
with _DB_USE_LOCK:
|
|
864
868
|
sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
|
|
865
869
|
poolclass=NullPool)
|
|
@@ -869,7 +873,6 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
|
|
|
869
873
|
def _set_config_yaml_to_db(key: str,
|
|
870
874
|
config: config_utils.Config):
|
|
871
875
|
assert sqlalchemy_engine is not None
|
|
872
|
-
config.pop_nested(('db',), None)
|
|
873
876
|
config_str = common_utils.dump_yaml_str(dict(config))
|
|
874
877
|
with orm.Session(sqlalchemy_engine) as session:
|
|
875
878
|
if (sqlalchemy_engine.dialect.name ==
|
sky/task.py
CHANGED
|
@@ -256,6 +256,7 @@ class Task:
|
|
|
256
256
|
file_mounts_mapping: Optional[Dict[str, str]] = None,
|
|
257
257
|
volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
|
|
258
258
|
metadata: Optional[Dict[str, Any]] = None,
|
|
259
|
+
_user_specified_yaml: Optional[str] = None,
|
|
259
260
|
):
|
|
260
261
|
"""Initializes a Task.
|
|
261
262
|
|
|
@@ -381,6 +382,8 @@ class Task:
|
|
|
381
382
|
if dag is not None:
|
|
382
383
|
dag.add(self)
|
|
383
384
|
|
|
385
|
+
self._user_specified_yaml = _user_specified_yaml
|
|
386
|
+
|
|
384
387
|
def validate(self,
|
|
385
388
|
skip_file_mounts: bool = False,
|
|
386
389
|
skip_workdir: bool = False):
|
|
@@ -525,6 +528,8 @@ class Task:
|
|
|
525
528
|
env_overrides: Optional[List[Tuple[str, str]]] = None,
|
|
526
529
|
secrets_overrides: Optional[List[Tuple[str, str]]] = None,
|
|
527
530
|
) -> 'Task':
|
|
531
|
+
user_specified_yaml = config.pop('_user_specified_yaml',
|
|
532
|
+
common_utils.dump_yaml_str(config))
|
|
528
533
|
# More robust handling for 'envs': explicitly convert keys and values to
|
|
529
534
|
# str, since users may pass '123' as keys/values which will get parsed
|
|
530
535
|
# as int causing validate_schema() to fail.
|
|
@@ -590,19 +595,23 @@ class Task:
|
|
|
590
595
|
|
|
591
596
|
# Fill in any Task.envs into file_mounts (src/dst paths, storage
|
|
592
597
|
# name/source).
|
|
598
|
+
env_vars = config.get('envs', {})
|
|
599
|
+
secrets = config.get('secrets', {})
|
|
600
|
+
env_and_secrets = env_vars.copy()
|
|
601
|
+
env_and_secrets.update(secrets)
|
|
593
602
|
if config.get('file_mounts') is not None:
|
|
594
603
|
config['file_mounts'] = _fill_in_env_vars(config['file_mounts'],
|
|
595
|
-
|
|
604
|
+
env_and_secrets)
|
|
596
605
|
|
|
597
606
|
# Fill in any Task.envs into service (e.g. MODEL_NAME).
|
|
598
607
|
if config.get('service') is not None:
|
|
599
608
|
config['service'] = _fill_in_env_vars(config['service'],
|
|
600
|
-
|
|
609
|
+
env_and_secrets)
|
|
601
610
|
|
|
602
611
|
# Fill in any Task.envs into workdir
|
|
603
612
|
if config.get('workdir') is not None:
|
|
604
613
|
config['workdir'] = _fill_in_env_vars(config['workdir'],
|
|
605
|
-
|
|
614
|
+
env_and_secrets)
|
|
606
615
|
|
|
607
616
|
task = Task(
|
|
608
617
|
config.pop('name', None),
|
|
@@ -616,6 +625,7 @@ class Task:
|
|
|
616
625
|
file_mounts_mapping=config.pop('file_mounts_mapping', None),
|
|
617
626
|
volumes=config.pop('volumes', None),
|
|
618
627
|
metadata=config.pop('_metadata', None),
|
|
628
|
+
_user_specified_yaml=user_specified_yaml,
|
|
619
629
|
)
|
|
620
630
|
|
|
621
631
|
# Create lists to store storage objects inlined in file_mounts.
|
|
@@ -736,9 +746,19 @@ class Task:
|
|
|
736
746
|
task.set_resources(sky.Resources.from_yaml_config(resources_config))
|
|
737
747
|
|
|
738
748
|
service = config.pop('service', None)
|
|
749
|
+
pool = config.pop('pool', None)
|
|
750
|
+
if service is not None and pool is not None:
|
|
751
|
+
with ux_utils.print_exception_no_traceback():
|
|
752
|
+
raise ValueError(
|
|
753
|
+
'Cannot set both service and pool in the same task.')
|
|
754
|
+
|
|
739
755
|
if service is not None:
|
|
740
756
|
service = service_spec.SkyServiceSpec.from_yaml_config(service)
|
|
741
|
-
|
|
757
|
+
task.set_service(service)
|
|
758
|
+
elif pool is not None:
|
|
759
|
+
pool['pool'] = True
|
|
760
|
+
pool = service_spec.SkyServiceSpec.from_yaml_config(pool)
|
|
761
|
+
task.set_service(pool)
|
|
742
762
|
|
|
743
763
|
volume_mounts = config.pop('volume_mounts', None)
|
|
744
764
|
if volume_mounts is not None:
|
|
@@ -773,7 +793,8 @@ class Task:
|
|
|
773
793
|
# TODO(zongheng): use
|
|
774
794
|
# https://github.com/yaml/pyyaml/issues/165#issuecomment-430074049
|
|
775
795
|
# to raise errors on duplicate keys.
|
|
776
|
-
|
|
796
|
+
user_specified_yaml = f.read()
|
|
797
|
+
config = yaml.safe_load(user_specified_yaml)
|
|
777
798
|
|
|
778
799
|
if isinstance(config, str):
|
|
779
800
|
with ux_utils.print_exception_no_traceback():
|
|
@@ -782,6 +803,7 @@ class Task:
|
|
|
782
803
|
|
|
783
804
|
if config is None:
|
|
784
805
|
config = {}
|
|
806
|
+
config['_user_specified_yaml'] = user_specified_yaml
|
|
785
807
|
return Task.from_yaml_config(config)
|
|
786
808
|
|
|
787
809
|
def resolve_and_validate_volumes(self) -> None:
|
|
@@ -1537,11 +1559,22 @@ class Task:
|
|
|
1537
1559
|
d[k] = v
|
|
1538
1560
|
return d
|
|
1539
1561
|
|
|
1540
|
-
def to_yaml_config(self,
|
|
1562
|
+
def to_yaml_config(self,
|
|
1563
|
+
use_user_specified_yaml: bool = False) -> Dict[str, Any]:
|
|
1541
1564
|
"""Returns a yaml-style dict representation of the task.
|
|
1542
1565
|
|
|
1543
1566
|
INTERNAL: this method is internal-facing.
|
|
1544
1567
|
"""
|
|
1568
|
+
if use_user_specified_yaml:
|
|
1569
|
+
if self._user_specified_yaml is None:
|
|
1570
|
+
return self._to_yaml_config(redact_secrets=True)
|
|
1571
|
+
config = yaml.safe_load(self._user_specified_yaml)
|
|
1572
|
+
if config.get('secrets') is not None:
|
|
1573
|
+
config['secrets'] = {k: '<redacted>' for k in config['secrets']}
|
|
1574
|
+
return config
|
|
1575
|
+
return self._to_yaml_config()
|
|
1576
|
+
|
|
1577
|
+
def _to_yaml_config(self, redact_secrets: bool = False) -> Dict[str, Any]:
|
|
1545
1578
|
config = {}
|
|
1546
1579
|
|
|
1547
1580
|
def add_if_not_none(key, value, no_empty: bool = False):
|
|
@@ -1586,13 +1619,9 @@ class Task:
|
|
|
1586
1619
|
# Add envs without redaction
|
|
1587
1620
|
add_if_not_none('envs', self.envs, no_empty=True)
|
|
1588
1621
|
|
|
1589
|
-
# Add secrets with redaction if requested
|
|
1590
1622
|
secrets = self.secrets
|
|
1591
1623
|
if secrets and redact_secrets:
|
|
1592
|
-
secrets = {
|
|
1593
|
-
k: '<redacted>' if isinstance(v, str) else v
|
|
1594
|
-
for k, v in secrets.items()
|
|
1595
|
-
}
|
|
1624
|
+
secrets = {k: '<redacted>' for k in secrets}
|
|
1596
1625
|
add_if_not_none('secrets', secrets, no_empty=True)
|
|
1597
1626
|
|
|
1598
1627
|
add_if_not_none('file_mounts', {})
|
|
@@ -1615,6 +1644,7 @@ class Task:
|
|
|
1615
1644
|
]
|
|
1616
1645
|
# we manually check if its empty to not clog up the generated yaml
|
|
1617
1646
|
add_if_not_none('_metadata', self._metadata if self._metadata else None)
|
|
1647
|
+
add_if_not_none('_user_specified_yaml', self._user_specified_yaml)
|
|
1618
1648
|
return config
|
|
1619
1649
|
|
|
1620
1650
|
def get_required_cloud_features(
|
|
@@ -45,13 +45,29 @@ file_mounts:
|
|
|
45
45
|
run: |
|
|
46
46
|
# Activate the Python environment, so that cloud SDKs can be found in the
|
|
47
47
|
# PATH.
|
|
48
|
+
{%- if consolidation_mode_job_id is none %}
|
|
48
49
|
{{ sky_activate_python_env }}
|
|
50
|
+
{%- endif %}
|
|
49
51
|
# Start sky serve service.
|
|
50
|
-
|
|
52
|
+
{%- if consolidation_mode_job_id is not none %}
|
|
53
|
+
{{sky_python_cmd}} \
|
|
54
|
+
{%- else %}
|
|
55
|
+
python \
|
|
56
|
+
{%- endif %}
|
|
57
|
+
-u -m sky.serve.service \
|
|
51
58
|
--service-name {{service_name}} \
|
|
52
59
|
--task-yaml {{remote_task_yaml_path}} \
|
|
60
|
+
{%- if consolidation_mode_job_id is not none %}
|
|
61
|
+
--job-id {{consolidation_mode_job_id}} \
|
|
62
|
+
{%- else %}
|
|
53
63
|
--job-id $SKYPILOT_INTERNAL_JOB_ID \
|
|
54
|
-
|
|
64
|
+
{%- endif %}
|
|
65
|
+
>> {{controller_log_file}} 2>&1 \
|
|
66
|
+
{%- if consolidation_mode_job_id is not none %}
|
|
67
|
+
&
|
|
68
|
+
{%- endif %}
|
|
69
|
+
# For consolidation mode, we need to run the service in the background so
|
|
70
|
+
# that it can immediately return in serve.core.up().
|
|
55
71
|
|
|
56
72
|
envs:
|
|
57
73
|
{%- for env_name, env_value in controller_envs.items() %}
|
sky/users/server.py
CHANGED
|
@@ -414,7 +414,7 @@ async def get_service_account_tokens(
|
|
|
414
414
|
|
|
415
415
|
def _generate_service_account_user_id() -> str:
|
|
416
416
|
"""Generate a unique user ID for a service account."""
|
|
417
|
-
random_suffix = secrets.token_hex(
|
|
417
|
+
random_suffix = secrets.token_hex(8) # 16 character hex string
|
|
418
418
|
service_account_id = (f'sa-{random_suffix}')
|
|
419
419
|
return service_account_id
|
|
420
420
|
|
sky/utils/command_runner.py
CHANGED
|
@@ -201,6 +201,7 @@ class CommandRunner:
|
|
|
201
201
|
separate_stderr: bool,
|
|
202
202
|
skip_num_lines: int,
|
|
203
203
|
source_bashrc: bool = False,
|
|
204
|
+
use_login: bool = True,
|
|
204
205
|
) -> str:
|
|
205
206
|
"""Returns the command to run."""
|
|
206
207
|
if isinstance(cmd, list):
|
|
@@ -211,7 +212,7 @@ class CommandRunner:
|
|
|
211
212
|
'/bin/bash',
|
|
212
213
|
'--login',
|
|
213
214
|
'-c',
|
|
214
|
-
]
|
|
215
|
+
] if use_login else ['/bin/bash', '-c']
|
|
215
216
|
if source_bashrc:
|
|
216
217
|
command += [
|
|
217
218
|
# Need this `-i` option to make sure `source ~/.bashrc` work.
|
|
@@ -1124,7 +1125,8 @@ class LocalProcessCommandRunner(CommandRunner):
|
|
|
1124
1125
|
process_stream,
|
|
1125
1126
|
separate_stderr,
|
|
1126
1127
|
skip_num_lines=skip_num_lines,
|
|
1127
|
-
source_bashrc=source_bashrc
|
|
1128
|
+
source_bashrc=source_bashrc,
|
|
1129
|
+
use_login=False)
|
|
1128
1130
|
|
|
1129
1131
|
log_dir = os.path.expanduser(os.path.dirname(log_path))
|
|
1130
1132
|
os.makedirs(log_dir, exist_ok=True)
|
sky/utils/controller_utils.py
CHANGED
|
@@ -5,7 +5,7 @@ import enum
|
|
|
5
5
|
import os
|
|
6
6
|
import tempfile
|
|
7
7
|
import typing
|
|
8
|
-
from typing import Any, Dict, Iterable, List, Optional, Set
|
|
8
|
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Set
|
|
9
9
|
import uuid
|
|
10
10
|
|
|
11
11
|
import colorama
|
|
@@ -64,7 +64,7 @@ class _ControllerSpec:
|
|
|
64
64
|
controller_type: str
|
|
65
65
|
name: str
|
|
66
66
|
cluster_name: str
|
|
67
|
-
in_progress_hint: str
|
|
67
|
+
in_progress_hint: Callable[[bool], str]
|
|
68
68
|
decline_cancel_hint: str
|
|
69
69
|
_decline_down_when_failed_to_fetch_status_hint: str
|
|
70
70
|
decline_down_for_dirty_controller_hint: str
|
|
@@ -94,9 +94,9 @@ class Controllers(enum.Enum):
|
|
|
94
94
|
controller_type='jobs',
|
|
95
95
|
name='managed jobs controller',
|
|
96
96
|
cluster_name=common.JOB_CONTROLLER_NAME,
|
|
97
|
-
in_progress_hint=
|
|
98
|
-
|
|
99
|
-
|
|
97
|
+
in_progress_hint=lambda _:
|
|
98
|
+
('* {job_info}To see all managed jobs: '
|
|
99
|
+
f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
|
|
100
100
|
decline_cancel_hint=(
|
|
101
101
|
'Cancelling the jobs controller\'s jobs is not allowed.\nTo cancel '
|
|
102
102
|
f'managed jobs, use: {colorama.Style.BRIGHT}sky jobs cancel '
|
|
@@ -126,8 +126,11 @@ class Controllers(enum.Enum):
|
|
|
126
126
|
name='serve controller',
|
|
127
127
|
cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
|
128
128
|
in_progress_hint=(
|
|
129
|
-
|
|
130
|
-
f'
|
|
129
|
+
lambda pool:
|
|
130
|
+
(f'* To see detailed pool status: {colorama.Style.BRIGHT}'
|
|
131
|
+
f'sky jobs pool status -v{colorama.Style.RESET_ALL}') if pool else
|
|
132
|
+
(f'* To see detailed service status: {colorama.Style.BRIGHT}'
|
|
133
|
+
f'sky serve status -v{colorama.Style.RESET_ALL}')),
|
|
131
134
|
decline_cancel_hint=(
|
|
132
135
|
'Cancelling the sky serve controller\'s jobs is not allowed.'),
|
|
133
136
|
_decline_down_when_failed_to_fetch_status_hint=(
|
|
@@ -391,10 +394,11 @@ def check_cluster_name_not_controller(
|
|
|
391
394
|
|
|
392
395
|
|
|
393
396
|
# Internal only:
|
|
394
|
-
def download_and_stream_latest_job_log(
|
|
397
|
+
def download_and_stream_job_log(
|
|
395
398
|
backend: 'cloud_vm_ray_backend.CloudVmRayBackend',
|
|
396
399
|
handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
|
|
397
|
-
local_dir: str
|
|
400
|
+
local_dir: str,
|
|
401
|
+
job_ids: Optional[List[str]] = None) -> Optional[str]:
|
|
398
402
|
"""Downloads and streams the latest job log.
|
|
399
403
|
|
|
400
404
|
This function is only used by jobs controller and sky serve controller.
|
|
@@ -412,7 +416,7 @@ def download_and_stream_latest_job_log(
|
|
|
412
416
|
# multi-node cluster is preempted, and we recover the managed job
|
|
413
417
|
# on the existing cluster, which leads to a larger job_id. Those
|
|
414
418
|
# job_ids all represent the same logical managed job.
|
|
415
|
-
job_ids=
|
|
419
|
+
job_ids=job_ids,
|
|
416
420
|
local_dir=local_dir)
|
|
417
421
|
except Exception as e: # pylint: disable=broad-except
|
|
418
422
|
# We want to avoid crashing the controller. sync_down_logs() is pretty
|
sky/utils/dag_utils.py
CHANGED
|
@@ -148,7 +148,7 @@ def load_chain_dag_from_yaml_str(
|
|
|
148
148
|
|
|
149
149
|
|
|
150
150
|
def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag,
|
|
151
|
-
|
|
151
|
+
use_user_specified_yaml: bool = False) -> str:
|
|
152
152
|
"""Dumps a chain DAG to a YAML string.
|
|
153
153
|
|
|
154
154
|
Args:
|
|
@@ -161,7 +161,9 @@ def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag,
|
|
|
161
161
|
assert dag.is_chain(), dag
|
|
162
162
|
configs = [{'name': dag.name}]
|
|
163
163
|
for task in dag.tasks:
|
|
164
|
-
configs.append(
|
|
164
|
+
configs.append(
|
|
165
|
+
task.to_yaml_config(
|
|
166
|
+
use_user_specified_yaml=use_user_specified_yaml))
|
|
165
167
|
return common_utils.dump_yaml_str(configs)
|
|
166
168
|
|
|
167
169
|
|
sky/utils/db/migration_utils.py
CHANGED
|
@@ -12,7 +12,6 @@ import filelock
|
|
|
12
12
|
import sqlalchemy
|
|
13
13
|
|
|
14
14
|
from sky import sky_logging
|
|
15
|
-
from sky import skypilot_config
|
|
16
15
|
from sky.skylet import constants
|
|
17
16
|
|
|
18
17
|
logger = sky_logging.init_logger(__name__)
|
|
@@ -24,16 +23,15 @@ GLOBAL_USER_STATE_VERSION = '001'
|
|
|
24
23
|
GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
|
|
25
24
|
|
|
26
25
|
SPOT_JOBS_DB_NAME = 'spot_jobs_db'
|
|
27
|
-
SPOT_JOBS_VERSION = '
|
|
26
|
+
SPOT_JOBS_VERSION = '002'
|
|
28
27
|
SPOT_JOBS_LOCK_PATH = '~/.sky/locks/.spot_jobs_db.lock'
|
|
29
28
|
|
|
30
29
|
|
|
31
30
|
def get_engine(db_name: str):
|
|
32
31
|
conn_string = None
|
|
33
32
|
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
|
34
|
-
conn_string =
|
|
33
|
+
conn_string = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
|
|
35
34
|
if conn_string:
|
|
36
|
-
logger.debug(f'using db URI from {conn_string}')
|
|
37
35
|
engine = sqlalchemy.create_engine(conn_string,
|
|
38
36
|
poolclass=sqlalchemy.NullPool)
|
|
39
37
|
else:
|
sky/utils/schemas.py
CHANGED
|
@@ -605,7 +605,6 @@ def get_service_schema():
|
|
|
605
605
|
return {
|
|
606
606
|
'$schema': 'https://json-schema.org/draft/2020-12/schema',
|
|
607
607
|
'type': 'object',
|
|
608
|
-
'required': ['readiness_probe'],
|
|
609
608
|
'additionalProperties': False,
|
|
610
609
|
'properties': {
|
|
611
610
|
'readiness_probe': {
|
|
@@ -641,6 +640,9 @@ def get_service_schema():
|
|
|
641
640
|
}
|
|
642
641
|
}]
|
|
643
642
|
},
|
|
643
|
+
'pool': {
|
|
644
|
+
'type': 'boolean',
|
|
645
|
+
},
|
|
644
646
|
'replica_policy': {
|
|
645
647
|
'type': 'object',
|
|
646
648
|
'required': ['min_replicas'],
|
|
@@ -688,6 +690,9 @@ def get_service_schema():
|
|
|
688
690
|
'replicas': {
|
|
689
691
|
'type': 'integer',
|
|
690
692
|
},
|
|
693
|
+
'workers': {
|
|
694
|
+
'type': 'integer',
|
|
695
|
+
},
|
|
691
696
|
'load_balancing_policy': {
|
|
692
697
|
'type': 'string',
|
|
693
698
|
'case_insensitive_enum': list(
|
|
@@ -825,6 +830,9 @@ def get_task_schema():
|
|
|
825
830
|
'service': {
|
|
826
831
|
'type': 'object',
|
|
827
832
|
},
|
|
833
|
+
'pool': {
|
|
834
|
+
'type': 'object',
|
|
835
|
+
},
|
|
828
836
|
'setup': {
|
|
829
837
|
'type': 'string',
|
|
830
838
|
},
|
|
@@ -1128,6 +1136,7 @@ _CONTEXT_CONFIG_SCHEMA_KUBERNETES = {
|
|
|
1128
1136
|
|
|
1129
1137
|
def get_config_schema():
|
|
1130
1138
|
# pylint: disable=import-outside-toplevel
|
|
1139
|
+
from sky.server import daemons
|
|
1131
1140
|
|
|
1132
1141
|
resources_schema = {
|
|
1133
1142
|
k: v
|
|
@@ -1137,21 +1146,7 @@ def get_config_schema():
|
|
|
1137
1146
|
}
|
|
1138
1147
|
resources_schema['properties'].pop('ports')
|
|
1139
1148
|
|
|
1140
|
-
def _get_controller_schema(
|
|
1141
|
-
controller_properties = {
|
|
1142
|
-
'resources': resources_schema,
|
|
1143
|
-
'high_availability': {
|
|
1144
|
-
'type': 'boolean',
|
|
1145
|
-
'default': False,
|
|
1146
|
-
},
|
|
1147
|
-
'autostop': _AUTOSTOP_SCHEMA,
|
|
1148
|
-
}
|
|
1149
|
-
if add_consolidation_mode:
|
|
1150
|
-
controller_properties['consolidation_mode'] = {
|
|
1151
|
-
'type': 'boolean',
|
|
1152
|
-
'default': False,
|
|
1153
|
-
}
|
|
1154
|
-
|
|
1149
|
+
def _get_controller_schema():
|
|
1155
1150
|
return {
|
|
1156
1151
|
'type': 'object',
|
|
1157
1152
|
'required': [],
|
|
@@ -1161,7 +1156,18 @@ def get_config_schema():
|
|
|
1161
1156
|
'type': 'object',
|
|
1162
1157
|
'required': [],
|
|
1163
1158
|
'additionalProperties': False,
|
|
1164
|
-
'properties':
|
|
1159
|
+
'properties': {
|
|
1160
|
+
'resources': resources_schema,
|
|
1161
|
+
'high_availability': {
|
|
1162
|
+
'type': 'boolean',
|
|
1163
|
+
'default': False,
|
|
1164
|
+
},
|
|
1165
|
+
'autostop': _AUTOSTOP_SCHEMA,
|
|
1166
|
+
'consolidation_mode': {
|
|
1167
|
+
'type': 'boolean',
|
|
1168
|
+
'default': False,
|
|
1169
|
+
}
|
|
1170
|
+
},
|
|
1165
1171
|
},
|
|
1166
1172
|
'bucket': {
|
|
1167
1173
|
'type': 'string',
|
|
@@ -1474,6 +1480,27 @@ def get_config_schema():
|
|
|
1474
1480
|
}
|
|
1475
1481
|
}
|
|
1476
1482
|
|
|
1483
|
+
daemon_config = {
|
|
1484
|
+
'type': 'object',
|
|
1485
|
+
'required': [],
|
|
1486
|
+
'properties': {
|
|
1487
|
+
'log_level': {
|
|
1488
|
+
'type': 'string',
|
|
1489
|
+
'case_insensitive_enum': ['DEBUG', 'INFO', 'WARNING'],
|
|
1490
|
+
},
|
|
1491
|
+
}
|
|
1492
|
+
}
|
|
1493
|
+
|
|
1494
|
+
daemon_schema = {
|
|
1495
|
+
'type': 'object',
|
|
1496
|
+
'required': [],
|
|
1497
|
+
'additionalProperties': False,
|
|
1498
|
+
'properties': {}
|
|
1499
|
+
}
|
|
1500
|
+
|
|
1501
|
+
for daemon in daemons.INTERNAL_REQUEST_DAEMONS:
|
|
1502
|
+
daemon_schema['properties'][daemon.id] = daemon_config
|
|
1503
|
+
|
|
1477
1504
|
api_server = {
|
|
1478
1505
|
'type': 'object',
|
|
1479
1506
|
'required': [],
|
|
@@ -1707,8 +1734,8 @@ def get_config_schema():
|
|
|
1707
1734
|
'db': {
|
|
1708
1735
|
'type': 'string',
|
|
1709
1736
|
},
|
|
1710
|
-
'jobs': _get_controller_schema(
|
|
1711
|
-
'serve': _get_controller_schema(
|
|
1737
|
+
'jobs': _get_controller_schema(),
|
|
1738
|
+
'serve': _get_controller_schema(),
|
|
1712
1739
|
'allowed_clouds': allowed_clouds,
|
|
1713
1740
|
'admin_policy': admin_policy_schema,
|
|
1714
1741
|
'docker': docker_configs,
|
|
@@ -1719,6 +1746,7 @@ def get_config_schema():
|
|
|
1719
1746
|
'provision': provision_configs,
|
|
1720
1747
|
'rbac': rbac_schema,
|
|
1721
1748
|
'logs': logs_schema,
|
|
1749
|
+
'daemons': daemon_schema,
|
|
1722
1750
|
**cloud_configs,
|
|
1723
1751
|
},
|
|
1724
1752
|
}
|