skypilot-nightly 1.0.0.dev20250617__py3-none-any.whl → 1.0.0.dev20250619__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +7 -0
- sky/backends/cloud_vm_ray_backend.py +48 -36
- sky/cli.py +5 -5729
- sky/client/cli/__init__.py +0 -0
- sky/client/{cli.py → cli/command.py} +108 -632
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +342 -0
- sky/client/sdk.py +22 -2
- sky/clouds/kubernetes.py +5 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +1 -0
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
- sky/dashboard/out/_next/static/{vA3PPpkBwpRTRNBHFYAw_ → whetcrnbXtqQcMRbXUbhW}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +50 -11
- sky/jobs/constants.py +0 -2
- sky/jobs/scheduler.py +7 -4
- sky/jobs/server/core.py +6 -3
- sky/jobs/state.py +9 -8
- sky/jobs/utils.py +1 -1
- sky/logs/__init__.py +17 -0
- sky/logs/agent.py +73 -0
- sky/logs/gcp.py +91 -0
- sky/models.py +1 -0
- sky/provision/common.py +10 -0
- sky/provision/instance_setup.py +35 -0
- sky/provision/provisioner.py +11 -0
- sky/resources.py +7 -6
- sky/serve/server/core.py +5 -0
- sky/server/common.py +21 -9
- sky/server/requests/payloads.py +19 -1
- sky/server/server.py +121 -29
- sky/setup_files/dependencies.py +11 -1
- sky/skylet/constants.py +13 -1
- sky/skylet/job_lib.py +75 -19
- sky/templates/kubernetes-ray.yml.j2 +9 -0
- sky/users/permission.py +49 -19
- sky/users/rbac.py +10 -1
- sky/users/server.py +274 -9
- sky/utils/env_options.py +6 -0
- sky/utils/schemas.py +42 -2
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250619.dist-info}/METADATA +9 -1
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250619.dist-info}/RECORD +70 -63
- sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
- sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
- /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
- /sky/dashboard/out/_next/static/{vA3PPpkBwpRTRNBHFYAw_ → whetcrnbXtqQcMRbXUbhW}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250619.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250619.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250619.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250619.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
@@ -113,7 +113,7 @@ def create_table(cursor, conn):
|
|
113
113
|
|
114
114
|
# `job_info` contains the mapping from job_id to the job_name, as well as
|
115
115
|
# information used by the scheduler.
|
116
|
-
cursor.execute("""\
|
116
|
+
cursor.execute(f"""\
|
117
117
|
CREATE TABLE IF NOT EXISTS job_info (
|
118
118
|
spot_job_id INTEGER PRIMARY KEY AUTOINCREMENT,
|
119
119
|
name TEXT,
|
@@ -123,7 +123,7 @@ def create_table(cursor, conn):
|
|
123
123
|
env_file_path TEXT,
|
124
124
|
user_hash TEXT,
|
125
125
|
workspace TEXT DEFAULT NULL,
|
126
|
-
priority INTEGER DEFAULT
|
126
|
+
priority INTEGER DEFAULT {constants.DEFAULT_PRIORITY},
|
127
127
|
entrypoint TEXT DEFAULT NULL,
|
128
128
|
original_user_yaml_path TEXT DEFAULT NULL)""")
|
129
129
|
|
@@ -148,12 +148,13 @@ def create_table(cursor, conn):
|
|
148
148
|
'TEXT DEFAULT NULL',
|
149
149
|
value_to_replace_existing_entries='default')
|
150
150
|
|
151
|
-
db_utils.add_column_to_table(
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
151
|
+
db_utils.add_column_to_table(
|
152
|
+
cursor,
|
153
|
+
conn,
|
154
|
+
'job_info',
|
155
|
+
'priority',
|
156
|
+
'INTEGER',
|
157
|
+
value_to_replace_existing_entries=constants.DEFAULT_PRIORITY)
|
157
158
|
|
158
159
|
db_utils.add_column_to_table(cursor, conn, 'job_info', 'entrypoint', 'TEXT')
|
159
160
|
db_utils.add_column_to_table(cursor, conn, 'job_info',
|
sky/jobs/utils.py
CHANGED
@@ -945,7 +945,7 @@ def dump_managed_job_queue() -> str:
|
|
945
945
|
# Figure out what the highest priority blocking job is. We need to know in
|
946
946
|
# order to determine if other jobs are blocked by a higher priority job, or
|
947
947
|
# just by the limited controller resources.
|
948
|
-
highest_blocking_priority =
|
948
|
+
highest_blocking_priority = constants.MIN_PRIORITY
|
949
949
|
for job in jobs:
|
950
950
|
if job['schedule_state'] not in (
|
951
951
|
# LAUNCHING and ALIVE_BACKOFF jobs will block other jobs with
|
sky/logs/__init__.py
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
"""Sky logging agents."""
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
from sky import exceptions
|
5
|
+
from sky import skypilot_config
|
6
|
+
from sky.logs.agent import LoggingAgent
|
7
|
+
from sky.logs.gcp import GCPLoggingAgent
|
8
|
+
|
9
|
+
|
10
|
+
def get_logging_agent() -> Optional[LoggingAgent]:
|
11
|
+
store = skypilot_config.get_nested(('logs', 'store'), None)
|
12
|
+
if store is None:
|
13
|
+
return None
|
14
|
+
if store == 'gcp':
|
15
|
+
return GCPLoggingAgent(skypilot_config.get_nested(('logs', 'gcp'), {}))
|
16
|
+
raise exceptions.InvalidSkyPilotConfigError(
|
17
|
+
f'Invalid logging store: {store}')
|
sky/logs/agent.py
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
"""Base class for all logging agents."""
|
2
|
+
import abc
|
3
|
+
import os
|
4
|
+
import shlex
|
5
|
+
from typing import Any, Dict
|
6
|
+
|
7
|
+
from sky.skylet import constants
|
8
|
+
from sky.utils import common_utils
|
9
|
+
from sky.utils import resources_utils
|
10
|
+
|
11
|
+
|
12
|
+
class LoggingAgent(abc.ABC):
|
13
|
+
"""Base class for all logging agents.
|
14
|
+
|
15
|
+
Each agent should implement the `get_setup_command` and
|
16
|
+
`get_credential_file_mounts` methods to return the setup command and
|
17
|
+
credential file mounts for the agent for provisioner to setup the agent
|
18
|
+
on each node.
|
19
|
+
"""
|
20
|
+
|
21
|
+
@abc.abstractmethod
|
22
|
+
def get_setup_command(self,
|
23
|
+
cluster_name: resources_utils.ClusterName) -> str:
|
24
|
+
pass
|
25
|
+
|
26
|
+
@abc.abstractmethod
|
27
|
+
def get_credential_file_mounts(self) -> Dict[str, str]:
|
28
|
+
pass
|
29
|
+
|
30
|
+
|
31
|
+
class FluentbitAgent(LoggingAgent):
|
32
|
+
"""Base class for logging store that use fluentbit as the agent."""
|
33
|
+
|
34
|
+
def get_setup_command(self,
|
35
|
+
cluster_name: resources_utils.ClusterName) -> str:
|
36
|
+
install_cmd = (
|
37
|
+
'if ! command -v fluent-bit >/dev/null 2>&1; then '
|
38
|
+
'sudo apt-get install -y gnupg; '
|
39
|
+
# pylint: disable=line-too-long
|
40
|
+
'curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh; '
|
41
|
+
'fi')
|
42
|
+
cfg = self.fluentbit_config(cluster_name)
|
43
|
+
cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
|
44
|
+
config_cmd = (f'mkdir -p {constants.LOGGING_CONFIG_DIR} && '
|
45
|
+
f'echo {shlex.quote(cfg)} > {cfg_path}')
|
46
|
+
start_cmd = ('nohup $(command -v fluent-bit || '
|
47
|
+
'echo "/opt/fluent-bit/bin/fluent-bit") '
|
48
|
+
f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 &')
|
49
|
+
return f'set -e; {install_cmd}; {config_cmd}; {start_cmd}'
|
50
|
+
|
51
|
+
def fluentbit_config(self,
|
52
|
+
cluster_name: resources_utils.ClusterName) -> str:
|
53
|
+
cfg_dict = {
|
54
|
+
'pipeline': {
|
55
|
+
'inputs': [{
|
56
|
+
'name': 'tail',
|
57
|
+
'path': f'{constants.SKY_LOGS_DIRECTORY}/*/*.log',
|
58
|
+
'path_key': 'log_path',
|
59
|
+
# Shorten the refresh interval from 60s to 1s since every
|
60
|
+
# job creates a new log file and we must be responsive
|
61
|
+
# for this: the VM might be autodown within a minute
|
62
|
+
# right after the job completion.
|
63
|
+
'refresh_interval': 1,
|
64
|
+
}],
|
65
|
+
'outputs': [self.fluentbit_output_config(cluster_name)],
|
66
|
+
}
|
67
|
+
}
|
68
|
+
return common_utils.dump_yaml_str(cfg_dict)
|
69
|
+
|
70
|
+
@abc.abstractmethod
|
71
|
+
def fluentbit_output_config(
|
72
|
+
self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
|
73
|
+
pass
|
sky/logs/gcp.py
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
"""GCP logging agent."""
|
2
|
+
|
3
|
+
from typing import Any, Dict, Optional
|
4
|
+
|
5
|
+
import pydantic
|
6
|
+
|
7
|
+
from sky.clouds import gcp
|
8
|
+
from sky.logs.agent import FluentbitAgent
|
9
|
+
from sky.utils import resources_utils
|
10
|
+
|
11
|
+
|
12
|
+
class _GCPLoggingConfig(pydantic.BaseModel):
|
13
|
+
"""Configuration for GCP logging agent."""
|
14
|
+
project_id: Optional[str] = None
|
15
|
+
credentials_file: Optional[str] = None
|
16
|
+
additional_labels: Optional[Dict[str, str]] = None
|
17
|
+
|
18
|
+
|
19
|
+
class _StackdriverOutputConfig(pydantic.BaseModel):
|
20
|
+
"""Auxiliary model for building stackdriver output config in YAML.
|
21
|
+
|
22
|
+
Ref: https://docs.fluentbit.io/manual/1.7/pipeline/outputs/stackdriver
|
23
|
+
"""
|
24
|
+
name: str = 'stackdriver'
|
25
|
+
match: str = '*'
|
26
|
+
export_to_project_id: Optional[str] = None
|
27
|
+
labels: Optional[Dict[str, str]] = None
|
28
|
+
|
29
|
+
def to_dict(self) -> Dict[str, Any]:
|
30
|
+
config = self.model_dump(exclude_none=True)
|
31
|
+
if self.labels:
|
32
|
+
# Replace the label format from `{k: v}` to `k=v`
|
33
|
+
label_str = ','.join([f'{k}={v}' for k, v in self.labels.items()])
|
34
|
+
config['labels'] = label_str
|
35
|
+
return config
|
36
|
+
|
37
|
+
|
38
|
+
class GCPLoggingAgent(FluentbitAgent):
|
39
|
+
"""GCP logging agent."""
|
40
|
+
|
41
|
+
def __init__(self, config: Dict[str, Any]):
|
42
|
+
self.config = _GCPLoggingConfig(**config)
|
43
|
+
|
44
|
+
def get_setup_command(self,
|
45
|
+
cluster_name: resources_utils.ClusterName) -> str:
|
46
|
+
credential_path = gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH
|
47
|
+
if self.config.credentials_file:
|
48
|
+
credential_path = self.config.credentials_file
|
49
|
+
# Set GOOGLE_APPLICATION_CREDENTIALS and check whether credentials
|
50
|
+
# are valid.
|
51
|
+
# Stackdriver only supports service account credentials or credentials
|
52
|
+
# from metadata server (only available on GCE or GKE). If the default
|
53
|
+
# credentials uploaded by API server is NOT a service account key and
|
54
|
+
# there is NO metadata server available, the logging agent will fail to
|
55
|
+
# authenticate and we require the user to upload a service account key
|
56
|
+
# via logs.gcp.credentials_file in this case.
|
57
|
+
# Also note that we use env var instead of YAML config to specify the
|
58
|
+
# service account key file path in order to resolve the home directory
|
59
|
+
# more reliably.
|
60
|
+
# Ref: https://github.com/fluent/fluent-bit/issues/8804
|
61
|
+
# TODO(aylei): check whether the credentials config is valid before
|
62
|
+
# provision.
|
63
|
+
pre_cmd = (f'export GOOGLE_APPLICATION_CREDENTIALS={credential_path}; '
|
64
|
+
f'cat {credential_path} | grep "service_account" || '
|
65
|
+
f'(echo "Credentials file {credential_path} is not a '
|
66
|
+
'service account key, check metadata server" && '
|
67
|
+
'curl -s http://metadata.google.internal >/dev/null || '
|
68
|
+
f'(echo "Neither service account key nor metadata server is '
|
69
|
+
'available. Set logs.gcp.credentials_file to a service '
|
70
|
+
'account key in server config and retry." && '
|
71
|
+
'exit 1;))')
|
72
|
+
return pre_cmd + ' && ' + super().get_setup_command(cluster_name)
|
73
|
+
|
74
|
+
def fluentbit_output_config(
|
75
|
+
self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
|
76
|
+
display_name = cluster_name.display_name
|
77
|
+
unique_name = cluster_name.name_on_cloud
|
78
|
+
|
79
|
+
return _StackdriverOutputConfig(
|
80
|
+
export_to_project_id=self.config.project_id,
|
81
|
+
labels={
|
82
|
+
'skypilot_cluster_name': display_name,
|
83
|
+
'skypilot_cluster_id': unique_name,
|
84
|
+
**(self.config.additional_labels or {})
|
85
|
+
},
|
86
|
+
).to_dict()
|
87
|
+
|
88
|
+
def get_credential_file_mounts(self) -> Dict[str, str]:
|
89
|
+
if self.config.credentials_file:
|
90
|
+
return {self.config.credentials_file: self.config.credentials_file}
|
91
|
+
return {}
|
sky/models.py
CHANGED
sky/provision/common.py
CHANGED
@@ -6,6 +6,7 @@ import os
|
|
6
6
|
from typing import Any, Dict, List, Optional, Tuple
|
7
7
|
|
8
8
|
from sky import sky_logging
|
9
|
+
from sky.utils import env_options
|
9
10
|
from sky.utils import resources_utils
|
10
11
|
|
11
12
|
# NOTE: we can use pydantic instead of dataclasses or namedtuples, because
|
@@ -244,6 +245,15 @@ class SocketEndpoint(Endpoint):
|
|
244
245
|
|
245
246
|
def url(self, override_ip: Optional[str] = None) -> str:
|
246
247
|
host = override_ip if override_ip else self.host
|
248
|
+
if env_options.Options.RUNNING_IN_BUILDKITE.get(
|
249
|
+
) and 'localhost' in host:
|
250
|
+
# In Buildkite CI, we run a kind (Kubernetes in Docker) cluster.
|
251
|
+
# The controller pod runs inside this kind cluster, which itself
|
252
|
+
# runs in a container. When the pod tries to access 'localhost',
|
253
|
+
# it can't reach the host machine's localhost. Using
|
254
|
+
# 'host.docker.internal' allows the pod to properly communicate
|
255
|
+
# with services running on the host machine's localhost.
|
256
|
+
host = 'host.docker.internal'
|
247
257
|
return f'{host}{":" + str(self.port) if self.port else ""}'
|
248
258
|
|
249
259
|
|
sky/provision/instance_setup.py
CHANGED
@@ -8,6 +8,7 @@ import time
|
|
8
8
|
from typing import Any, Callable, Dict, List, Optional, Tuple
|
9
9
|
|
10
10
|
from sky import exceptions
|
11
|
+
from sky import logs
|
11
12
|
from sky import provision
|
12
13
|
from sky import sky_logging
|
13
14
|
from sky.provision import common
|
@@ -21,6 +22,7 @@ from sky.utils import accelerator_registry
|
|
21
22
|
from sky.utils import command_runner
|
22
23
|
from sky.utils import common_utils
|
23
24
|
from sky.utils import env_options
|
25
|
+
from sky.utils import resources_utils
|
24
26
|
from sky.utils import subprocess_utils
|
25
27
|
from sky.utils import timeline
|
26
28
|
from sky.utils import ux_utils
|
@@ -557,3 +559,36 @@ def internal_file_mounts(cluster_name: str, common_file_mounts: Dict[str, str],
|
|
557
559
|
ssh_credentials=ssh_credentials,
|
558
560
|
max_workers=subprocess_utils.get_max_workers_for_file_mounts(
|
559
561
|
common_file_mounts, cluster_info.provider_name))
|
562
|
+
|
563
|
+
|
564
|
+
@common.log_function_start_end
|
565
|
+
@timeline.event
|
566
|
+
def setup_logging_on_cluster(logging_agent: logs.LoggingAgent,
|
567
|
+
cluster_name: resources_utils.ClusterName,
|
568
|
+
cluster_info: common.ClusterInfo,
|
569
|
+
ssh_credentials: Dict[str, Any]) -> None:
|
570
|
+
"""Setup logging agent (fluentbit) on all nodes after provisioning."""
|
571
|
+
_hint_worker_log_path(cluster_name.name_on_cloud, cluster_info,
|
572
|
+
'logging_setup')
|
573
|
+
|
574
|
+
@_auto_retry()
|
575
|
+
def _setup_node(runner: command_runner.CommandRunner, log_path: str):
|
576
|
+
cmd = logging_agent.get_setup_command(cluster_name)
|
577
|
+
logger.info(f'Running command on node: {cmd}')
|
578
|
+
returncode, stdout, stderr = runner.run(cmd,
|
579
|
+
stream_logs=False,
|
580
|
+
require_outputs=True,
|
581
|
+
log_path=log_path,
|
582
|
+
source_bashrc=True)
|
583
|
+
if returncode:
|
584
|
+
raise RuntimeError(f'Failed to setup logging agent\n{cmd}\n'
|
585
|
+
f'(exit code {returncode}). Error: '
|
586
|
+
f'===== stdout ===== \n{stdout}\n'
|
587
|
+
f'===== stderr ====={stderr}')
|
588
|
+
|
589
|
+
_parallel_ssh_with_cache(_setup_node,
|
590
|
+
cluster_name.name_on_cloud,
|
591
|
+
stage_name='logging_setup',
|
592
|
+
digest=None,
|
593
|
+
cluster_info=cluster_info,
|
594
|
+
ssh_credentials=ssh_credentials)
|
sky/provision/provisioner.py
CHANGED
@@ -16,6 +16,7 @@ import sky
|
|
16
16
|
from sky import clouds
|
17
17
|
from sky import exceptions
|
18
18
|
from sky import global_user_state
|
19
|
+
from sky import logs
|
19
20
|
from sky import provision
|
20
21
|
from sky import sky_logging
|
21
22
|
from sky import skypilot_config
|
@@ -648,6 +649,15 @@ def _post_provision_setup(
|
|
648
649
|
logger.debug('Ray cluster is ready. Skip starting ray cluster on '
|
649
650
|
'worker nodes.')
|
650
651
|
|
652
|
+
logging_agent = logs.get_logging_agent()
|
653
|
+
if logging_agent:
|
654
|
+
status.update(
|
655
|
+
ux_utils.spinner_message('Setting up logging agent',
|
656
|
+
provision_logging.config.log_path))
|
657
|
+
instance_setup.setup_logging_on_cluster(logging_agent, cluster_name,
|
658
|
+
cluster_info,
|
659
|
+
ssh_credentials)
|
660
|
+
|
651
661
|
instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
|
652
662
|
cluster_info, ssh_credentials)
|
653
663
|
|
@@ -672,6 +682,7 @@ def post_provision_runtime_setup(
|
|
672
682
|
and other necessary files to the VM.
|
673
683
|
3. Run setup commands to install dependencies.
|
674
684
|
4. Start ray cluster and skylet.
|
685
|
+
5. (Optional) Setup logging agent.
|
675
686
|
|
676
687
|
Raises:
|
677
688
|
RuntimeError: If the setup process encounters any error.
|
sky/resources.py
CHANGED
@@ -225,7 +225,7 @@ class Resources:
|
|
225
225
|
autostop: the autostop configuration to use. For launched resources,
|
226
226
|
may or may not correspond to the actual current autostop config.
|
227
227
|
priority: the priority for this resource configuration. Must be an
|
228
|
-
integer from
|
228
|
+
integer from -1000 to 1000, where higher values indicate higher priority.
|
229
229
|
If None, no priority is set.
|
230
230
|
volumes: the volumes to mount on the instance.
|
231
231
|
_docker_login_config: the docker configuration to use. This includes
|
@@ -631,7 +631,7 @@ class Resources:
|
|
631
631
|
def priority(self) -> Optional[int]:
|
632
632
|
"""The priority for this resource configuration.
|
633
633
|
|
634
|
-
Higher values indicate higher priority. Valid range is
|
634
|
+
Higher values indicate higher priority. Valid range is -1000 to 1000.
|
635
635
|
"""
|
636
636
|
return self._priority
|
637
637
|
|
@@ -824,14 +824,15 @@ class Resources:
|
|
824
824
|
"""Sets the priority for this resource configuration.
|
825
825
|
|
826
826
|
Args:
|
827
|
-
priority: Priority value from
|
827
|
+
priority: Priority value from -1000 to 1000, where higher values
|
828
828
|
indicate higher priority. If None, no priority is set.
|
829
829
|
"""
|
830
830
|
if priority is not None:
|
831
|
-
if not
|
831
|
+
if not constants.MIN_PRIORITY <= priority <= constants.MAX_PRIORITY:
|
832
832
|
with ux_utils.print_exception_no_traceback():
|
833
|
-
raise ValueError(
|
834
|
-
|
833
|
+
raise ValueError(
|
834
|
+
f'Priority must be between {constants.MIN_PRIORITY} and'
|
835
|
+
f' {constants.MAX_PRIORITY}. Found: {priority}')
|
835
836
|
self._priority = priority
|
836
837
|
|
837
838
|
def _set_volumes(
|
sky/serve/server/core.py
CHANGED
@@ -297,6 +297,8 @@ def up(
|
|
297
297
|
assert task.service is not None
|
298
298
|
protocol = ('http'
|
299
299
|
if task.service.tls_credential is None else 'https')
|
300
|
+
socket_endpoint = socket_endpoint.replace('https://', '').replace(
|
301
|
+
'http://', '')
|
300
302
|
endpoint = f'{protocol}://{socket_endpoint}'
|
301
303
|
|
302
304
|
logger.info(
|
@@ -716,6 +718,9 @@ def status(
|
|
716
718
|
else:
|
717
719
|
protocol = ('https'
|
718
720
|
if service_record['tls_encrypted'] else 'http')
|
721
|
+
if endpoint is not None:
|
722
|
+
endpoint = endpoint.replace('https://',
|
723
|
+
'').replace('http://', '')
|
719
724
|
service_record['endpoint'] = f'{protocol}://{endpoint}'
|
720
725
|
|
721
726
|
return service_records
|
sky/server/common.py
CHANGED
@@ -13,7 +13,7 @@ import subprocess
|
|
13
13
|
import sys
|
14
14
|
import time
|
15
15
|
import typing
|
16
|
-
from typing import Any, Dict, Literal, Optional
|
16
|
+
from typing import Any, Dict, Literal, Optional, Tuple
|
17
17
|
from urllib import parse
|
18
18
|
import uuid
|
19
19
|
|
@@ -128,6 +128,8 @@ class ApiServerInfo:
|
|
128
128
|
version: Optional[str] = None
|
129
129
|
version_on_disk: Optional[str] = None
|
130
130
|
commit: Optional[str] = None
|
131
|
+
user: Optional[Dict[str, Any]] = None
|
132
|
+
basic_auth_enabled: bool = False
|
131
133
|
|
132
134
|
|
133
135
|
def get_api_cookie_jar_path() -> pathlib.Path:
|
@@ -261,11 +263,15 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
261
263
|
version = result.get('version')
|
262
264
|
version_on_disk = result.get('version_on_disk')
|
263
265
|
commit = result.get('commit')
|
266
|
+
user = result.get('user')
|
267
|
+
basic_auth_enabled = result.get('basic_auth_enabled')
|
264
268
|
server_info = ApiServerInfo(status=ApiServerStatus.HEALTHY,
|
265
269
|
api_version=api_version,
|
266
270
|
version=version,
|
267
271
|
version_on_disk=version_on_disk,
|
268
|
-
commit=commit
|
272
|
+
commit=commit,
|
273
|
+
user=user,
|
274
|
+
basic_auth_enabled=basic_auth_enabled)
|
269
275
|
if api_version is None or version is None or commit is None:
|
270
276
|
logger.warning(f'API server response missing '
|
271
277
|
f'version info. {server_url} may '
|
@@ -320,7 +326,8 @@ def get_request_id(response: 'requests.Response') -> RequestId:
|
|
320
326
|
|
321
327
|
def _start_api_server(deploy: bool = False,
|
322
328
|
host: str = '127.0.0.1',
|
323
|
-
foreground: bool = False
|
329
|
+
foreground: bool = False,
|
330
|
+
enable_basic_auth: bool = False):
|
324
331
|
"""Starts a SkyPilot API server locally."""
|
325
332
|
server_url = get_server_url(host)
|
326
333
|
assert server_url in AVAILABLE_LOCAL_API_SERVER_URLS, (
|
@@ -354,6 +361,8 @@ def _start_api_server(deploy: bool = False,
|
|
354
361
|
if foreground:
|
355
362
|
# Replaces the current process with the API server
|
356
363
|
os.environ[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
|
364
|
+
if enable_basic_auth:
|
365
|
+
os.environ[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
|
357
366
|
os.execvp(args[0], args)
|
358
367
|
|
359
368
|
log_path = os.path.expanduser(constants.API_SERVER_LOGS)
|
@@ -365,6 +374,8 @@ def _start_api_server(deploy: bool = False,
|
|
365
374
|
# the API server.
|
366
375
|
server_env = os.environ.copy()
|
367
376
|
server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
|
377
|
+
if enable_basic_auth:
|
378
|
+
server_env[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
|
368
379
|
with open(log_path, 'w', encoding='utf-8') as log_file:
|
369
380
|
# Because the log file is opened using a with statement, it may seem
|
370
381
|
# that the file will be closed when the with statement is exited
|
@@ -428,10 +439,10 @@ def _start_api_server(deploy: bool = False,
|
|
428
439
|
|
429
440
|
def check_server_healthy(
|
430
441
|
endpoint: Optional[str] = None
|
431
|
-
) -> Literal[
|
442
|
+
) -> Tuple[Literal[
|
432
443
|
# Use an incomplete list of Literals here to enforce raising for other
|
433
444
|
# enum values.
|
434
|
-
ApiServerStatus.HEALTHY, ApiServerStatus.NEEDS_AUTH]:
|
445
|
+
ApiServerStatus.HEALTHY, ApiServerStatus.NEEDS_AUTH], ApiServerInfo]:
|
435
446
|
"""Check if the API server is healthy.
|
436
447
|
|
437
448
|
Args:
|
@@ -508,7 +519,7 @@ def check_server_healthy(
|
|
508
519
|
|
509
520
|
hinted_for_server_install_version_mismatch = True
|
510
521
|
|
511
|
-
return api_server_status
|
522
|
+
return api_server_status, api_server_info
|
512
523
|
|
513
524
|
|
514
525
|
def _get_version_info_hint(server_info: ApiServerInfo) -> str:
|
@@ -559,10 +570,11 @@ def get_skypilot_version_on_disk() -> str:
|
|
559
570
|
|
560
571
|
def check_server_healthy_or_start_fn(deploy: bool = False,
|
561
572
|
host: str = '127.0.0.1',
|
562
|
-
foreground: bool = False
|
573
|
+
foreground: bool = False,
|
574
|
+
enable_basic_auth: bool = False):
|
563
575
|
api_server_status = None
|
564
576
|
try:
|
565
|
-
api_server_status = check_server_healthy()
|
577
|
+
api_server_status, _ = check_server_healthy()
|
566
578
|
if api_server_status == ApiServerStatus.NEEDS_AUTH:
|
567
579
|
endpoint = get_server_url()
|
568
580
|
with ux_utils.print_exception_no_traceback():
|
@@ -580,7 +592,7 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
|
|
580
592
|
# have started the server while we were waiting for the lock.
|
581
593
|
api_server_info = get_api_server_status(endpoint)
|
582
594
|
if api_server_info.status == ApiServerStatus.UNHEALTHY:
|
583
|
-
_start_api_server(deploy, host, foreground)
|
595
|
+
_start_api_server(deploy, host, foreground, enable_basic_auth)
|
584
596
|
|
585
597
|
|
586
598
|
def check_server_healthy_or_start(func):
|
sky/server/requests/payloads.py
CHANGED
@@ -336,10 +336,28 @@ class ClusterJobsDownloadLogsBody(RequestBody):
|
|
336
336
|
local_dir: str = constants.SKY_LOGS_DIRECTORY
|
337
337
|
|
338
338
|
|
339
|
+
class UserCreateBody(RequestBody):
|
340
|
+
"""The request body for the user create endpoint."""
|
341
|
+
username: str
|
342
|
+
password: str
|
343
|
+
role: Optional[str] = None
|
344
|
+
|
345
|
+
|
346
|
+
class UserDeleteBody(RequestBody):
|
347
|
+
"""The request body for the user delete endpoint."""
|
348
|
+
user_id: str
|
349
|
+
|
350
|
+
|
339
351
|
class UserUpdateBody(RequestBody):
|
340
352
|
"""The request body for the user update endpoint."""
|
341
353
|
user_id: str
|
342
|
-
role: str
|
354
|
+
role: Optional[str] = None
|
355
|
+
password: Optional[str] = None
|
356
|
+
|
357
|
+
|
358
|
+
class UserImportBody(RequestBody):
|
359
|
+
"""The request body for the user import endpoint."""
|
360
|
+
csv_content: str
|
343
361
|
|
344
362
|
|
345
363
|
class DownloadBody(RequestBody):
|