skypilot-nightly 1.0.0.dev20250617__py3-none-any.whl → 1.0.0.dev20250619__py3-none-any.whl

This diff shows the content of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
Files changed (74)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +7 -0
  3. sky/backends/cloud_vm_ray_backend.py +48 -36
  4. sky/cli.py +5 -5729
  5. sky/client/cli/__init__.py +0 -0
  6. sky/client/{cli.py → cli/command.py} +108 -632
  7. sky/client/cli/deprecation_utils.py +99 -0
  8. sky/client/cli/flags.py +342 -0
  9. sky/client/sdk.py +22 -2
  10. sky/clouds/kubernetes.py +5 -0
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
  14. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +1 -0
  16. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
  17. sky/dashboard/out/_next/static/{vA3PPpkBwpRTRNBHFYAw_ → whetcrnbXtqQcMRbXUbhW}/_buildManifest.js +1 -1
  18. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  19. sky/dashboard/out/clusters/[cluster].html +1 -1
  20. sky/dashboard/out/clusters.html +1 -1
  21. sky/dashboard/out/config.html +1 -1
  22. sky/dashboard/out/index.html +1 -1
  23. sky/dashboard/out/infra/[context].html +1 -1
  24. sky/dashboard/out/infra.html +1 -1
  25. sky/dashboard/out/jobs/[job].html +1 -1
  26. sky/dashboard/out/jobs.html +1 -1
  27. sky/dashboard/out/users.html +1 -1
  28. sky/dashboard/out/workspace/new.html +1 -1
  29. sky/dashboard/out/workspaces/[name].html +1 -1
  30. sky/dashboard/out/workspaces.html +1 -1
  31. sky/global_user_state.py +50 -11
  32. sky/jobs/constants.py +0 -2
  33. sky/jobs/scheduler.py +7 -4
  34. sky/jobs/server/core.py +6 -3
  35. sky/jobs/state.py +9 -8
  36. sky/jobs/utils.py +1 -1
  37. sky/logs/__init__.py +17 -0
  38. sky/logs/agent.py +73 -0
  39. sky/logs/gcp.py +91 -0
  40. sky/models.py +1 -0
  41. sky/provision/common.py +10 -0
  42. sky/provision/instance_setup.py +35 -0
  43. sky/provision/provisioner.py +11 -0
  44. sky/resources.py +7 -6
  45. sky/serve/server/core.py +5 -0
  46. sky/server/common.py +21 -9
  47. sky/server/requests/payloads.py +19 -1
  48. sky/server/server.py +121 -29
  49. sky/setup_files/dependencies.py +11 -1
  50. sky/skylet/constants.py +13 -1
  51. sky/skylet/job_lib.py +75 -19
  52. sky/templates/kubernetes-ray.yml.j2 +9 -0
  53. sky/users/permission.py +49 -19
  54. sky/users/rbac.py +10 -1
  55. sky/users/server.py +274 -9
  56. sky/utils/env_options.py +6 -0
  57. sky/utils/schemas.py +42 -2
  58. {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250619.dist-info}/METADATA +9 -1
  59. {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250619.dist-info}/RECORD +70 -63
  60. sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
  61. sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
  62. sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
  63. sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
  64. /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
  65. /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
  66. /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
  67. /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
  68. /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
  69. /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
  70. /sky/dashboard/out/_next/static/{vA3PPpkBwpRTRNBHFYAw_ → whetcrnbXtqQcMRbXUbhW}/_ssgManifest.js +0 -0
  71. {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250619.dist-info}/WHEEL +0 -0
  72. {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250619.dist-info}/entry_points.txt +0 -0
  73. {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250619.dist-info}/licenses/LICENSE +0 -0
  74. {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250619.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -113,7 +113,7 @@ def create_table(cursor, conn):
 
     # `job_info` contains the mapping from job_id to the job_name, as well as
     # information used by the scheduler.
-    cursor.execute("""\
+    cursor.execute(f"""\
         CREATE TABLE IF NOT EXISTS job_info (
         spot_job_id INTEGER PRIMARY KEY AUTOINCREMENT,
         name TEXT,
@@ -123,7 +123,7 @@ def create_table(cursor, conn):
         env_file_path TEXT,
         user_hash TEXT,
         workspace TEXT DEFAULT NULL,
-        priority INTEGER DEFAULT 500,
+        priority INTEGER DEFAULT {constants.DEFAULT_PRIORITY},
         entrypoint TEXT DEFAULT NULL,
         original_user_yaml_path TEXT DEFAULT NULL)""")
 
@@ -148,12 +148,13 @@ def create_table(cursor, conn):
                                  'TEXT DEFAULT NULL',
                                  value_to_replace_existing_entries='default')
 
-    db_utils.add_column_to_table(cursor,
-                                 conn,
-                                 'job_info',
-                                 'priority',
-                                 'INTEGER',
-                                 value_to_replace_existing_entries=500)
+    db_utils.add_column_to_table(
+        cursor,
+        conn,
+        'job_info',
+        'priority',
+        'INTEGER',
+        value_to_replace_existing_entries=constants.DEFAULT_PRIORITY)
 
     db_utils.add_column_to_table(cursor, conn, 'job_info', 'entrypoint', 'TEXT')
     db_utils.add_column_to_table(cursor, conn, 'job_info',
sky/jobs/utils.py CHANGED
@@ -945,7 +945,7 @@ def dump_managed_job_queue() -> str:
     # Figure out what the highest priority blocking job is. We need to know in
     # order to determine if other jobs are blocked by a higher priority job, or
     # just by the limited controller resources.
-    highest_blocking_priority = 0
+    highest_blocking_priority = constants.MIN_PRIORITY
     for job in jobs:
         if job['schedule_state'] not in (
                 # LAUNCHING and ALIVE_BACKOFF jobs will block other jobs with
sky/logs/__init__.py ADDED
@@ -0,0 +1,17 @@
+"""Sky logging agents."""
+from typing import Optional
+
+from sky import exceptions
+from sky import skypilot_config
+from sky.logs.agent import LoggingAgent
+from sky.logs.gcp import GCPLoggingAgent
+
+
+def get_logging_agent() -> Optional[LoggingAgent]:
+    store = skypilot_config.get_nested(('logs', 'store'), None)
+    if store is None:
+        return None
+    if store == 'gcp':
+        return GCPLoggingAgent(skypilot_config.get_nested(('logs', 'gcp'), {}))
+    raise exceptions.InvalidSkyPilotConfigError(
+        f'Invalid logging store: {store}')
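The factory reads the nested config keys ('logs', 'store') and ('logs', 'gcp'). A minimal sketch of the config shape it expects, with made-up example values (only the key names come from the code above and from _GCPLoggingConfig further down):

# Hypothetical illustration of the config consumed by get_logging_agent().
# Key names are taken from the diff; the values are assumptions, not defaults.
skypilot_config_snippet = {
    'logs': {
        'store': 'gcp',                  # any other non-None value raises InvalidSkyPilotConfigError
        'gcp': {
            'project_id': 'my-project',  # assumed example value
        },
    },
}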
sky/logs/agent.py ADDED
@@ -0,0 +1,73 @@
+"""Base class for all logging agents."""
+import abc
+import os
+import shlex
+from typing import Any, Dict
+
+from sky.skylet import constants
+from sky.utils import common_utils
+from sky.utils import resources_utils
+
+
+class LoggingAgent(abc.ABC):
+    """Base class for all logging agents.
+
+    Each agent should implement the `get_setup_command` and
+    `get_credential_file_mounts` methods to return the setup command and
+    credential file mounts for the agent for provisioner to setup the agent
+    on each node.
+    """
+
+    @abc.abstractmethod
+    def get_setup_command(self,
+                          cluster_name: resources_utils.ClusterName) -> str:
+        pass
+
+    @abc.abstractmethod
+    def get_credential_file_mounts(self) -> Dict[str, str]:
+        pass
+
+
+class FluentbitAgent(LoggingAgent):
+    """Base class for logging store that use fluentbit as the agent."""
+
+    def get_setup_command(self,
+                          cluster_name: resources_utils.ClusterName) -> str:
+        install_cmd = (
+            'if ! command -v fluent-bit >/dev/null 2>&1; then '
+            'sudo apt-get install -y gnupg; '
+            # pylint: disable=line-too-long
+            'curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh; '
+            'fi')
+        cfg = self.fluentbit_config(cluster_name)
+        cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
+        config_cmd = (f'mkdir -p {constants.LOGGING_CONFIG_DIR} && '
+                      f'echo {shlex.quote(cfg)} > {cfg_path}')
+        start_cmd = ('nohup $(command -v fluent-bit || '
+                     'echo "/opt/fluent-bit/bin/fluent-bit") '
+                     f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 &')
+        return f'set -e; {install_cmd}; {config_cmd}; {start_cmd}'
+
+    def fluentbit_config(self,
+                         cluster_name: resources_utils.ClusterName) -> str:
+        cfg_dict = {
+            'pipeline': {
+                'inputs': [{
+                    'name': 'tail',
+                    'path': f'{constants.SKY_LOGS_DIRECTORY}/*/*.log',
+                    'path_key': 'log_path',
+                    # Shorten the refresh interval from 60s to 1s since every
+                    # job creates a new log file and we must be responsive
+                    # for this: the VM might be autodown within a minute
+                    # right after the job completion.
+                    'refresh_interval': 1,
+                }],
+                'outputs': [self.fluentbit_output_config(cluster_name)],
+            }
+        }
+        return common_utils.dump_yaml_str(cfg_dict)
+
+    @abc.abstractmethod
+    def fluentbit_output_config(
+            self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
+        pass
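FluentbitAgent leaves only the output section and the credential mounts abstract. A minimal sketch of a concrete subclass (hypothetical, not part of this release) that plugs fluent-bit's built-in stdout output into the generated pipeline:

# Hypothetical subclass for illustration; assumes the skypilot-nightly wheel
# from this diff is installed. 'stdout' is a built-in fluent-bit output
# plugin, so the pipeline would tail the SkyPilot log files and print each
# record to the agent process's own stdout.
from typing import Any, Dict

from sky.logs.agent import FluentbitAgent
from sky.utils import resources_utils


class StdoutLoggingAgent(FluentbitAgent):
    """Sketch: ship collected logs to the fluent-bit process's stdout."""

    def fluentbit_output_config(
            self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
        return {'name': 'stdout', 'match': '*'}

    def get_credential_file_mounts(self) -> Dict[str, str]:
        # A local stdout sink needs no credentials on the nodes.
        return {}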
sky/logs/gcp.py ADDED
@@ -0,0 +1,91 @@
+"""GCP logging agent."""
+
+from typing import Any, Dict, Optional
+
+import pydantic
+
+from sky.clouds import gcp
+from sky.logs.agent import FluentbitAgent
+from sky.utils import resources_utils
+
+
+class _GCPLoggingConfig(pydantic.BaseModel):
+    """Configuration for GCP logging agent."""
+    project_id: Optional[str] = None
+    credentials_file: Optional[str] = None
+    additional_labels: Optional[Dict[str, str]] = None
+
+
+class _StackdriverOutputConfig(pydantic.BaseModel):
+    """Auxiliary model for building stackdriver output config in YAML.
+
+    Ref: https://docs.fluentbit.io/manual/1.7/pipeline/outputs/stackdriver
+    """
+    name: str = 'stackdriver'
+    match: str = '*'
+    export_to_project_id: Optional[str] = None
+    labels: Optional[Dict[str, str]] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        config = self.model_dump(exclude_none=True)
+        if self.labels:
+            # Replace the label format from `{k: v}` to `k=v`
+            label_str = ','.join([f'{k}={v}' for k, v in self.labels.items()])
+            config['labels'] = label_str
+        return config
+
+
+class GCPLoggingAgent(FluentbitAgent):
+    """GCP logging agent."""
+
+    def __init__(self, config: Dict[str, Any]):
+        self.config = _GCPLoggingConfig(**config)
+
+    def get_setup_command(self,
+                          cluster_name: resources_utils.ClusterName) -> str:
+        credential_path = gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH
+        if self.config.credentials_file:
+            credential_path = self.config.credentials_file
+        # Set GOOGLE_APPLICATION_CREDENTIALS and check whether credentials
+        # is valid.
+        # Stackdriver only support service account credentials or credentials
+        # from metadata server (only available on GCE or GKE). If the default
+        # credentials uploaded by API server is NOT a service account key and
+        # there is NO metadata server available, the logging agent will fail to
+        # authenticate and we require the user to upload a service account key
+        # via logs.gcp.credentials_file in this case.
+        # Also note that we use env var instead of YAML config to specify the
+        # service account key file path in order to resolve the home directory
+        # more reliably.
+        # Ref: https://github.com/fluent/fluent-bit/issues/8804
+        # TODO(aylei): check whether the credentials config is valid before
+        # provision.
+        pre_cmd = (f'export GOOGLE_APPLICATION_CREDENTIALS={credential_path}; '
+                   f'cat {credential_path} | grep "service_account" || '
+                   f'(echo "Credentials file {credential_path} is not a '
+                   'service account key, check metadata server" && '
+                   'curl -s http://metadata.google.internal >/dev/null || '
+                   f'(echo "Neither service account key nor metadata server is '
+                   'available. Set logs.gcp.credentials_file to a service '
+                   'account key in server config and retry." && '
+                   'exit 1;))')
+        return pre_cmd + ' && ' + super().get_setup_command(cluster_name)
+
+    def fluentbit_output_config(
+            self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
+        display_name = cluster_name.display_name
+        unique_name = cluster_name.name_on_cloud
+
+        return _StackdriverOutputConfig(
+            export_to_project_id=self.config.project_id,
+            labels={
+                'skypilot_cluster_name': display_name,
+                'skypilot_cluster_id': unique_name,
+                **(self.config.additional_labels or {})
+            },
+        ).to_dict()
+
+    def get_credential_file_mounts(self) -> Dict[str, str]:
+        if self.config.credentials_file:
+            return {self.config.credentials_file: self.config.credentials_file}
+        return {}
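A hedged usage sketch of the new agent (all values are invented; it assumes ClusterName can be constructed directly from the display_name and name_on_cloud fields used above):

# Illustration only: build the agent from a config dict and inspect the
# stackdriver output section it would generate for a cluster.
from sky.logs.gcp import GCPLoggingAgent
from sky.utils import resources_utils

agent = GCPLoggingAgent({
    'project_id': 'my-gcp-project',              # assumed example value
    'additional_labels': {'team': 'ml-infra'},   # assumed example value
})
cluster = resources_utils.ClusterName(display_name='my-cluster',
                                      name_on_cloud='my-cluster-2ea4')
# Expected shape: {'name': 'stackdriver', 'match': '*',
#                  'export_to_project_id': 'my-gcp-project',
#                  'labels': 'skypilot_cluster_name=my-cluster,'
#                            'skypilot_cluster_id=my-cluster-2ea4,team=ml-infra'}
print(agent.fluentbit_output_config(cluster))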
sky/models.py CHANGED
@@ -17,6 +17,7 @@ class User:
     id: str
     # Display name of the user
     name: Optional[str] = None
+    password: Optional[str] = None
 
     def to_dict(self) -> Dict[str, Any]:
         return {'id': self.id, 'name': self.name}
sky/provision/common.py CHANGED
@@ -6,6 +6,7 @@ import os
 from typing import Any, Dict, List, Optional, Tuple
 
 from sky import sky_logging
+from sky.utils import env_options
 from sky.utils import resources_utils
 
 # NOTE: we can use pydantic instead of dataclasses or namedtuples, because
@@ -244,6 +245,15 @@ class SocketEndpoint(Endpoint):
 
     def url(self, override_ip: Optional[str] = None) -> str:
         host = override_ip if override_ip else self.host
+        if env_options.Options.RUNNING_IN_BUILDKITE.get(
+        ) and 'localhost' in host:
+            # In Buildkite CI, we run a kind (Kubernetes in Docker) cluster.
+            # The controller pod runs inside this kind cluster, which itself
+            # runs in a container. When the pod tries to access 'localhost',
+            # it can't reach the host machine's localhost. Using
+            # 'host.docker.internal' allows the pod to properly communicate
+            # with services running on the host machine's localhost.
+            host = 'host.docker.internal'
         return f'{host}{":" + str(self.port) if self.port else ""}'
 
 
sky/provision/instance_setup.py CHANGED
@@ -8,6 +8,7 @@ import time
 from typing import Any, Callable, Dict, List, Optional, Tuple
 
 from sky import exceptions
+from sky import logs
 from sky import provision
 from sky import sky_logging
 from sky.provision import common
@@ -21,6 +22,7 @@ from sky.utils import accelerator_registry
 from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import env_options
+from sky.utils import resources_utils
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
@@ -557,3 +559,36 @@ def internal_file_mounts(cluster_name: str, common_file_mounts: Dict[str, str],
         ssh_credentials=ssh_credentials,
         max_workers=subprocess_utils.get_max_workers_for_file_mounts(
             common_file_mounts, cluster_info.provider_name))
+
+
+@common.log_function_start_end
+@timeline.event
+def setup_logging_on_cluster(logging_agent: logs.LoggingAgent,
+                             cluster_name: resources_utils.ClusterName,
+                             cluster_info: common.ClusterInfo,
+                             ssh_credentials: Dict[str, Any]) -> None:
+    """Setup logging agent (fluentbit) on all nodes after provisioning."""
+    _hint_worker_log_path(cluster_name.name_on_cloud, cluster_info,
+                          'logging_setup')
+
+    @_auto_retry()
+    def _setup_node(runner: command_runner.CommandRunner, log_path: str):
+        cmd = logging_agent.get_setup_command(cluster_name)
+        logger.info(f'Running command on node: {cmd}')
+        returncode, stdout, stderr = runner.run(cmd,
+                                                stream_logs=False,
+                                                require_outputs=True,
+                                                log_path=log_path,
+                                                source_bashrc=True)
+        if returncode:
+            raise RuntimeError(f'Failed to setup logging agent\n{cmd}\n'
+                               f'(exit code {returncode}). Error: '
+                               f'===== stdout ===== \n{stdout}\n'
+                               f'===== stderr ====={stderr}')
+
+    _parallel_ssh_with_cache(_setup_node,
+                             cluster_name.name_on_cloud,
+                             stage_name='logging_setup',
+                             digest=None,
+                             cluster_info=cluster_info,
+                             ssh_credentials=ssh_credentials)
sky/provision/provisioner.py CHANGED
@@ -16,6 +16,7 @@ import sky
 from sky import clouds
 from sky import exceptions
 from sky import global_user_state
+from sky import logs
 from sky import provision
 from sky import sky_logging
 from sky import skypilot_config
@@ -648,6 +649,15 @@ def _post_provision_setup(
         logger.debug('Ray cluster is ready. Skip starting ray cluster on '
                      'worker nodes.')
 
+    logging_agent = logs.get_logging_agent()
+    if logging_agent:
+        status.update(
+            ux_utils.spinner_message('Setting up logging agent',
+                                     provision_logging.config.log_path))
+        instance_setup.setup_logging_on_cluster(logging_agent, cluster_name,
+                                                cluster_info,
+                                                ssh_credentials)
+
     instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
                                              cluster_info, ssh_credentials)
 
@@ -672,6 +682,7 @@ def post_provision_runtime_setup(
        and other necessary files to the VM.
     3. Run setup commands to install dependencies.
     4. Start ray cluster and skylet.
+    5. (Optional) Setup logging agent.
 
     Raises:
         RuntimeError: If the setup process encounters any error.
sky/resources.py CHANGED
@@ -225,7 +225,7 @@ class Resources:
       autostop: the autostop configuration to use. For launched resources,
         may or may not correspond to the actual current autostop config.
       priority: the priority for this resource configuration. Must be an
-        integer from 0 to 1000, where higher values indicate higher priority.
+        integer from -1000 to 1000, where higher values indicate higher priority.
        If None, no priority is set.
       volumes: the volumes to mount on the instance.
       _docker_login_config: the docker configuration to use. This includes
@@ -631,7 +631,7 @@ class Resources:
     def priority(self) -> Optional[int]:
         """The priority for this resource configuration.
 
-        Higher values indicate higher priority. Valid range is 0-1000.
+        Higher values indicate higher priority. Valid range is -1000 to 1000.
         """
         return self._priority
 
@@ -824,14 +824,15 @@ class Resources:
         """Sets the priority for this resource configuration.
 
         Args:
-            priority: Priority value from 0 to 1000, where higher values
+            priority: Priority value from -1000 to 1000, where higher values
                 indicate higher priority. If None, no priority is set.
         """
         if priority is not None:
-            if not 0 <= priority <= 1000:
+            if not constants.MIN_PRIORITY <= priority <= constants.MAX_PRIORITY:
                 with ux_utils.print_exception_no_traceback():
-                    raise ValueError(f'Priority must be between 0 and 1000. '
-                                     f'Found: {priority}')
+                    raise ValueError(
+                        f'Priority must be between {constants.MIN_PRIORITY} and'
+                        f' {constants.MAX_PRIORITY}. Found: {priority}')
         self._priority = priority
 
     def _set_volumes(
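The priority constants referenced above are defined elsewhere in the package and are not part of this diff. Inferred from the replaced literals and the updated docstrings, they are presumably along these lines:

# Presumed values (not shown in this diff), inferred from the old hard-coded
# literals and the new '-1000 to 1000' docstrings:
MIN_PRIORITY = -1000      # replaces the hard-coded lower bound of 0
MAX_PRIORITY = 1000       # upper end of the documented range
DEFAULT_PRIORITY = 500    # replaces the hard-coded column default of 500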
sky/serve/server/core.py CHANGED
@@ -297,6 +297,8 @@ def up(
         assert task.service is not None
         protocol = ('http'
                     if task.service.tls_credential is None else 'https')
+        socket_endpoint = socket_endpoint.replace('https://', '').replace(
+            'http://', '')
         endpoint = f'{protocol}://{socket_endpoint}'
 
         logger.info(
@@ -716,6 +718,9 @@ def status(
         else:
             protocol = ('https'
                         if service_record['tls_encrypted'] else 'http')
+            if endpoint is not None:
+                endpoint = endpoint.replace('https://',
+                                            '').replace('http://', '')
             service_record['endpoint'] = f'{protocol}://{endpoint}'
 
     return service_records
sky/server/common.py CHANGED
@@ -13,7 +13,7 @@ import subprocess
 import sys
 import time
 import typing
-from typing import Any, Dict, Literal, Optional
+from typing import Any, Dict, Literal, Optional, Tuple
 from urllib import parse
 import uuid
 
@@ -128,6 +128,8 @@ class ApiServerInfo:
     version: Optional[str] = None
     version_on_disk: Optional[str] = None
     commit: Optional[str] = None
+    user: Optional[Dict[str, Any]] = None
+    basic_auth_enabled: bool = False
 
 
 def get_api_cookie_jar_path() -> pathlib.Path:
@@ -261,11 +263,15 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
             version = result.get('version')
             version_on_disk = result.get('version_on_disk')
             commit = result.get('commit')
+            user = result.get('user')
+            basic_auth_enabled = result.get('basic_auth_enabled')
             server_info = ApiServerInfo(status=ApiServerStatus.HEALTHY,
                                         api_version=api_version,
                                         version=version,
                                         version_on_disk=version_on_disk,
-                                        commit=commit)
+                                        commit=commit,
+                                        user=user,
+                                        basic_auth_enabled=basic_auth_enabled)
             if api_version is None or version is None or commit is None:
                 logger.warning(f'API server response missing '
                                f'version info. {server_url} may '
@@ -320,7 +326,8 @@ def get_request_id(response: 'requests.Response') -> RequestId:
 
 def _start_api_server(deploy: bool = False,
                       host: str = '127.0.0.1',
-                      foreground: bool = False):
+                      foreground: bool = False,
+                      enable_basic_auth: bool = False):
     """Starts a SkyPilot API server locally."""
     server_url = get_server_url(host)
     assert server_url in AVAILABLE_LOCAL_API_SERVER_URLS, (
@@ -354,6 +361,8 @@ def _start_api_server(deploy: bool = False,
     if foreground:
         # Replaces the current process with the API server
         os.environ[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
+        if enable_basic_auth:
+            os.environ[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
         os.execvp(args[0], args)
 
     log_path = os.path.expanduser(constants.API_SERVER_LOGS)
@@ -365,6 +374,8 @@ def _start_api_server(deploy: bool = False,
     # the API server.
     server_env = os.environ.copy()
     server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
+    if enable_basic_auth:
+        server_env[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
     with open(log_path, 'w', encoding='utf-8') as log_file:
         # Because the log file is opened using a with statement, it may seem
         # that the file will be closed when the with statement is exited
@@ -428,10 +439,10 @@ def _start_api_server(deploy: bool = False,
 
 def check_server_healthy(
     endpoint: Optional[str] = None
-) -> Literal[
+) -> Tuple[Literal[
         # Use an incomplete list of Literals here to enforce raising for other
        # enum values.
-        ApiServerStatus.HEALTHY, ApiServerStatus.NEEDS_AUTH]:
+        ApiServerStatus.HEALTHY, ApiServerStatus.NEEDS_AUTH], ApiServerInfo]:
     """Check if the API server is healthy.
 
     Args:
@@ -508,7 +519,7 @@ def check_server_healthy(
 
         hinted_for_server_install_version_mismatch = True
 
-    return api_server_status
+    return api_server_status, api_server_info
 
 
 def _get_version_info_hint(server_info: ApiServerInfo) -> str:
@@ -559,10 +570,11 @@ def get_skypilot_version_on_disk() -> str:
 
 def check_server_healthy_or_start_fn(deploy: bool = False,
                                      host: str = '127.0.0.1',
-                                     foreground: bool = False):
+                                     foreground: bool = False,
+                                     enable_basic_auth: bool = False):
     api_server_status = None
     try:
-        api_server_status = check_server_healthy()
+        api_server_status, _ = check_server_healthy()
         if api_server_status == ApiServerStatus.NEEDS_AUTH:
             endpoint = get_server_url()
             with ux_utils.print_exception_no_traceback():
@@ -580,7 +592,7 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
         # have started the server while we were waiting for the lock.
         api_server_info = get_api_server_status(endpoint)
         if api_server_info.status == ApiServerStatus.UNHEALTHY:
-            _start_api_server(deploy, host, foreground)
+            _start_api_server(deploy, host, foreground, enable_basic_auth)
 
 
 def check_server_healthy_or_start(func):
sky/server/requests/payloads.py CHANGED
@@ -336,10 +336,28 @@ class ClusterJobsDownloadLogsBody(RequestBody):
     local_dir: str = constants.SKY_LOGS_DIRECTORY
 
 
+class UserCreateBody(RequestBody):
+    """The request body for the user create endpoint."""
+    username: str
+    password: str
+    role: Optional[str] = None
+
+
+class UserDeleteBody(RequestBody):
+    """The request body for the user delete endpoint."""
+    user_id: str
+
+
 class UserUpdateBody(RequestBody):
     """The request body for the user update endpoint."""
     user_id: str
-    role: str
+    role: Optional[str] = None
+    password: Optional[str] = None
+
+
+class UserImportBody(RequestBody):
+    """The request body for the user import endpoint."""
+    csv_content: str
 
 
 class DownloadBody(RequestBody):