skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. sky/__init__.py +2 -4
  2. sky/backends/backend_utils.py +7 -0
  3. sky/backends/cloud_vm_ray_backend.py +91 -96
  4. sky/cli.py +5 -6311
  5. sky/client/cli.py +66 -639
  6. sky/client/sdk.py +22 -2
  7. sky/clouds/kubernetes.py +8 -0
  8. sky/clouds/scp.py +7 -26
  9. sky/clouds/utils/scp_utils.py +177 -124
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
  14. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
  16. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
  18. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
  19. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  20. sky/dashboard/out/clusters/[cluster].html +1 -1
  21. sky/dashboard/out/clusters.html +1 -1
  22. sky/dashboard/out/config.html +1 -1
  23. sky/dashboard/out/index.html +1 -1
  24. sky/dashboard/out/infra/[context].html +1 -1
  25. sky/dashboard/out/infra.html +1 -1
  26. sky/dashboard/out/jobs/[job].html +1 -1
  27. sky/dashboard/out/jobs.html +1 -1
  28. sky/dashboard/out/users.html +1 -1
  29. sky/dashboard/out/workspace/new.html +1 -1
  30. sky/dashboard/out/workspaces/[name].html +1 -1
  31. sky/dashboard/out/workspaces.html +1 -1
  32. sky/global_user_state.py +50 -11
  33. sky/jobs/controller.py +98 -31
  34. sky/jobs/scheduler.py +37 -29
  35. sky/jobs/server/core.py +36 -3
  36. sky/jobs/state.py +69 -9
  37. sky/jobs/utils.py +11 -0
  38. sky/logs/__init__.py +17 -0
  39. sky/logs/agent.py +73 -0
  40. sky/logs/gcp.py +91 -0
  41. sky/models.py +1 -0
  42. sky/provision/__init__.py +1 -0
  43. sky/provision/instance_setup.py +35 -0
  44. sky/provision/provisioner.py +11 -0
  45. sky/provision/scp/__init__.py +15 -0
  46. sky/provision/scp/config.py +93 -0
  47. sky/provision/scp/instance.py +528 -0
  48. sky/resources.py +164 -29
  49. sky/server/common.py +21 -9
  50. sky/server/requests/payloads.py +19 -1
  51. sky/server/server.py +121 -29
  52. sky/setup_files/dependencies.py +11 -1
  53. sky/skylet/constants.py +48 -1
  54. sky/skylet/job_lib.py +83 -19
  55. sky/task.py +171 -21
  56. sky/templates/kubernetes-ray.yml.j2 +60 -4
  57. sky/templates/scp-ray.yml.j2 +3 -50
  58. sky/users/permission.py +47 -34
  59. sky/users/rbac.py +10 -1
  60. sky/users/server.py +274 -9
  61. sky/utils/command_runner.py +1 -1
  62. sky/utils/common_utils.py +16 -14
  63. sky/utils/context.py +1 -1
  64. sky/utils/controller_utils.py +12 -3
  65. sky/utils/dag_utils.py +17 -4
  66. sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
  67. sky/utils/schemas.py +83 -5
  68. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
  69. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
  70. sky/benchmark/__init__.py +0 -0
  71. sky/benchmark/benchmark_state.py +0 -295
  72. sky/benchmark/benchmark_utils.py +0 -641
  73. sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
  74. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
  75. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
  76. sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
  77. sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
  78. sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
  79. sky/skylet/providers/scp/__init__.py +0 -2
  80. sky/skylet/providers/scp/config.py +0 -149
  81. sky/skylet/providers/scp/node_provider.py +0 -578
  82. /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
  83. /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
  84. /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
  85. /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
  86. /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
  87. /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
  88. /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
  89. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
  90. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
  91. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
  92. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
sky/logs/agent.py ADDED
@@ -0,0 +1,73 @@
1
+ """Base class for all logging agents."""
2
+ import abc
3
+ import os
4
+ import shlex
5
+ from typing import Any, Dict
6
+
7
+ from sky.skylet import constants
8
+ from sky.utils import common_utils
9
+ from sky.utils import resources_utils
10
+
11
+
12
class LoggingAgent(abc.ABC):
    """Abstract interface implemented by every logging agent.

    A concrete agent provides two things that the provisioner uses to set
    the agent up on each node: the setup command (`get_setup_command`) and
    the credential file mounts (`get_credential_file_mounts`).
    """

    @abc.abstractmethod
    def get_setup_command(self,
                          cluster_name: resources_utils.ClusterName) -> str:
        """Return the setup command of the agent for the given cluster."""

    @abc.abstractmethod
    def get_credential_file_mounts(self) -> Dict[str, str]:
        """Return the credential file mounts required by the agent."""
29
+
30
+
31
class FluentbitAgent(LoggingAgent):
    """Common base for logging stores that use Fluent Bit as the agent.

    Subclasses provide the output section through `fluentbit_output_config`;
    the install / configure / start command and the input section are built
    here.
    """

    def get_setup_command(self,
                          cluster_name: resources_utils.ClusterName) -> str:
        """Return a shell command that installs, configures and starts
        Fluent Bit.

        The command runs three steps under `set -e`:
          1. Install Fluent Bit with the upstream install script, unless a
             `fluent-bit` binary is already on PATH.
          2. Write the rendered YAML config to
             `<LOGGING_CONFIG_DIR>/fluentbit.yaml`.
          3. Start Fluent Bit in the background with that config; its own
             output goes to /tmp/fluentbit.log.
        """
        install_step = ' '.join([
            'if ! command -v fluent-bit >/dev/null 2>&1; then',
            'sudo apt-get install -y gnupg;',
            # pylint: disable=line-too-long
            'curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh;',
            'fi',
        ])

        rendered_config = self.fluentbit_config(cluster_name)
        config_dir = constants.LOGGING_CONFIG_DIR
        config_file = os.path.join(config_dir, 'fluentbit.yaml')
        # The YAML is shell-quoted so that it survives `echo` unchanged.
        write_config_step = (f'mkdir -p {config_dir} && '
                             f'echo {shlex.quote(rendered_config)} > '
                             f'{config_file}')

        # Fall back to the fixed /opt path when the binary is not on PATH.
        binary = ('$(command -v fluent-bit || '
                  'echo "/opt/fluent-bit/bin/fluent-bit")')
        start_step = (f'nohup {binary} '
                      f'-c {config_file} > /tmp/fluentbit.log 2>&1 &')

        steps = [install_step, write_config_step, start_step]
        return 'set -e; ' + '; '.join(steps)

    def fluentbit_config(self,
                         cluster_name: resources_utils.ClusterName) -> str:
        """Return the Fluent Bit pipeline config for the cluster as YAML.

        The pipeline has a single `tail` input over
        `<SKY_LOGS_DIRECTORY>/*/*.log` and a single output supplied by
        `fluentbit_output_config`.
        """
        tail_input = {
            'name': 'tail',
            'path': f'{constants.SKY_LOGS_DIRECTORY}/*/*.log',
            'path_key': 'log_path',
            # Every job creates a new log file, so the refresh interval is
            # shortened from 60s to 1s to pick new files up quickly: the VM
            # might be autodown within a minute right after the job
            # completion.
            'refresh_interval': 1,
        }
        pipeline = {
            'inputs': [tail_input],
            'outputs': [self.fluentbit_output_config(cluster_name)],
        }
        return common_utils.dump_yaml_str({'pipeline': pipeline})

    @abc.abstractmethod
    def fluentbit_output_config(
            self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
        """Return the entry placed in the pipeline's `outputs` list."""
sky/logs/gcp.py ADDED
@@ -0,0 +1,91 @@
1
+ """GCP logging agent."""
2
+
3
+ from typing import Any, Dict, Optional
4
+
5
+ import pydantic
6
+
7
+ from sky.clouds import gcp
8
+ from sky.logs.agent import FluentbitAgent
9
+ from sky.utils import resources_utils
10
+
11
+
12
class _GCPLoggingConfig(pydantic.BaseModel):
    """Configuration for GCP logging agent.

    Every field is optional and defaults to None.
    """
    # Forwarded to the stackdriver output as `export_to_project_id`.
    project_id: Optional[str] = None
    # Path of a credentials file. When set, it replaces the default GCP
    # application credential path in the setup command and is returned as a
    # credential file mount (same path on both sides).
    credentials_file: Optional[str] = None
    # Extra labels merged into the cluster labels attached to the output.
    additional_labels: Optional[Dict[str, str]] = None
17
+
18
+
19
class _StackdriverOutputConfig(pydantic.BaseModel):
    """Auxiliary model for building stackdriver output config in YAML.

    Ref: https://docs.fluentbit.io/manual/1.7/pipeline/outputs/stackdriver
    """
    name: str = 'stackdriver'
    match: str = '*'
    export_to_project_id: Optional[str] = None
    labels: Optional[Dict[str, str]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the output config as a plain dict ready for YAML dumping.

        Fields that are None are omitted. `labels` is converted from a
        `{k: v}` mapping to the comma-separated `k=v` string form; an empty
        mapping is omitted altogether.
        """
        config = self.model_dump(exclude_none=True)
        if self.labels:
            # Replace the label format from `{k: v}` to `k=v`
            config['labels'] = ','.join(
                f'{k}={v}' for k, v in self.labels.items())
        else:
            # An empty dict is not None, so it survives `exclude_none` and
            # would be emitted as a mapping instead of a label string.
            config.pop('labels', None)
        return config
36
+
37
+
38
class GCPLoggingAgent(FluentbitAgent):
    """GCP logging agent.

    Ships logs through Fluent Bit's stackdriver output, labelled with the
    cluster's display name and its name on the cloud.
    """

    def __init__(self, config: Dict[str, Any]):
        """Validate and store the raw config dict as `_GCPLoggingConfig`."""
        self.config = _GCPLoggingConfig(**config)

    def get_setup_command(self,
                          cluster_name: resources_utils.ClusterName) -> str:
        """Return the credential check followed by the Fluent Bit setup.

        The returned command exports GOOGLE_APPLICATION_CREDENTIALS, checks
        that usable credentials exist and exits with status 1 if they do
        not; otherwise it continues with the command built by
        `FluentbitAgent.get_setup_command`.
        """
        credential_path = gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH
        if self.config.credentials_file:
            credential_path = self.config.credentials_file
        # Set GOOGLE_APPLICATION_CREDENTIALS and check whether credentials
        # is valid.
        # Stackdriver only support service account credentials or credentials
        # from metadata server (only available on GCE or GKE). If the default
        # credentials uploaded by API server is NOT a service account key and
        # there is NO metadata server available, the logging agent will fail to
        # authenticate and we require the user to upload a service account key
        # via logs.gcp.credentials_file in this case.
        # Also note that we use env var instead of YAML config to specify the
        # service account key file path in order to resolve the home directory
        # more reliably.
        # Ref: https://github.com/fluent/fluent-bit/issues/8804
        # TODO(aylei): check whether the credentials config is valid before
        # provision.
        pre_cmd = (f'export GOOGLE_APPLICATION_CREDENTIALS={credential_path}; '
                   f'cat {credential_path} | grep "service_account" || '
                   f'(echo "Credentials file {credential_path} is not a '
                   'service account key, check metadata server" && '
                   'curl -s http://metadata.google.internal >/dev/null || '
                   '(echo "Neither service account key nor metadata server is '
                   'available. Set logs.gcp.credentials_file to a service '
                   'account key in server config and retry." && '
                   'exit 1;))')
        # The `exit 1` above runs inside a subshell, so it only sets the
        # status of the check. The base command is a `;`-separated list, so
        # joining with `&&` would guard only its first statement and the
        # rest would still run after a failed check. Exit explicitly.
        setup_cmd = super().get_setup_command(cluster_name)
        return f'{pre_cmd} || exit 1; {setup_cmd}'

    def fluentbit_output_config(
            self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
        """Return the stackdriver output config for the cluster.

        User-provided `additional_labels` are applied last, so they override
        the built-in `skypilot_cluster_*` labels on a key clash.
        """
        display_name = cluster_name.display_name
        unique_name = cluster_name.name_on_cloud

        return _StackdriverOutputConfig(
            export_to_project_id=self.config.project_id,
            labels={
                'skypilot_cluster_name': display_name,
                'skypilot_cluster_id': unique_name,
                **(self.config.additional_labels or {})
            },
        ).to_dict()

    def get_credential_file_mounts(self) -> Dict[str, str]:
        """Return `{credentials_file: credentials_file}` when a credentials
        file is configured, otherwise an empty dict."""
        if self.config.credentials_file:
            return {self.config.credentials_file: self.config.credentials_file}
        return {}
sky/models.py CHANGED
@@ -17,6 +17,7 @@ class User:
17
17
  id: str
18
18
  # Display name of the user
19
19
  name: Optional[str] = None
20
+ password: Optional[str] = None
20
21
 
21
22
  def to_dict(self) -> Dict[str, Any]:
22
23
  return {'id': self.id, 'name': self.name}
sky/provision/__init__.py CHANGED
@@ -24,6 +24,7 @@ from sky.provision import lambda_cloud
24
24
  from sky.provision import nebius
25
25
  from sky.provision import oci
26
26
  from sky.provision import runpod
27
+ from sky.provision import scp
27
28
  from sky.provision import ssh
28
29
  from sky.provision import vast
29
30
  from sky.provision import vsphere
@@ -8,6 +8,7 @@ import time
8
8
  from typing import Any, Callable, Dict, List, Optional, Tuple
9
9
 
10
10
  from sky import exceptions
11
+ from sky import logs
11
12
  from sky import provision
12
13
  from sky import sky_logging
13
14
  from sky.provision import common
@@ -21,6 +22,7 @@ from sky.utils import accelerator_registry
21
22
  from sky.utils import command_runner
22
23
  from sky.utils import common_utils
23
24
  from sky.utils import env_options
25
+ from sky.utils import resources_utils
24
26
  from sky.utils import subprocess_utils
25
27
  from sky.utils import timeline
26
28
  from sky.utils import ux_utils
@@ -557,3 +559,36 @@ def internal_file_mounts(cluster_name: str, common_file_mounts: Dict[str, str],
557
559
  ssh_credentials=ssh_credentials,
558
560
  max_workers=subprocess_utils.get_max_workers_for_file_mounts(
559
561
  common_file_mounts, cluster_info.provider_name))
562
+
563
+
564
@common.log_function_start_end
@timeline.event
def setup_logging_on_cluster(logging_agent: logs.LoggingAgent,
                             cluster_name: resources_utils.ClusterName,
                             cluster_info: common.ClusterInfo,
                             ssh_credentials: Dict[str, Any]) -> None:
    """Set up the logging agent (fluentbit) on all nodes after provisioning.

    The agent's setup command is run on the nodes through
    `_parallel_ssh_with_cache` under the 'logging_setup' stage.

    Raises:
        RuntimeError: If the setup command exits with a non-zero code on a
            node (raised from the per-node step, which is wrapped by
            `_auto_retry`).
    """
    stage = 'logging_setup'
    name_on_cloud = cluster_name.name_on_cloud
    _hint_worker_log_path(name_on_cloud, cluster_info, stage)

    @_auto_retry()
    def _setup_node(runner: command_runner.CommandRunner, log_path: str):
        setup_cmd = logging_agent.get_setup_command(cluster_name)
        logger.info(f'Running command on node: {setup_cmd}')
        returncode, stdout, stderr = runner.run(setup_cmd,
                                                stream_logs=False,
                                                require_outputs=True,
                                                log_path=log_path,
                                                source_bashrc=True)
        if not returncode:
            return
        failure = (f'Failed to setup logging agent\n{setup_cmd}\n'
                   f'(exit code {returncode}). Error: '
                   f'===== stdout ===== \n{stdout}\n'
                   f'===== stderr ====={stderr}')
        raise RuntimeError(failure)

    _parallel_ssh_with_cache(_setup_node,
                             name_on_cloud,
                             stage_name=stage,
                             digest=None,
                             cluster_info=cluster_info,
                             ssh_credentials=ssh_credentials)
@@ -16,6 +16,7 @@ import sky
16
16
  from sky import clouds
17
17
  from sky import exceptions
18
18
  from sky import global_user_state
19
+ from sky import logs
19
20
  from sky import provision
20
21
  from sky import sky_logging
21
22
  from sky import skypilot_config
@@ -648,6 +649,15 @@ def _post_provision_setup(
648
649
  logger.debug('Ray cluster is ready. Skip starting ray cluster on '
649
650
  'worker nodes.')
650
651
 
652
+ logging_agent = logs.get_logging_agent()
653
+ if logging_agent:
654
+ status.update(
655
+ ux_utils.spinner_message('Setting up logging agent',
656
+ provision_logging.config.log_path))
657
+ instance_setup.setup_logging_on_cluster(logging_agent, cluster_name,
658
+ cluster_info,
659
+ ssh_credentials)
660
+
651
661
  instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
652
662
  cluster_info, ssh_credentials)
653
663
 
@@ -672,6 +682,7 @@ def post_provision_runtime_setup(
672
682
  and other necessary files to the VM.
673
683
  3. Run setup commands to install dependencies.
674
684
  4. Start ray cluster and skylet.
685
+ 5. (Optional) Setup logging agent.
675
686
 
676
687
  Raises:
677
688
  RuntimeError: If the setup process encounters any error.
@@ -0,0 +1,15 @@
1
+ """SCP provisioner for SkyPilot."""
2
+
3
+ from sky.provision.scp.config import bootstrap_instances
4
+ from sky.provision.scp.instance import cleanup_ports
5
+ from sky.provision.scp.instance import get_cluster_info
6
+ from sky.provision.scp.instance import open_ports
7
+ from sky.provision.scp.instance import query_instances
8
+ from sky.provision.scp.instance import run_instances
9
+ from sky.provision.scp.instance import stop_instances
10
+ from sky.provision.scp.instance import terminate_instances
11
+ from sky.provision.scp.instance import wait_instances
12
+
13
+ __all__ = ('bootstrap_instances', 'cleanup_ports', 'get_cluster_info',
14
+ 'open_ports', 'query_instances', 'run_instances', 'stop_instances',
15
+ 'terminate_instances', 'wait_instances')
@@ -0,0 +1,93 @@
1
+ """SCP configuration bootstrapping."""
2
+
3
+ import subprocess
4
+
5
+ from sky.clouds.utils import scp_utils
6
+ from sky.provision import common
7
+
8
+
9
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Bootstraps instances for the given cluster.

    Resolves the zone id of `region`, stores it in `node_config['zone_id']`,
    and fills `docker_config` with the image, zone, server type, init script,
    key pair, block storage and NIC settings. `config` is modified in place
    and returned; `cluster_name` is unused.

    NOTE(review): `docker_config` appears to be reused to carry the SCP
    server creation parameters -- confirm against the consumer in
    `sky/provision/scp/instance.py`.
    """
    del cluster_name

    node_config = config.node_config
    service_zone_id = _get_zone_id(region)
    node_config['zone_id'] = service_zone_id

    server_params = config.docker_config
    server_params['imageId'] = node_config['imageId']
    server_params['serviceZoneId'] = service_zone_id
    server_params['serverType'] = node_config['InstanceType']
    server_params['contractId'] = 'None'
    server_params['initialScript'] = _get_init_script(
        node_config['AuthorizedKey'])

    # Keyword arguments are evaluated left to right, so the key pair lookup
    # still happens before `diskSize` is read.
    server_params.update(
        deletionProtectionEnabled=False,
        keyPairId=_get_key_pair_id(),
        blockStorage={
            'blockStorageName': 'skystorage',
            'diskSize': node_config['diskSize'],
            'encryptEnabled': False,
            'productId': 'PRODUCT-sRlJ34iBr9hOxN9J5PrQxo'
        },
        nic={'natEnabled': True},
    )

    return config
45
+
46
+
47
def _get_zone_id(region_name: str):
    """Return the service zone id whose zone name equals `region_name`.

    The zone list is fetched from `scp_utils.SCPClient().get_zones()`.

    Raises:
        KeyError: If no zone is named `region_name`.
    """
    zones = scp_utils.SCPClient().get_zones()
    # Built as a full mapping so that, as before, a later duplicate name
    # overrides an earlier one.
    id_by_name = dict(
        (zone['serviceZoneName'], zone['serviceZoneId']) for zone in zones)
    return id_by_name[region_name]
53
+
54
+
55
def _get_init_script(ssh_public_key: str):
    """Return the base64-encoded initial script spec for a new server.

    The script is the default config commands followed by the commands that
    append `ssh_public_key` to `~/.ssh/authorized_keys`.

    Args:
        ssh_public_key: Public key to authorize on the server.

    Returns:
        A dict with `encodingType`, `initialScriptShell`,
        `initialScriptType` and `initialScriptContent` (the base64 text).
    """
    # Imported locally so this function stays self-contained.
    import base64  # pylint: disable=import-outside-toplevel

    script = _get_default_config_cmd() + _get_ssh_key_gen_cmd(ssh_public_key)
    # Encode in-process instead of `echo "<script>" | base64` via a shell:
    # the local shell would expand `$`, backticks and `"` found in the key,
    # and GNU base64 wraps its output at 76 columns. The trailing newline
    # that `echo` used to append is kept, so the decoded script is unchanged.
    encoded = base64.b64encode(f'{script}\n'.encode('utf-8')).decode('ascii')
    return {
        'encodingType': 'base64',
        'initialScriptShell': 'bash',
        'initialScriptType': 'text',
        'initialScriptContent': encoded
    }


def _get_default_config_cmd():
    """Return the package setup commands, each terminated by '; '."""
    cmd_list = ['apt-get update', 'apt-get -y install python3-pip']
    return ''.join(f'{cmd}; ' for cmd in cmd_list)


def _get_ssh_key_gen_cmd(ssh_public_key: str):
    """Return commands that append `ssh_public_key` to authorized_keys.

    The commands create `~/.ssh/` and the `authorized_keys` file if needed,
    append the key, then set the file mode to 644 and the directory mode to
    700.
    """
    cmd_st = 'mkdir -p ~/.ssh/; touch ~/.ssh/authorized_keys;'
    cmd_ed = 'chmod 644 ~/.ssh/authorized_keys; chmod 700 ~/.ssh/'
    cmd = f"echo '{ssh_public_key}' &>>~/.ssh/authorized_keys;"  # pylint: disable=invalid-string-quote
    return cmd_st + cmd + cmd_ed
87
+
88
+
89
def _get_key_pair_id():
    """Return the id of the first key pair reported by the SCP client.

    Raises:
        RuntimeError: If the account has no key pair (`totalCount` is 0).
    """
    listing = scp_utils.SCPClient().get_key_pairs()
    if listing['totalCount'] != 0:
        first_pair = listing['contents'][0]
        return first_pair['keyPairId']
    raise RuntimeError('create key pair')