skypilot-nightly 1.0.0.dev20250617__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +7 -0
- sky/backends/cloud_vm_ray_backend.py +48 -36
- sky/cli.py +5 -5729
- sky/client/cli.py +11 -2
- sky/client/sdk.py +22 -2
- sky/clouds/kubernetes.py +5 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{vA3PPpkBwpRTRNBHFYAw_ → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +50 -11
- sky/logs/__init__.py +17 -0
- sky/logs/agent.py +73 -0
- sky/logs/gcp.py +91 -0
- sky/models.py +1 -0
- sky/provision/instance_setup.py +35 -0
- sky/provision/provisioner.py +11 -0
- sky/server/common.py +21 -9
- sky/server/requests/payloads.py +19 -1
- sky/server/server.py +121 -29
- sky/setup_files/dependencies.py +11 -1
- sky/skylet/constants.py +9 -1
- sky/skylet/job_lib.py +75 -19
- sky/templates/kubernetes-ray.yml.j2 +9 -0
- sky/users/permission.py +49 -19
- sky/users/rbac.py +10 -1
- sky/users/server.py +274 -9
- sky/utils/schemas.py +40 -0
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +58 -54
- sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
- sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
- /sky/dashboard/out/_next/static/{vA3PPpkBwpRTRNBHFYAw_ → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '903f8a7f3955084316b26af619b6b043f43de01c'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250618'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/backend_utils.py
CHANGED
@@ -28,6 +28,7 @@ from sky import check as sky_check
 from sky import clouds
 from sky import exceptions
 from sky import global_user_state
+from sky import logs
 from sky import provision as provision_lib
 from sky import sky_logging
 from sky import skypilot_config
@@ -660,6 +661,12 @@ def write_cluster_config(
 
     credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
 
+    logging_agent = logs.get_logging_agent()
+    if logging_agent:
+        for k, v in logging_agent.get_credential_file_mounts().items():
+            assert k not in credentials, f'{k} already in credentials'
+            credentials[k] = v
+
     private_key_path, _ = auth.get_or_generate_keys()
     auth_config = {'ssh_private_key': private_key_path}
     region_name = resources_vars.get('region')
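The new block above folds the logging agent's credential file mounts into the cluster's credential mounts, refusing to overwrite an existing entry. A minimal, self-contained sketch of that merge behavior; both dicts here are made-up stand-ins, not values from the package:

# Sketch of the credential-mount merge in write_cluster_config.
# The mount paths are hypothetical, for illustration only.
credentials = {'~/.ssh/sky-key.pub': '~/.ssh/sky-key.pub'}
agent_mounts = {'~/.config/logging/creds.json': '/tmp/logging-creds.json'}

for k, v in agent_mounts.items():
    # A key collision is treated as a bug rather than silently overwritten.
    assert k not in credentials, f'{k} already in credentials'
    credentials[k] = v

print(credentials)  # now contains both the SSH key and the agent's mount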
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -142,6 +142,7 @@ _MAX_RAY_UP_RETRY = 5
 _MAX_GET_ZONE_RETRY = 3
 
 _JOB_ID_PATTERN = re.compile(r'Job ID: ([0-9]+)')
+_LOG_DIR_PATTERN = re.compile(r'Log Dir: ([^ ]+)')
 
 # Path to the monkey-patched ray up script.
 # We don't do import then __file__ because that script needs to be filled in
@@ -3461,10 +3462,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_id: int,
         detach_run: bool = False,
         managed_job_dag: Optional['dag.Dag'] = None,
+        remote_log_dir: Optional[str] = None,
     ) -> None:
         """Executes generated code on the head node."""
         script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
-        remote_log_dir
+        if remote_log_dir is None:
+            remote_log_dir = self.log_dir
         remote_log_path = os.path.join(remote_log_dir, 'run.log')
 
         cd = f'cd {SKY_REMOTE_WORKDIR}'
@@ -3583,13 +3586,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             self.tail_logs(handle, job_id)
 
     def _add_job(self, handle: CloudVmRayResourceHandle,
-                 job_name: Optional[str],
+                 job_name: Optional[str],
+                 resources_str: str) -> Tuple[int, str]:
         code = job_lib.JobLibCodeGen.add_job(
             job_name=job_name,
             username=common_utils.get_user_hash(),
             run_timestamp=self.run_timestamp,
             resources_str=resources_str)
-        returncode,
+        returncode, result_str, stderr = self.run_on_head(handle,
                                                           code,
                                                           stream_logs=False,
                                                           require_outputs=True,
@@ -3603,17 +3607,23 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         subprocess_utils.handle_returncode(returncode, code,
                                            'Failed to fetch job id.', stderr)
         try:
-            job_id_match = _JOB_ID_PATTERN.search(
+            job_id_match = _JOB_ID_PATTERN.search(result_str)
             if job_id_match is not None:
                 job_id = int(job_id_match.group(1))
             else:
                 # For backward compatibility.
-                job_id = int(
+                job_id = int(result_str)
+            log_dir_match = _LOG_DIR_PATTERN.search(result_str)
+            if log_dir_match is not None:
+                log_dir = log_dir_match.group(1).strip()
+            else:
+                # For backward compatibility, use the same log dir as local.
+                log_dir = self.log_dir
         except ValueError as e:
             logger.error(stderr)
-            raise ValueError(f'Failed to parse job id: {
+            raise ValueError(f'Failed to parse job id: {result_str}; '
                              f'Returncode: {returncode}') from e
-        return job_id
+        return job_id, log_dir
 
     def _execute(
         self,
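The hunk above is the core of the log-dir change: _add_job now extracts both a job id and a remote log directory from the head node's output, falling back to the legacy behavior when a marker is missing. A runnable sketch of that parsing follows; the sample output string and the fallback path are illustrative assumptions, not values captured from a real cluster:

import re

_JOB_ID_PATTERN = re.compile(r'Job ID: ([0-9]+)')
_LOG_DIR_PATTERN = re.compile(r'Log Dir: ([^ ]+)')

# Hypothetical head-node output; legacy runtimes print only the bare job id.
result_str = 'Job ID: 42\nLog Dir: ~/sky_logs/sky-2025-06-18-00-00-00-000000\n'
fallback_log_dir = '~/sky_logs/sky-local-fallback'  # stand-in for self.log_dir

job_id_match = _JOB_ID_PATTERN.search(result_str)
if job_id_match is not None:
    job_id = int(job_id_match.group(1))
else:
    # Legacy runtimes: the whole output is the job id.
    job_id = int(result_str)

log_dir_match = _LOG_DIR_PATTERN.search(result_str)
if log_dir_match is not None:
    # '[^ ]+' can swallow the trailing newline, hence the strip().
    log_dir = log_dir_match.group(1).strip()
else:
    # Legacy runtimes: reuse the locally generated log dir.
    log_dir = fallback_log_dir

print(job_id, log_dir)  # 42 ~/sky_logs/sky-2025-06-18-00-00-00-000000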
@@ -3665,15 +3675,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             logger.info(f'Dryrun complete. Would have run:\n{task}')
             return None
 
-        job_id = self._add_job(handle, task_copy.name, resources_str)
+        job_id, log_dir = self._add_job(handle, task_copy.name, resources_str)
 
         num_actual_nodes = task.num_nodes * handle.num_ips_per_node
         # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
         if num_actual_nodes > 1:
-            self._execute_task_n_nodes(handle, task_copy, job_id, detach_run
+            self._execute_task_n_nodes(handle, task_copy, job_id, detach_run,
+                                       log_dir)
         else:
             # Case: task_lib.Task(run, num_nodes=1)
-            self._execute_task_one_node(handle, task_copy, job_id, detach_run
+            self._execute_task_one_node(handle, task_copy, job_id, detach_run,
+                                        log_dir)
 
         return job_id
 
@@ -3836,32 +3848,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-        code = job_lib.JobLibCodeGen.
-        returncode,
-
-
-
-
-                                                  separate_stderr=True)
+        code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
+        returncode, job_to_dir, stderr = self.run_on_head(handle,
+                                                          code,
+                                                          stream_logs=False,
+                                                          require_outputs=True,
+                                                          separate_stderr=True)
         subprocess_utils.handle_returncode(returncode, code,
                                            'Failed to sync logs.', stderr)
-
-        if not
+        job_to_dir: Dict[str, str] = message_utils.decode_payload(job_to_dir)
+        if not job_to_dir:
             logger.info(f'{colorama.Fore.YELLOW}'
                         'No matching log directories found'
                         f'{colorama.Style.RESET_ALL}')
             return {}
 
-        job_ids = list(
-
+        job_ids = list(job_to_dir.keys())
+        dirs = list(job_to_dir.values())
         remote_log_dirs = [
-
-
-
-
-            os.path.join(local_dir, run_timestamp)
-            for run_timestamp in run_timestamps
+            # TODO(aylei): backward compatibility for legacy runtime that
+            # returns run_timestamp only, remove after 0.12.0
+            (dir if constants.SKY_LOGS_DIRECTORY in dir else os.path.join(
+                constants.SKY_LOGS_DIRECTORY, dir)) for dir in dirs
         ]
+        local_log_dirs = [(dir.replace(constants.SKY_LOGS_DIRECTORY, local_dir)
+                           if constants.SKY_LOGS_DIRECTORY in dir else
+                           os.path.join(local_dir, dir)) for dir in dirs]
 
         runners = handle.get_command_runners()
 
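The sync-down hunk above replaces the old run_timestamp-only payload with a job-id-to-log-dir mapping and normalizes each entry into a remote and a local path. A small sketch of that mapping under assumed values; the constant, directories, and job entries below are made up for illustration:

import os

SKY_LOGS_DIRECTORY = '~/sky_logs'  # stand-in for constants.SKY_LOGS_DIRECTORY
local_dir = '~/Downloads/cluster-logs'

# Values may be full log dirs (new runtime) or bare run_timestamps (legacy).
job_to_dir = {
    '1': '~/sky_logs/sky-2025-06-18-00-00-00-000000',  # new format
    '2': 'sky-2025-06-17-23-00-00-000000',             # legacy format
}

dirs = list(job_to_dir.values())
remote_log_dirs = [
    d if SKY_LOGS_DIRECTORY in d else os.path.join(SKY_LOGS_DIRECTORY, d)
    for d in dirs
]
local_log_dirs = [
    d.replace(SKY_LOGS_DIRECTORY, local_dir)
    if SKY_LOGS_DIRECTORY in d else os.path.join(local_dir, d)
    for d in dirs
]

print(remote_log_dirs)  # both entries resolve under ~/sky_logs on the cluster
print(local_log_dirs)   # both entries resolve under the requested local_dir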
@@ -4033,8 +4045,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         # get the run_timestamp
         # the function takes in [job_id]
-        code = job_lib.JobLibCodeGen.
-            [str(job_id)])
+        code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs([str(job_id)])
         returncode, run_timestamps, stderr = self.run_on_head(
             handle,
             code,
@@ -5182,9 +5193,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
     def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
                                task: task_lib.Task, job_id: int,
-                               detach_run: bool) -> None:
+                               detach_run: bool, remote_log_dir: str) -> None:
         # Launch the command as a Ray task.
-        log_dir = os.path.join(
+        log_dir = os.path.join(remote_log_dir, 'tasks')
 
         resources_dict = backend_utils.get_task_demands_dict(task)
         internal_ips = handle.internal_ips()
@@ -5222,17 +5233,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             codegen.build(),
             job_id,
             detach_run=detach_run,
-            managed_job_dag=task.managed_job_dag
+            managed_job_dag=task.managed_job_dag,
+            remote_log_dir=remote_log_dir)
 
     def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
                               task: task_lib.Task, job_id: int,
-                              detach_run: bool) -> None:
+                              detach_run: bool, remote_log_dir: str) -> None:
         # Strategy:
         # ray.init(...)
         # for node:
         #   submit _run_cmd(cmd) with resource {node_i: 1}
-
-        log_dir = os.path.join(log_dir_base, 'tasks')
+        log_dir = os.path.join(remote_log_dir, 'tasks')
         resources_dict = backend_utils.get_task_demands_dict(task)
         internal_ips = handle.internal_ips()
         assert internal_ips is not None, 'internal_ips is not cached in handle'
@@ -5278,4 +5289,5 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             codegen.build(),
             job_id,
             detach_run=detach_run,
-            managed_job_dag=task.managed_job_dag
+            managed_job_dag=task.managed_job_dag,
+            remote_log_dir=remote_log_dir)