skypilot-nightly 1.0.0.dev20250617__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl

This diff compares the contents of two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.
Files changed (62)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +7 -0
  3. sky/backends/cloud_vm_ray_backend.py +48 -36
  4. sky/cli.py +5 -5729
  5. sky/client/cli.py +11 -2
  6. sky/client/sdk.py +22 -2
  7. sky/clouds/kubernetes.py +5 -0
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/{vA3PPpkBwpRTRNBHFYAw_ → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
  10. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
  12. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
  14. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
  15. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  16. sky/dashboard/out/clusters/[cluster].html +1 -1
  17. sky/dashboard/out/clusters.html +1 -1
  18. sky/dashboard/out/config.html +1 -1
  19. sky/dashboard/out/index.html +1 -1
  20. sky/dashboard/out/infra/[context].html +1 -1
  21. sky/dashboard/out/infra.html +1 -1
  22. sky/dashboard/out/jobs/[job].html +1 -1
  23. sky/dashboard/out/jobs.html +1 -1
  24. sky/dashboard/out/users.html +1 -1
  25. sky/dashboard/out/workspace/new.html +1 -1
  26. sky/dashboard/out/workspaces/[name].html +1 -1
  27. sky/dashboard/out/workspaces.html +1 -1
  28. sky/global_user_state.py +50 -11
  29. sky/logs/__init__.py +17 -0
  30. sky/logs/agent.py +73 -0
  31. sky/logs/gcp.py +91 -0
  32. sky/models.py +1 -0
  33. sky/provision/instance_setup.py +35 -0
  34. sky/provision/provisioner.py +11 -0
  35. sky/server/common.py +21 -9
  36. sky/server/requests/payloads.py +19 -1
  37. sky/server/server.py +121 -29
  38. sky/setup_files/dependencies.py +11 -1
  39. sky/skylet/constants.py +9 -1
  40. sky/skylet/job_lib.py +75 -19
  41. sky/templates/kubernetes-ray.yml.j2 +9 -0
  42. sky/users/permission.py +49 -19
  43. sky/users/rbac.py +10 -1
  44. sky/users/server.py +274 -9
  45. sky/utils/schemas.py +40 -0
  46. {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
  47. {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +58 -54
  48. sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
  49. sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
  50. sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
  51. sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
  52. /sky/dashboard/out/_next/static/{vA3PPpkBwpRTRNBHFYAw_ → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
  53. /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
  54. /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
  55. /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
  56. /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
  57. /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
  58. /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
  59. {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
  60. {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
  61. {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
  62. {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = 'fa78e63ee618b8695df1bca87911a231cce3d7da'
+_SKYPILOT_COMMIT_SHA = '903f8a7f3955084316b26af619b6b043f43de01c'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250617'
+__version__ = '1.0.0.dev20250618'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/backend_utils.py CHANGED
@@ -28,6 +28,7 @@ from sky import check as sky_check
 from sky import clouds
 from sky import exceptions
 from sky import global_user_state
+from sky import logs
 from sky import provision as provision_lib
 from sky import sky_logging
 from sky import skypilot_config
@@ -660,6 +661,12 @@ def write_cluster_config(
 
     credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
 
+    logging_agent = logs.get_logging_agent()
+    if logging_agent:
+        for k, v in logging_agent.get_credential_file_mounts().items():
+            assert k not in credentials, f'{k} already in credentials'
+            credentials[k] = v
+
     private_key_path, _ = auth.get_or_generate_keys()
     auth_config = {'ssh_private_key': private_key_path}
     region_name = resources_vars.get('region')
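Note: the block added to write_cluster_config merges the credential file mounts required by any configured logging agent into the cluster's credential mounts, asserting that no mount path collides with an existing cloud credential. A minimal sketch of the merge semantics, with hypothetical mount paths (only logs.get_logging_agent() and get_credential_file_mounts() come from the diff):

    # Sketch: fold logging-agent credential mounts into the cluster's mounts.
    # Keys are remote paths, values are local paths; a duplicate key would
    # silently overwrite a cloud credential, hence the assert.
    credentials = {'~/.aws/credentials': '~/.aws/credentials'}  # from clouds
    agent_mounts = {'~/.fluent-bit/key.json': '/tmp/key.json'}  # hypothetical
    for k, v in agent_mounts.items():
        assert k not in credentials, f'{k} already in credentials'
        credentials[k] = v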
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -142,6 +142,7 @@ _MAX_RAY_UP_RETRY = 5
 _MAX_GET_ZONE_RETRY = 3
 
 _JOB_ID_PATTERN = re.compile(r'Job ID: ([0-9]+)')
+_LOG_DIR_PATTERN = re.compile(r'Log Dir: ([^ ]+)')
 
 # Path to the monkey-patched ray up script.
 # We don't do import then __file__ because that script needs to be filled in
@@ -3461,10 +3462,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_id: int,
         detach_run: bool = False,
         managed_job_dag: Optional['dag.Dag'] = None,
+        remote_log_dir: Optional[str] = None,
     ) -> None:
         """Executes generated code on the head node."""
         script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
-        remote_log_dir = self.log_dir
+        if remote_log_dir is None:
+            remote_log_dir = self.log_dir
         remote_log_path = os.path.join(remote_log_dir, 'run.log')
 
         cd = f'cd {SKY_REMOTE_WORKDIR}'
@@ -3583,13 +3586,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             self.tail_logs(handle, job_id)
 
     def _add_job(self, handle: CloudVmRayResourceHandle,
-                 job_name: Optional[str], resources_str: str) -> int:
+                 job_name: Optional[str],
+                 resources_str: str) -> Tuple[int, str]:
         code = job_lib.JobLibCodeGen.add_job(
             job_name=job_name,
             username=common_utils.get_user_hash(),
             run_timestamp=self.run_timestamp,
             resources_str=resources_str)
-        returncode, job_id_str, stderr = self.run_on_head(handle,
+        returncode, result_str, stderr = self.run_on_head(handle,
                                                           code,
                                                           stream_logs=False,
                                                           require_outputs=True,
@@ -3603,17 +3607,23 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         subprocess_utils.handle_returncode(returncode, code,
                                            'Failed to fetch job id.', stderr)
         try:
-            job_id_match = _JOB_ID_PATTERN.search(job_id_str)
+            job_id_match = _JOB_ID_PATTERN.search(result_str)
             if job_id_match is not None:
                 job_id = int(job_id_match.group(1))
             else:
                 # For backward compatibility.
-                job_id = int(job_id_str)
+                job_id = int(result_str)
+            log_dir_match = _LOG_DIR_PATTERN.search(result_str)
+            if log_dir_match is not None:
+                log_dir = log_dir_match.group(1).strip()
+            else:
+                # For backward compatibility, use the same log dir as local.
+                log_dir = self.log_dir
         except ValueError as e:
             logger.error(stderr)
-            raise ValueError(f'Failed to parse job id: {job_id_str}; '
+            raise ValueError(f'Failed to parse job id: {result_str}; '
                              f'Returncode: {returncode}') from e
-        return job_id
+        return job_id, log_dir
 
     def _execute(
         self,
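With this change the head node is expected to print both 'Job ID: <id>' and 'Log Dir: <path>' when a job is added, and _add_job returns a (job_id, log_dir) tuple, falling back to the legacy behavior (bare job id, local-style log dir) when talking to an older runtime. A self-contained sketch of that parse under those assumptions (parse_add_job_output and the sample string are hypothetical; the two regexes are from the diff):

    import re

    _JOB_ID_PATTERN = re.compile(r'Job ID: ([0-9]+)')
    _LOG_DIR_PATTERN = re.compile(r'Log Dir: ([^ ]+)')

    def parse_add_job_output(result_str: str, default_log_dir: str):
        """Returns (job_id, log_dir), tolerating legacy head-node output."""
        job_id_match = _JOB_ID_PATTERN.search(result_str)
        # Legacy runtimes print only the bare job id.
        job_id = int(job_id_match.group(1)) if job_id_match else int(result_str)
        log_dir_match = _LOG_DIR_PATTERN.search(result_str)
        # Legacy runtimes print no log dir; fall back to the local-style dir.
        log_dir = (log_dir_match.group(1).strip()
                   if log_dir_match else default_log_dir)
        return job_id, log_dir

    # Prints (7, 'sky_logs/sky-2025-06-18'):
    print(parse_add_job_output('Job ID: 7 Log Dir: sky_logs/sky-2025-06-18',
                               'sky_logs/fallback'))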
@@ -3665,15 +3675,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             logger.info(f'Dryrun complete. Would have run:\n{task}')
             return None
 
-        job_id = self._add_job(handle, task_copy.name, resources_str)
+        job_id, log_dir = self._add_job(handle, task_copy.name, resources_str)
 
         num_actual_nodes = task.num_nodes * handle.num_ips_per_node
         # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
         if num_actual_nodes > 1:
-            self._execute_task_n_nodes(handle, task_copy, job_id, detach_run)
+            self._execute_task_n_nodes(handle, task_copy, job_id, detach_run,
+                                       log_dir)
         else:
             # Case: task_lib.Task(run, num_nodes=1)
-            self._execute_task_one_node(handle, task_copy, job_id, detach_run)
+            self._execute_task_one_node(handle, task_copy, job_id, detach_run,
+                                        log_dir)
 
         return job_id
 
@@ -3836,32 +3848,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(job_ids)
-        returncode, run_timestamps, stderr = self.run_on_head(
-            handle,
-            code,
-            stream_logs=False,
-            require_outputs=True,
-            separate_stderr=True)
+        code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
+        returncode, job_to_dir, stderr = self.run_on_head(handle,
+                                                          code,
+                                                          stream_logs=False,
+                                                          require_outputs=True,
+                                                          separate_stderr=True)
         subprocess_utils.handle_returncode(returncode, code,
                                            'Failed to sync logs.', stderr)
-        run_timestamps = message_utils.decode_payload(run_timestamps)
-        if not run_timestamps:
+        job_to_dir: Dict[str, str] = message_utils.decode_payload(job_to_dir)
+        if not job_to_dir:
             logger.info(f'{colorama.Fore.YELLOW}'
                         'No matching log directories found'
                         f'{colorama.Style.RESET_ALL}')
             return {}
 
-        job_ids = list(run_timestamps.keys())
-        run_timestamps = list(run_timestamps.values())
+        job_ids = list(job_to_dir.keys())
+        dirs = list(job_to_dir.values())
         remote_log_dirs = [
-            os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp)
-            for run_timestamp in run_timestamps
-        ]
-        local_log_dirs = [
-            os.path.join(local_dir, run_timestamp)
-            for run_timestamp in run_timestamps
+            # TODO(aylei): backward compatibility for legacy runtime that
+            # returns run_timestamp only, remove after 0.12.0
+            (dir if constants.SKY_LOGS_DIRECTORY in dir else os.path.join(
+                constants.SKY_LOGS_DIRECTORY, dir)) for dir in dirs
         ]
+        local_log_dirs = [(dir.replace(constants.SKY_LOGS_DIRECTORY, local_dir)
+                           if constants.SKY_LOGS_DIRECTORY in dir else
+                           os.path.join(local_dir, dir)) for dir in dirs]
 
         runners = handle.get_command_runners()
 
@@ -4033,8 +4045,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         # get the run_timestamp
         # the function takes in [job_id]
-        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
-            [str(job_id)])
+        code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs([str(job_id)])
         returncode, run_timestamps, stderr = self.run_on_head(
             handle,
             code,
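The two hunks above replace run_timestamp globbing with a job-id-to-log-dir payload (get_log_dirs_for_jobs): new runtimes return full paths under constants.SKY_LOGS_DIRECTORY, while legacy runtimes return bare run_timestamps. A sketch of the dual-format normalization, assuming the constant's value is '~/sky_logs' (resolve_log_dirs and the sample payloads are hypothetical):

    import os

    SKY_LOGS_DIRECTORY = '~/sky_logs'  # assumed value of the constant

    def resolve_log_dirs(job_to_dir, local_dir):
        """Maps each returned dir to (remote, local), old or new format."""
        dirs = list(job_to_dir.values())
        remote = [d if SKY_LOGS_DIRECTORY in d else
                  os.path.join(SKY_LOGS_DIRECTORY, d) for d in dirs]
        local = [d.replace(SKY_LOGS_DIRECTORY, local_dir)
                 if SKY_LOGS_DIRECTORY in d else
                 os.path.join(local_dir, d) for d in dirs]
        return remote, local

    # Legacy payload (job '1') vs. new full-path payload (job '2'):
    print(resolve_log_dirs({'1': 'sky-2025-06-18-abc',
                            '2': '~/sky_logs/managed/2'}, '/tmp/logs'))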
@@ -5182,9 +5193,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
     def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
                                task: task_lib.Task, job_id: int,
-                               detach_run: bool) -> None:
+                               detach_run: bool, remote_log_dir: str) -> None:
         # Launch the command as a Ray task.
-        log_dir = os.path.join(self.log_dir, 'tasks')
+        log_dir = os.path.join(remote_log_dir, 'tasks')
 
         resources_dict = backend_utils.get_task_demands_dict(task)
         internal_ips = handle.internal_ips()
@@ -5222,17 +5233,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             codegen.build(),
             job_id,
             detach_run=detach_run,
-            managed_job_dag=task.managed_job_dag)
+            managed_job_dag=task.managed_job_dag,
+            remote_log_dir=remote_log_dir)
 
     def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
                               task: task_lib.Task, job_id: int,
-                              detach_run: bool) -> None:
+                              detach_run: bool, remote_log_dir: str) -> None:
         # Strategy:
         #   ray.init(...)
         #   for node:
         #       submit _run_cmd(cmd) with resource {node_i: 1}
-        log_dir_base = self.log_dir
-        log_dir = os.path.join(log_dir_base, 'tasks')
+        log_dir = os.path.join(remote_log_dir, 'tasks')
         resources_dict = backend_utils.get_task_demands_dict(task)
         internal_ips = handle.internal_ips()
         assert internal_ips is not None, 'internal_ips is not cached in handle'
@@ -5278,4 +5289,5 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             codegen.build(),
             job_id,
             detach_run=detach_run,
-            managed_job_dag=task.managed_job_dag)
+            managed_job_dag=task.managed_job_dag,
+            remote_log_dir=remote_log_dir)