skypilot-nightly 1.0.0.dev20250413__py3-none-any.whl → 1.0.0.dev20250421__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +7 -0
  3. sky/authentication.py +2 -2
  4. sky/backends/backend_utils.py +31 -3
  5. sky/backends/cloud_vm_ray_backend.py +22 -29
  6. sky/backends/wheel_utils.py +9 -0
  7. sky/check.py +1 -1
  8. sky/cli.py +253 -74
  9. sky/client/cli.py +253 -74
  10. sky/client/common.py +10 -3
  11. sky/client/sdk.py +11 -8
  12. sky/clouds/aws.py +2 -2
  13. sky/clouds/kubernetes.py +0 -8
  14. sky/clouds/oci.py +1 -1
  15. sky/core.py +17 -11
  16. sky/dashboard/out/404.html +1 -0
  17. sky/dashboard/out/_next/static/chunks/236-d437cf66e68a6f64.js +6 -0
  18. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +15 -0
  19. sky/dashboard/out/_next/static/chunks/37-72fdc8f71d6e4784.js +6 -0
  20. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +59 -0
  21. sky/dashboard/out/_next/static/chunks/845-2ea1cc63ba1f4067.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/979-7cd0778078b9cfad.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +33 -0
  25. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/_app-3001e84c61acddfb.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b09f7fbf6d5d74f6.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b57ec043f09c5813.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ef2e0e91a9222cac.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +1 -0
  37. sky/dashboard/out/_next/static/css/f3538cd90cfca88c.css +3 -0
  38. sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_buildManifest.js +1 -0
  39. sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_ssgManifest.js +1 -0
  40. sky/dashboard/out/clusters/[cluster]/[job].html +1 -0
  41. sky/dashboard/out/clusters/[cluster].html +1 -0
  42. sky/dashboard/out/clusters.html +1 -0
  43. sky/dashboard/out/favicon.ico +0 -0
  44. sky/dashboard/out/index.html +1 -0
  45. sky/dashboard/out/jobs/[job].html +1 -0
  46. sky/dashboard/out/jobs.html +1 -0
  47. sky/dashboard/out/skypilot.svg +15 -0
  48. sky/dashboard/out/videos/cursor-small.mp4 +0 -0
  49. sky/data/data_transfer.py +2 -1
  50. sky/data/storage.py +24 -14
  51. sky/exceptions.py +5 -0
  52. sky/jobs/constants.py +8 -1
  53. sky/jobs/server/core.py +12 -8
  54. sky/models.py +28 -0
  55. sky/optimizer.py +7 -9
  56. sky/provision/kubernetes/config.py +1 -1
  57. sky/provision/kubernetes/instance.py +16 -14
  58. sky/provision/kubernetes/network_utils.py +1 -1
  59. sky/provision/kubernetes/utils.py +50 -22
  60. sky/provision/provisioner.py +2 -1
  61. sky/resources.py +56 -2
  62. sky/serve/__init__.py +2 -0
  63. sky/serve/autoscalers.py +6 -2
  64. sky/serve/client/sdk.py +61 -0
  65. sky/serve/constants.py +6 -0
  66. sky/serve/load_balancing_policies.py +0 -4
  67. sky/serve/replica_managers.py +6 -8
  68. sky/serve/serve_state.py +0 -6
  69. sky/serve/serve_utils.py +33 -1
  70. sky/serve/server/core.py +192 -7
  71. sky/serve/server/server.py +28 -0
  72. sky/server/common.py +152 -47
  73. sky/server/constants.py +7 -1
  74. sky/server/requests/executor.py +4 -0
  75. sky/server/requests/payloads.py +12 -15
  76. sky/server/requests/serializers/decoders.py +2 -5
  77. sky/server/requests/serializers/encoders.py +2 -5
  78. sky/server/server.py +44 -1
  79. sky/setup_files/MANIFEST.in +1 -0
  80. sky/setup_files/dependencies.py +1 -0
  81. sky/sky_logging.py +12 -2
  82. sky/skylet/constants.py +5 -7
  83. sky/skylet/job_lib.py +3 -3
  84. sky/skypilot_config.py +225 -84
  85. sky/templates/kubernetes-ray.yml.j2 +7 -3
  86. sky/utils/cli_utils/status_utils.py +12 -5
  87. sky/utils/config_utils.py +39 -15
  88. sky/utils/controller_utils.py +44 -7
  89. sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
  90. sky/utils/kubernetes/gpu_labeler.py +99 -16
  91. sky/utils/schemas.py +24 -0
  92. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/METADATA +2 -1
  93. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/RECORD +97 -64
  94. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/WHEEL +1 -1
  95. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/entry_points.txt +0 -0
  96. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/licenses/LICENSE +0 -0
  97. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/top_level.txt +0 -0
@@ -6,7 +6,6 @@ with the backend functions. The benefit of having the default values in the
6
6
  payloads is that a user can find the default values in the Restful API docs.
7
7
  """
8
8
  import getpass
9
- import json
10
9
  import os
11
10
  import typing
12
11
  from typing import Any, Dict, List, Optional, Tuple, Union
@@ -47,7 +46,7 @@ def request_body_env_vars() -> dict:
47
46
  # Remove the path to config file, as the config content is included in the
48
47
  # request body and will be merged with the config on the server side.
49
48
  env_vars.pop(skypilot_config.ENV_VAR_SKYPILOT_CONFIG, None)
50
- env_vars.pop(skypilot_config.ENV_VAR_USER_CONFIG, None)
49
+ env_vars.pop(skypilot_config.ENV_VAR_GLOBAL_CONFIG, None)
51
50
  env_vars.pop(skypilot_config.ENV_VAR_PROJECT_CONFIG, None)
52
51
  return env_vars
53
52
 
@@ -56,20 +55,9 @@ def get_override_skypilot_config_from_client() -> Dict[str, Any]:
56
55
  """Returns the override configs from the client."""
57
56
  config = skypilot_config.to_dict()
58
57
  # Remove the API server config, as we should not specify the SkyPilot
59
- # server endpoint on the server side. This avoids the warning below.
58
+ # server endpoint on the server side. This avoids the warning at
59
+ # server-side.
60
60
  config.pop_nested(('api_server',), default_value=None)
61
- ignored_key_values = {}
62
- for nested_key in constants.SKIPPED_CLIENT_OVERRIDE_KEYS:
63
- value = config.pop_nested(nested_key, default_value=None)
64
- if value is not None:
65
- ignored_key_values['.'.join(nested_key)] = value
66
- if ignored_key_values:
67
- logger.debug(f'The following keys ({json.dumps(ignored_key_values)}) '
68
- 'are specified in the client SkyPilot config at '
69
- f'{skypilot_config.loaded_config_path()!r}. '
70
- 'This will be ignored. If you want to specify it, '
71
- 'please modify it on server side or contact your '
72
- 'administrator.')
73
61
  return config
74
62
 
75
63
 
@@ -420,6 +408,15 @@ class ServeLogsBody(RequestBody):
420
408
  follow: bool = True
421
409
 
422
410
 
411
+ class ServeDownloadLogsBody(RequestBody):
412
+ """The request body for the serve download logs endpoint."""
413
+ service_name: str
414
+ local_dir: str
415
+ targets: Optional[Union[str, serve.ServiceComponent,
416
+ List[Union[str, serve.ServiceComponent]]]]
417
+ replica_ids: Optional[List[int]] = None
418
+
419
+
423
420
  class ServeStatusBody(RequestBody):
424
421
  """The request body for the serve status endpoint."""
425
422
  service_names: Optional[Union[str, List[str]]]
@@ -188,8 +188,5 @@ def decode_job_status(
188
188
 
189
189
  @register_decoders('kubernetes_node_info')
190
190
  def decode_kubernetes_node_info(
191
- return_value: Dict[str, Any]) -> Dict[str, models.KubernetesNodeInfo]:
192
- return {
193
- node_name: models.KubernetesNodeInfo(**node_info)
194
- for node_name, node_info in return_value.items()
195
- }
191
+ return_value: Dict[str, Any]) -> models.KubernetesNodesInfo:
192
+ return models.KubernetesNodesInfo.from_dict(return_value)
@@ -159,8 +159,5 @@ def encode_job_status(return_value: Dict[int, Any]) -> Dict[int, str]:
159
159
 
160
160
  @register_encoder('kubernetes_node_info')
161
161
  def encode_kubernetes_node_info(
162
- return_value: Dict[str, 'models.KubernetesNodeInfo']) -> Dict[str, Any]:
163
- return {
164
- node_name: dataclasses.asdict(node_info)
165
- for node_name, node_info in return_value.items()
166
- }
162
+ return_value: 'models.KubernetesNodesInfo') -> Dict[str, Any]:
163
+ return return_value.to_dict()
sky/server/server.py CHANGED
@@ -150,7 +150,21 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
150
150
  # Shutdown: Add any cleanup code here if needed
151
151
 
152
152
 
153
+ # Add a new middleware class to handle /internal/dashboard prefix
154
+ class InternalDashboardPrefixMiddleware(
155
+ starlette.middleware.base.BaseHTTPMiddleware):
156
+ """Middleware to handle /internal/dashboard prefix in requests."""
157
+
158
+ async def dispatch(self, request: fastapi.Request, call_next):
159
+ path = request.url.path
160
+ if path.startswith('/internal/dashboard/'):
161
+ # Remove /internal/dashboard prefix and update request scope
162
+ request.scope['path'] = path.replace('/internal/dashboard/', '/', 1)
163
+ return await call_next(request)
164
+
165
+
153
166
  app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
167
+ app.add_middleware(InternalDashboardPrefixMiddleware)
154
168
  app.add_middleware(
155
169
  cors.CORSMiddleware,
156
170
  # TODO(zhwu): in production deployment, we should restrict the allowed
@@ -210,7 +224,7 @@ async def kubernetes_node_info(
210
224
  request: fastapi.Request,
211
225
  kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
212
226
  ) -> None:
213
- """Gets Kubernetes node information."""
227
+ """Gets Kubernetes nodes information and hints."""
214
228
  executor.schedule_request(
215
229
  request_id=request.state.request_id,
216
230
  request_name='kubernetes_node_info',
@@ -1101,6 +1115,35 @@ async def complete_storage_name(incomplete: str,) -> List[str]:
1101
1115
  return global_user_state.get_storage_names_start_with(incomplete)
1102
1116
 
1103
1117
 
1118
+ # Add a route to serve static files
1119
+ @app.get('/{full_path:path}')
1120
+ async def serve_static_or_dashboard(full_path: str):
1121
+ """Serves static files for any unmatched routes.
1122
+
1123
+ Handles the /dashboard prefix from Next.js configuration.
1124
+ """
1125
+ # Check if the path starts with 'dashboard/' and remove it if it does
1126
+ if full_path.startswith('dashboard/'):
1127
+ full_path = full_path[len('dashboard/'):]
1128
+
1129
+ # Try to serve the file directly from the out directory first
1130
+ file_path = os.path.join(server_constants.DASHBOARD_DIR, full_path)
1131
+ if os.path.isfile(file_path):
1132
+ return fastapi.responses.FileResponse(file_path)
1133
+
1134
+ # If file not found, serve the index.html for client-side routing.
1135
+ # For example, the non-matched arbitrary route (/ or /test) from
1136
+ # client will be redirected to the index.html.
1137
+ index_path = os.path.join(server_constants.DASHBOARD_DIR, 'index.html')
1138
+ try:
1139
+ with open(index_path, 'r', encoding='utf-8') as f:
1140
+ content = f.read()
1141
+ return fastapi.responses.HTMLResponse(content=content)
1142
+ except Exception as e:
1143
+ logger.error(f'Error serving dashboard: {e}')
1144
+ raise fastapi.HTTPException(status_code=500, detail=str(e))
1145
+
1146
+
1104
1147
  if __name__ == '__main__':
1105
1148
  import uvicorn
1106
1149
 
@@ -15,3 +15,4 @@ include sky/jobs/dashboard/static/*
15
15
  include sky/templates/*
16
16
  include sky/utils/kubernetes/*
17
17
  include sky/server/html/*
18
+ recursive-include sky/dashboard/out *
@@ -53,6 +53,7 @@ install_requires = [
53
53
  'aiofiles',
54
54
  'httpx',
55
55
  'setproctitle',
56
+ 'omegaconf>=2.4.0dev3,<2.5',
56
57
  ]
57
58
 
58
59
  local_ray = [
sky/sky_logging.py CHANGED
@@ -18,6 +18,12 @@ _FORMAT = '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
18
18
  _DATE_FORMAT = '%m-%d %H:%M:%S'
19
19
  _SENSITIVE_LOGGER = ['sky.provisioner', 'sky.optimizer']
20
20
 
21
+ DEBUG = logging.DEBUG
22
+ INFO = logging.INFO
23
+ WARNING = logging.WARNING
24
+ ERROR = logging.ERROR
25
+ CRITICAL = logging.CRITICAL
26
+
21
27
 
22
28
  def _show_logging_prefix():
23
29
  return env_options.Options.SHOW_DEBUG_INFO.get(
@@ -97,8 +103,8 @@ def _setup_logger():
97
103
  def reload_logger():
98
104
  """Reload the logger.
99
105
 
100
- This is useful when the logging configuration is changed.
101
- e.g., the logging level is changed or stdout/stderr is reset.
106
+ This ensures that the logger takes the new environment variables,
107
+ such as SKYPILOT_DEBUG.
102
108
  """
103
109
  global _default_handler
104
110
  _root_logger.removeHandler(_default_handler)
@@ -127,6 +133,10 @@ def set_logging_level(logger: str, level: int):
127
133
  logger.setLevel(original_level)
128
134
 
129
135
 
136
+ def logging_enabled(logger: logging.Logger, level: int) -> bool:
137
+ return logger.level <= level
138
+
139
+
130
140
  @contextlib.contextmanager
131
141
  def silent():
132
142
  """Make all sky_logging.print() and logger.{info, warning...} silent.
sky/skylet/constants.py CHANGED
@@ -117,7 +117,7 @@ RUNPOD_DOCKER_USERNAME_ENV_VAR = 'SKYPILOT_RUNPOD_DOCKER_USERNAME'
117
117
 
118
118
  # Commands for disable GPU ECC, which can improve the performance of the GPU
119
119
  # for some workloads by 30%. This will only be applied when a user specify
120
- # `nvidia_gpus.disable_ecc: true` in ~/.sky/skyconfig.yaml.
120
+ # `nvidia_gpus.disable_ecc: true` in ~/.sky/config.yaml.
121
121
  # Running this command will reboot the machine, introducing overhead for
122
122
  # provisioning the machine.
123
123
  # https://portal.nutanix.com/page/documents/kbs/details?targetId=kA00e000000LKjOCAW
@@ -299,11 +299,6 @@ FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
299
299
  FILE_MOUNTS_SUBPATH = 'job-{run_id}/local-file-mounts/{i}'
300
300
  FILE_MOUNTS_TMP_SUBPATH = 'job-{run_id}/tmp-files'
301
301
 
302
- # The default idle timeout for SkyPilot controllers. This include jobs
303
- # controller and sky serve controller.
304
- # TODO(tian): Refactor to controller_utils. Current blocker: circular import.
305
- CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10
306
-
307
302
  # Due to the CPU/memory usage of the controller process launched with sky jobs (
308
303
  # use ray job under the hood), we need to reserve some CPU/memory for each jobs/
309
304
  # serve controller process.
@@ -337,7 +332,7 @@ RCLONE_LOG_DIR = '~/.sky/rclone_log'
337
332
  RCLONE_CACHE_DIR = '~/.cache/rclone'
338
333
  RCLONE_CACHE_REFRESH_INTERVAL = 10
339
334
 
340
- # The keys that can be overridden in the `~/.sky/skyconfig.yaml` file. The
335
+ # The keys that can be overridden in the `~/.sky/config.yaml` file. The
341
336
  # overrides are specified in task YAMLs.
342
337
  OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
343
338
  ('docker', 'run_options'),
@@ -367,3 +362,6 @@ LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
367
362
 
368
363
  # Path to the generated cluster config yamls and ssh configs.
369
364
  SKY_USER_FILE_PATH = '~/.sky/generated'
365
+
366
+ # Environment variable that is set to 'true' if this is a skypilot server.
367
+ ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
sky/skylet/job_lib.py CHANGED
@@ -630,7 +630,7 @@ def update_job_status(job_ids: List[int],
630
630
  'it to FAILED_DRIVER')
631
631
  status = JobStatus.FAILED_DRIVER
632
632
  elif job_pid < 0:
633
- # TODO(zhwu): Backward compatibility, remove after 0.9.0.
633
+ # TODO(zhwu): Backward compatibility, remove after 0.10.0.
634
634
  # We set the job status to PENDING instead of actually
635
635
  # checking ray job status and let the status in job table
636
636
  # take effect in the later max.
@@ -882,7 +882,7 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
882
882
  # child processes.
883
883
  elif job['pid'] < 0:
884
884
  try:
885
- # TODO(zhwu): Backward compatibility, remove after 0.9.0.
885
+ # TODO(zhwu): Backward compatibility, remove after 0.10.0.
886
886
  # The job was submitted with ray job submit before #4318.
887
887
  job_client = _create_ray_job_submission_client()
888
888
  job_client.stop_job(_make_ray_job_id(job['job_id']))
@@ -1008,7 +1008,7 @@ class JobLibCodeGen:
1008
1008
  # Print cancelled IDs. Caller should parse by decoding.
1009
1009
  'print(cancelled, flush=True)',
1010
1010
  ]
1011
- # TODO(zhwu): Backward compatibility, remove after 0.9.0.
1011
+ # TODO(zhwu): Backward compatibility, remove after 0.12.0.
1012
1012
  if user_hash is None:
1013
1013
  code = [
1014
1014
  (f'cancelled = job_lib.cancel_jobs_encoded_results('