skypilot-nightly 1.0.0.dev20250621__py3-none-any.whl → 1.0.0.dev20250624__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +16 -5
  3. sky/backends/__init__.py +2 -1
  4. sky/backends/backend_utils.py +12 -0
  5. sky/backends/cloud_vm_ray_backend.py +36 -13
  6. sky/client/cli/command.py +42 -21
  7. sky/client/sdk.py +12 -6
  8. sky/clouds/kubernetes.py +1 -0
  9. sky/core.py +88 -15
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +6 -0
  12. sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +6 -0
  13. sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/938-ce7991c156584b06.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-ce31493da9747ef4.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +6 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +6 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters-7e9736af1c6345a6.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-cf490d1fa38f3740.js → [job]-171c27f4ca94861c.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-ecc5a7003776cfa7.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +1 -0
  32. sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +3 -0
  33. sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +1 -0
  34. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  35. sky/dashboard/out/clusters/[cluster].html +1 -1
  36. sky/dashboard/out/clusters.html +1 -1
  37. sky/dashboard/out/config.html +1 -1
  38. sky/dashboard/out/index.html +1 -1
  39. sky/dashboard/out/infra/[context].html +1 -1
  40. sky/dashboard/out/infra.html +1 -1
  41. sky/dashboard/out/jobs/[job].html +1 -1
  42. sky/dashboard/out/jobs.html +1 -1
  43. sky/dashboard/out/users.html +1 -1
  44. sky/dashboard/out/workspace/new.html +1 -1
  45. sky/dashboard/out/workspaces/[name].html +1 -1
  46. sky/dashboard/out/workspaces.html +1 -1
  47. sky/exceptions.py +11 -0
  48. sky/global_user_state.py +134 -20
  49. sky/jobs/client/sdk.py +0 -1
  50. sky/jobs/controller.py +5 -1
  51. sky/jobs/scheduler.py +4 -3
  52. sky/jobs/server/core.py +117 -51
  53. sky/jobs/state.py +15 -0
  54. sky/jobs/utils.py +114 -8
  55. sky/resources.py +1 -1
  56. sky/server/requests/payloads.py +6 -3
  57. sky/server/requests/requests.py +24 -1
  58. sky/server/server.py +4 -3
  59. sky/skylet/constants.py +5 -11
  60. sky/task.py +1 -26
  61. sky/templates/jobs-controller.yaml.j2 +12 -1
  62. sky/templates/kubernetes-ray.yml.j2 +1 -1
  63. sky/utils/admin_policy_utils.py +5 -1
  64. sky/utils/cli_utils/status_utils.py +25 -17
  65. sky/utils/command_runner.py +118 -12
  66. sky/utils/command_runner.pyi +57 -0
  67. sky/utils/common_utils.py +9 -1
  68. sky/utils/controller_utils.py +1 -2
  69. sky/utils/schemas.py +34 -35
  70. {skypilot_nightly-1.0.0.dev20250621.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/METADATA +1 -1
  71. {skypilot_nightly-1.0.0.dev20250621.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/RECORD +78 -77
  72. sky/dashboard/out/_next/static/PZWXta2b3IpViuIKI97hg/_buildManifest.js +0 -1
  73. sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
  74. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
  75. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
  76. sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
  77. sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
  78. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
  79. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
  80. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
  81. sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
  82. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
  83. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
  87. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
  88. sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
  89. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
  90. /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-bde186946d353355.js} +0 -0
  91. /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-56412c7976b4655b.js} +0 -0
  92. /sky/dashboard/out/_next/static/{PZWXta2b3IpViuIKI97hg → zsALxITkbP8J8NVwSDwMo}/_ssgManifest.js +0 -0
  93. {skypilot_nightly-1.0.0.dev20250621.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/WHEEL +0 -0
  94. {skypilot_nightly-1.0.0.dev20250621.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/entry_points.txt +0 -0
  95. {skypilot_nightly-1.0.0.dev20250621.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/licenses/LICENSE +0 -0
  96. {skypilot_nightly-1.0.0.dev20250621.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -5,6 +5,7 @@ jobs.constants.MANAGED_JOBS_VERSION and handle the API change in the
5
5
  ManagedJobCodeGen.
6
6
  """
7
7
  import collections
8
+ import datetime
8
9
  import enum
9
10
  import os
10
11
  import pathlib
@@ -33,7 +34,10 @@ from sky.skylet import constants
33
34
  from sky.skylet import job_lib
34
35
  from sky.skylet import log_lib
35
36
  from sky.usage import usage_lib
37
+ from sky.utils import annotations
38
+ from sky.utils import command_runner
36
39
  from sky.utils import common_utils
40
+ from sky.utils import controller_utils
37
41
  from sky.utils import infra_utils
38
42
  from sky.utils import log_utils
39
43
  from sky.utils import message_utils
@@ -124,6 +128,114 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
124
128
  time.sleep(backoff.current_backoff())
125
129
 
126
130
 
131
def _check_consolidation_mode_consistency(
        current_is_consolidation_mode: bool) -> None:
    """Verify that the configured consolidation mode matches existing state.

    Args:
        current_is_consolidation_mode: Whether consolidation mode is enabled
            in the current configuration.

    Raises:
        exceptions.InconsistentConsolidationModeError: If consolidation mode
            is enabled while a cluster entry for the jobs controller is still
            found, or disabled while non-terminal managed jobs are found.
    """
    if current_is_consolidation_mode:
        # Enabled: a controller cluster that is still registered means the
        # mode was switched on without terminating the controller first.
        controller_name = (
            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
        controller_record = global_user_state.get_cluster_from_name(
            controller_name)
        if controller_record is not None:
            with ux_utils.print_exception_no_traceback():
                raise exceptions.InconsistentConsolidationModeError(
                    f'{colorama.Fore.RED}Consolidation mode is '
                    f'enabled, but the controller cluster '
                    f'{controller_name} is still running. Please '
                    'terminate the controller cluster first.'
                    f'{colorama.Style.RESET_ALL}')
        return

    # Disabled: any job returned here is treated as left over from a previous
    # consolidation-mode run.
    all_jobs = managed_job_state.get_managed_jobs()
    if not all_jobs:
        return

    active_job_ids = managed_job_state.get_nonterminal_job_ids_by_name(
        None, all_users=True)
    if active_job_ids:
        with ux_utils.print_exception_no_traceback():
            raise exceptions.InconsistentConsolidationModeError(
                f'{colorama.Fore.RED}Consolidation mode '
                'is disabled, but there are still '
                f'{len(active_job_ids)} managed jobs '
                'running. Please terminate those jobs '
                f'first.{colorama.Style.RESET_ALL}')
    else:
        # Only finished jobs remain: warn instead of failing.
        logger.warning(
            f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
            f'but there are {len(all_jobs)} jobs from previous '
            'consolidation mode. Reset the `jobs.controller.'
            'consolidation_mode` to `true` and run `sky jobs queue` '
            'to see those jobs. Switching to normal mode will '
            f'lose the job history.{colorama.Style.RESET_ALL}')
168
+
169
+
170
# Consolidation mode: when enabled, the managed jobs controller does not run
# on a separate cluster but locally on the API Server. Under the hood, the job
# monitoring logic is submitted as processes directly in the API Server.
# The result is LRU-cached so that the check is only done once.
@annotations.lru_cache(scope='request', maxsize=1)
def is_consolidation_mode() -> bool:
    """Return the `jobs.controller.consolidation_mode` config value.

    The value defaults to False when unset. Before returning, it is passed
    to `_check_consolidation_mode_consistency`.
    """
    enabled = skypilot_config.get_nested(
        ('jobs', 'controller', 'consolidation_mode'), default_value=False)
    _check_consolidation_mode_consistency(enabled)
    return enabled
181
+
182
+
183
def get_ha_dump_script_path(job_id: int) -> pathlib.Path:
    """Return the resolved path of the HA dump script for `job_id`.

    The script is named `sky_job_<job_id>` and lives under
    `constants.PERSISTENT_RUN_SCRIPT_DIR` (with `~` expanded).
    """
    script_dir = pathlib.Path(constants.PERSISTENT_RUN_SCRIPT_DIR)
    return script_dir.expanduser().resolve() / f'sky_job_{job_id}'
187
+
188
+
189
def ha_recovery_for_consolidation_mode():
    """Recovery logic for HA mode when running in consolidation mode.

    Iterates over all managed job records. A job whose controller process is
    still alive is skipped. Any other job whose schedule state is neither
    DONE nor WAITING has its persisted dump script re-run, provided the
    script file exists. Progress is written to
    `constants.HA_PERSISTENT_RECOVERY_LOG_PATH`, which is opened in write
    mode and therefore overwritten on every call.
    """
    # No setup recovery is needed in consolidation mode, as the API server
    # already has all runtime installed. Directly start jobs recovery here.
    # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
    runner = command_runner.LocalProcessCommandRunner()
    # Jobs in these schedule states are not re-run.
    no_rerun_states = [
        managed_job_state.ManagedJobScheduleState.DONE,
        managed_job_state.ManagedJobScheduleState.WAITING,
    ]
    with open(constants.HA_PERSISTENT_RECOVERY_LOG_PATH, 'w',
              encoding='utf-8') as f:
        start = time.time()
        f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
        for job in managed_job_state.get_managed_jobs():
            job_id = job['job_id']
            controller_pid = job['controller_pid']

            # In consolidation mode, it is possible that only the API server
            # process is restarted, and the controller process is not. In such
            # case, we don't need to do anything and the controller process
            # will just keep running.
            if controller_pid is not None:
                try:
                    if _controller_process_alive(controller_pid, job_id):
                        f.write(f'Controller pid {controller_pid} for '
                                f'job {job_id} is still running. '
                                'Skipping recovery.\n')
                        continue
                except Exception as e:  # pylint: disable=broad-except
                    # _controller_process_alive may raise if psutil fails; we
                    # should not crash the recovery logic because of this.
                    # Record the cause so the failure can be diagnosed from
                    # the recovery log, then fall through to recovery.
                    f.write('Error checking controller pid '
                            f'{controller_pid} for job {job_id}: '
                            f'{type(e).__name__}: {e}\n')

            if job['schedule_state'] in no_rerun_states:
                continue

            dump_script_path = get_ha_dump_script_path(job_id)
            if not dump_script_path.exists():
                f.write(f'Job {job_id}\'s recovery file ({dump_script_path}'
                        ') does not exist. Skipping recovery. Job '
                        f'schedule state: {job["schedule_state"]}\n')
                continue
            with open(dump_script_path, 'r', encoding='utf-8') as script_f:
                script = script_f.read()
            runner.run(script)
            f.write(f'Job {job_id} (file: {dump_script_path}) completed '
                    f'recovery at {datetime.datetime.now()}\n')
        f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
        f.write(f'Total recovery time: {time.time() - start} seconds\n')
237
+
238
+
127
239
  def get_job_status(backend: 'backends.CloudVmRayBackend',
128
240
  cluster_name: str) -> Optional['job_lib.JobStatus']:
129
241
  """Check the status of the job running on a managed job cluster.
@@ -157,9 +269,8 @@ def _controller_process_alive(pid: int, job_id: int) -> bool:
157
269
  """Check if the controller process is alive."""
158
270
  try:
159
271
  process = psutil.Process(pid)
160
- # The last two args of the command line should be --job-id <id>
161
- job_args = process.cmdline()[-2:]
162
- return process.is_running() and job_args == ['--job-id', str(job_id)]
272
+ cmd_str = ' '.join(process.cmdline())
273
+ return process.is_running() and f'--job-id {job_id}' in cmd_str
163
274
  except psutil.NoSuchProcess:
164
275
  return False
165
276
 
@@ -1136,7 +1247,6 @@ def format_job_table(
1136
1247
  'TASK',
1137
1248
  *(['WORKSPACE'] if show_workspace else []),
1138
1249
  'NAME',
1139
- 'PRIORITY',
1140
1250
  *user_cols,
1141
1251
  'REQUESTED',
1142
1252
  'SUBMITTED',
@@ -1208,7 +1318,6 @@ def format_job_table(
1208
1318
  submitted_at = None
1209
1319
  end_at: Optional[int] = 0
1210
1320
  recovery_cnt = 0
1211
- priority = job_tasks[0].get('priority', '-')
1212
1321
  managed_job_status, current_task_id = _get_job_status_from_tasks(
1213
1322
  job_tasks)
1214
1323
  for task in job_tasks:
@@ -1244,7 +1353,6 @@ def format_job_table(
1244
1353
  '',
1245
1354
  *([''] if show_workspace else []),
1246
1355
  job_name,
1247
- str(priority),
1248
1356
  *user_values,
1249
1357
  '-',
1250
1358
  submitted,
@@ -1275,13 +1383,11 @@ def format_job_table(
1275
1383
  submitted = log_utils.readable_time_duration(task['submitted_at'])
1276
1384
  user_values = get_user_column_values(task)
1277
1385
  task_workspace = '-' if len(job_tasks) > 1 else workspace
1278
- priority = task.get('priority', '-')
1279
1386
  values = [
1280
1387
  task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
1281
1388
  task['task_id'] if len(job_tasks) > 1 else '-',
1282
1389
  *([task_workspace] if show_workspace else []),
1283
1390
  task['task_name'],
1284
- str(priority),
1285
1391
  *user_values,
1286
1392
  task['resources'],
1287
1393
  # SUBMITTED
sky/resources.py CHANGED
@@ -2296,7 +2296,7 @@ def parse_time_minutes(time: str) -> int:
2296
2296
  def parse_memory_resource(resource_qty_str: Union[str, int, float],
2297
2297
  field_name: str,
2298
2298
  ret_type: type = int,
2299
- unit: str = 'g',
2299
+ unit: str = 'gb',
2300
2300
  allow_plus: bool = False,
2301
2301
  allow_x: bool = False,
2302
2302
  allow_rounding: bool = False) -> str:
@@ -5,7 +5,6 @@ kwargs for the payloads, otherwise, we have to keep the default values the sync
5
5
  with the backend functions. The benefit of having the default values in the
6
6
  payloads is that a user can find the default values in the Restful API docs.
7
7
  """
8
- import getpass
9
8
  import os
10
9
  import typing
11
10
  from typing import Any, Dict, List, Optional, Tuple, Union
@@ -58,8 +57,7 @@ def request_body_env_vars() -> dict:
58
57
  if common.is_api_server_local() and env_var in EXTERNAL_LOCAL_ENV_VARS:
59
58
  env_vars[env_var] = os.environ[env_var]
60
59
  env_vars[constants.USER_ID_ENV_VAR] = common_utils.get_user_hash()
61
- env_vars[constants.USER_ENV_VAR] = os.getenv(constants.USER_ENV_VAR,
62
- getpass.getuser())
60
+ env_vars[constants.USER_ENV_VAR] = common_utils.get_current_user_name()
63
61
  env_vars[
64
62
  usage_constants.USAGE_RUN_ID_ENV_VAR] = usage_lib.messages.usage.run_id
65
63
  # Remove the path to config file, as the config content is included in the
@@ -613,3 +611,8 @@ class UpdateConfigBody(RequestBody):
613
611
  class GetConfigBody(RequestBody):
614
612
  """The request body for getting the entire SkyPilot configuration."""
615
613
  pass
614
+
615
+
616
class CostReportBody(RequestBody):
    """The request body for the cost report endpoint."""
    # Presumably the look-back window of the report, in days; the default is
    # 30 -- TODO confirm the exact semantics against the handler.
    days: Optional[int] = 30
@@ -327,6 +327,26 @@ def refresh_cluster_status_event():
327
327
  time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
328
328
 
329
329
 
330
def managed_job_status_refresh_event():
    """Refresh the managed job status for controller consolidation mode.

    Returns immediately when consolidation mode is off. Otherwise it runs HA
    recovery first if high availability is specified for the jobs
    controller, then loops forever: sleep for
    `events.EVENT_CHECKING_INTERVAL_SECONDS`, then run a `ManagedJobEvent`.
    """
    # pylint: disable=import-outside-toplevel
    from sky.jobs import utils as managed_job_utils
    if not managed_job_utils.is_consolidation_mode():
        return

    # Recovery runs before the event loop starts because the two conflict.
    # Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
    from sky.utils import controller_utils
    jobs_controller_name = (
        controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
    if controller_utils.high_availability_specified(jobs_controller_name):
        managed_job_utils.ha_recovery_for_consolidation_mode()

    # After recovery, start the event loop.
    from sky.skylet import events
    job_event = events.ManagedJobEvent()
    while True:
        time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
        job_event.run()
348
+
349
+
330
350
  @dataclasses.dataclass
331
351
  class InternalRequestDaemon:
332
352
  id: str
@@ -341,7 +361,10 @@ INTERNAL_REQUEST_DAEMONS = [
341
361
  # cluster being stopped or down when `sky status -r` is called.
342
362
  InternalRequestDaemon(id='skypilot-status-refresh-daemon',
343
363
  name='status',
344
- event_fn=refresh_cluster_status_event)
364
+ event_fn=refresh_cluster_status_event),
365
+ InternalRequestDaemon(id='managed-job-status-refresh-daemon',
366
+ name='managed-job-status',
367
+ event_fn=managed_job_status_refresh_event),
345
368
  ]
346
369
 
347
370
 
sky/server/server.py CHANGED
@@ -1044,13 +1044,14 @@ async def download(download_body: payloads.DownloadBody) -> None:
1044
1044
  detail=f'Error creating zip file: {str(e)}')
1045
1045
 
1046
1046
 
1047
- @app.get('/cost_report')
1048
- async def cost_report(request: fastapi.Request) -> None:
1047
+ @app.post('/cost_report')
1048
+ async def cost_report(request: fastapi.Request,
1049
+ cost_report_body: payloads.CostReportBody) -> None:
1049
1050
  """Gets the cost report of a cluster."""
1050
1051
  executor.schedule_request(
1051
1052
  request_id=request.state.request_id,
1052
1053
  request_name='cost_report',
1053
- request_body=payloads.RequestBody(),
1054
+ request_body=cost_report_body,
1054
1055
  func=core.cost_report,
1055
1056
  schedule_type=requests_lib.ScheduleType.SHORT,
1056
1057
  )
sky/skylet/constants.py CHANGED
@@ -401,6 +401,8 @@ PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
401
401
  PERSISTENT_RUN_RESTARTING_SIGNAL_FILE = (
402
402
  '~/.sky/.controller_recovery_restarting_signal')
403
403
 
404
+ HA_PERSISTENT_RECOVERY_LOG_PATH = '/tmp/ha_recovery.log'
405
+
404
406
  # The placeholder for the local skypilot config path in file mounts for
405
407
  # controllers.
406
408
  LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
@@ -436,30 +438,20 @@ LOGGING_CONFIG_DIR = '~/.sky/logging'
436
438
 
437
439
  # Resources constants
438
440
  TIME_UNITS = {
439
- 's': 1 / 60,
440
- 'sec': 1 / 60,
441
441
  'm': 1,
442
- 'min': 1,
443
442
  'h': 60,
444
- 'hr': 60,
445
443
  'd': 24 * 60,
446
- 'day': 24 * 60,
444
+ 'w': 7 * 24 * 60,
447
445
  }
448
446
 
449
447
  TIME_PATTERN: str = (
450
448
  f'^[0-9]+({"|".join([unit.lower() for unit in TIME_UNITS])})?$/i')
451
449
 
452
450
  MEMORY_SIZE_UNITS = {
453
- 'b': 1,
454
- 'k': 2**10,
455
451
  'kb': 2**10,
456
- 'm': 2**20,
457
452
  'mb': 2**20,
458
- 'g': 2**30,
459
453
  'gb': 2**30,
460
- 't': 2**40,
461
454
  'tb': 2**40,
462
- 'p': 2**50,
463
455
  'pb': 2**50,
464
456
  }
465
457
 
@@ -472,3 +464,5 @@ MEMORY_SIZE_PLUS_PATTERN = f'{MEMORY_SIZE_PATTERN[:-3]}+?$/i'
472
464
  MIN_PRIORITY = -1000
473
465
  MAX_PRIORITY = 1000
474
466
  DEFAULT_PRIORITY = 0
467
+
468
+ COST_REPORT_DEFAULT_DAYS = 30
sky/task.py CHANGED
@@ -342,8 +342,7 @@ class Task:
342
342
  self.resources: Union[List[sky.Resources],
343
343
  Set[sky.Resources]] = {sky.Resources()}
344
344
  self._service: Optional[service_spec.SkyServiceSpec] = None
345
- # The priority of the managed job running this task.
346
- self._job_priority: Optional[int] = None
345
+
347
346
  # Resources that this task cannot run on.
348
347
  self.blocked_resources = blocked_resources
349
348
 
@@ -712,10 +711,6 @@ class Task:
712
711
  service = service_spec.SkyServiceSpec.from_yaml_config(service)
713
712
  task.set_service(service)
714
713
 
715
- job = config.pop('job', None)
716
- if job is not None and 'priority' in job:
717
- task.set_job_priority(job['priority'])
718
-
719
714
  assert not config, f'Invalid task args: {config.keys()}'
720
715
  return task
721
716
 
@@ -976,23 +971,6 @@ class Task:
976
971
  self._service = service
977
972
  return self
978
973
 
979
- @property
980
- def job_priority(self) -> Optional[int]:
981
- """The priority of the managed job running this task."""
982
- return self._job_priority
983
-
984
- def set_job_priority(self, priority: int) -> 'Task':
985
- """Sets the job priority for this task.
986
-
987
- Args:
988
- priority: an integer between 0 and 1000.
989
-
990
- Returns:
991
- self: The current task, with job priority set.
992
- """
993
- self._job_priority = priority
994
- return self
995
-
996
974
  def set_time_estimator(self, func: Callable[['sky.Resources'],
997
975
  int]) -> 'Task':
998
976
  """Sets a func mapping resources to estimated time (secs).
@@ -1436,9 +1414,6 @@ class Task:
1436
1414
  if self.service is not None:
1437
1415
  add_if_not_none('service', self.service.to_yaml_config())
1438
1416
 
1439
- if self.job_priority is not None:
1440
- add_if_not_none('job', {'priority': self.job_priority})
1441
-
1442
1417
  add_if_not_none('num_nodes', self.num_nodes)
1443
1418
 
1444
1419
  if self.inputs is not None:
@@ -31,7 +31,9 @@ setup: |
31
31
  {% endif %}
32
32
 
33
33
  run: |
34
+ {%- if consolidation_mode_job_id is none %}
34
35
  {{ sky_activate_python_env }}
36
+ {%- endif %}
35
37
 
36
38
  # Write env vars to a file
37
39
  {%- for env_name, env_value in controller_envs.items() %}
@@ -42,9 +44,18 @@ run: |
42
44
  # Note: The job is already in the `spot` table, marked as PENDING.
43
45
  # CloudVmRayBackend._exec_code_on_head() calls
44
46
  # managed_job_codegen.set_pending() before we get here.
45
- python -u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
47
+ {%- if consolidation_mode_job_id is not none %}
48
+ {{sky_python_cmd}} \
49
+ {%- else %}
50
+ python \
51
+ {%- endif %}
52
+ -u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
46
53
  --user-yaml-path {{remote_original_user_yaml_path}} \
54
+ {%- if consolidation_mode_job_id is not none %}
55
+ --job-id {{consolidation_mode_job_id}} \
56
+ {%- else %}
47
57
  --job-id $SKYPILOT_INTERNAL_JOB_ID \
58
+ {%- endif %}
48
59
  --env-file {{remote_env_file_path}} \
49
60
  --priority {{priority}}
50
61
 
@@ -641,7 +641,7 @@ available_node_types:
641
641
  {% if high_availability %}
642
642
  mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
643
643
  if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
644
- SKYPILOT_HA_RECOVERY_LOG="/tmp/ha_recovery.log"
644
+ SKYPILOT_HA_RECOVERY_LOG="{{ha_recovery_log_path}}"
645
645
  echo "Starting HA recovery at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
646
646
  start_time=$SECONDS
647
647
  retry_count=0
@@ -140,13 +140,17 @@ def apply(
140
140
  at_client_side)
141
141
  try:
142
142
  mutated_user_request = policy.apply(user_request)
143
+ # Avoid duplicate exception wrapping.
144
+ except exceptions.UserRequestRejectedByPolicy as e:
145
+ with ux_utils.print_exception_no_traceback():
146
+ raise e
143
147
  except Exception as e: # pylint: disable=broad-except
144
148
  with ux_utils.print_exception_no_traceback():
145
149
  raise exceptions.UserRequestRejectedByPolicy(
146
150
  f'{colorama.Fore.RED}User request rejected by policy '
147
151
  f'{policy!r}{colorama.Fore.RESET}: '
148
152
  f'{common_utils.format_exception(e, use_bracket=True)}'
149
- ) from e
153
+ ) from None
150
154
  if mutated_config is None:
151
155
  mutated_config = mutated_user_request.skypilot_config
152
156
  else:
@@ -7,7 +7,6 @@ import colorama
7
7
 
8
8
  from sky import backends
9
9
  from sky.utils import common_utils
10
- from sky.utils import controller_utils
11
10
  from sky.utils import log_utils
12
11
  from sky.utils import resources_utils
13
12
  from sky.utils import status_lib
@@ -137,7 +136,8 @@ def get_total_cost_of_displayed_records(
137
136
 
138
137
  def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord],
139
138
  show_all: bool,
140
- controller_name: Optional[str] = None):
139
+ controller_name: Optional[str] = None,
140
+ days: Optional[int] = None):
141
141
  """Compute cluster table values and display for cost report.
142
142
 
143
143
  For each cluster, this shows: cluster name, resources, launched time,
@@ -200,23 +200,21 @@ def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord],
200
200
  cluster_table.add_row(row)
201
201
 
202
202
  if cluster_records:
203
+ controller_record = cluster_records[0]
203
204
  if controller_name is not None:
204
- controller = controller_utils.Controllers.from_name(controller_name)
205
- if controller is None:
206
- raise ValueError(f'Controller {controller_name} not found.')
207
- controller_handle: backends.CloudVmRayResourceHandle = (
208
- cluster_records[0]['handle'])
209
- autostop_config = (
210
- controller_handle.launched_resources.autostop_config)
211
- if autostop_config is not None:
205
+ autostop = controller_record.get('autostop', None)
206
+ autostop_str = ''
207
+ if autostop is not None:
212
208
  autostop_str = (f'{colorama.Style.DIM} (will be autostopped if '
213
- f'idle for {autostop_config.idle_minutes}min)'
209
+ f'idle for {autostop}min)'
214
210
  f'{colorama.Style.RESET_ALL}')
215
211
  click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
216
212
  f'{controller_name}{colorama.Style.RESET_ALL}'
217
213
  f'{autostop_str}')
218
214
  else:
219
- click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
215
+ days_str = '' if days is None else f' (last {days} days)'
216
+ click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
217
+ f'Clusters{days_str}'
220
218
  f'{colorama.Style.RESET_ALL}')
221
219
  click.echo(cluster_table)
222
220
 
@@ -345,7 +343,9 @@ def _get_infra(cluster_record: _ClusterRecord, truncate: bool = True) -> str:
345
343
 
346
344
 
347
345
  def _get_status_value_for_cost_report(
348
- cluster_cost_report_record: _ClusterCostReportRecord) -> int:
346
+ cluster_cost_report_record: _ClusterCostReportRecord,
347
+ truncate: bool = True) -> int:
348
+ del truncate
349
349
  status = cluster_cost_report_record['status']
350
350
  if status is None:
351
351
  return -1
@@ -353,7 +353,9 @@ def _get_status_value_for_cost_report(
353
353
 
354
354
 
355
355
  def _get_status_for_cost_report(
356
- cluster_cost_report_record: _ClusterCostReportRecord) -> str:
356
+ cluster_cost_report_record: _ClusterCostReportRecord,
357
+ truncate: bool = True) -> str:
358
+ del truncate
357
359
  status = cluster_cost_report_record['status']
358
360
  if status is None:
359
361
  return f'{colorama.Style.DIM}TERMINATED{colorama.Style.RESET_ALL}'
@@ -361,7 +363,9 @@ def _get_status_for_cost_report(
361
363
 
362
364
 
363
365
  def _get_resources_for_cost_report(
364
- cluster_cost_report_record: _ClusterCostReportRecord) -> str:
366
+ cluster_cost_report_record: _ClusterCostReportRecord,
367
+ truncate: bool = True) -> str:
368
+ del truncate
365
369
  launched_nodes = cluster_cost_report_record['num_nodes']
366
370
  launched_resources = cluster_cost_report_record['resources']
367
371
 
@@ -373,7 +377,9 @@ def _get_resources_for_cost_report(
373
377
 
374
378
 
375
379
  def _get_price_for_cost_report(
376
- cluster_cost_report_record: _ClusterCostReportRecord) -> str:
380
+ cluster_cost_report_record: _ClusterCostReportRecord,
381
+ truncate: bool = True) -> str:
382
+ del truncate
377
383
  launched_nodes = cluster_cost_report_record['num_nodes']
378
384
  launched_resources = cluster_cost_report_record['resources']
379
385
 
@@ -383,7 +389,9 @@ def _get_price_for_cost_report(
383
389
 
384
390
 
385
391
  def _get_estimated_cost_for_cost_report(
386
- cluster_cost_report_record: _ClusterCostReportRecord) -> str:
392
+ cluster_cost_report_record: _ClusterCostReportRecord,
393
+ truncate: bool = True) -> str:
394
+ del truncate
387
395
  cost = cluster_cost_report_record['total_cost']
388
396
 
389
397
  if not cost: