skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250617__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. sky/__init__.py +2 -4
  2. sky/backends/cloud_vm_ray_backend.py +43 -60
  3. sky/cli.py +55 -637
  4. sky/client/cli.py +55 -637
  5. sky/clouds/kubernetes.py +3 -0
  6. sky/clouds/scp.py +7 -26
  7. sky/clouds/utils/scp_utils.py +177 -124
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
  10. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
  11. sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_buildManifest.js +1 -1
  12. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  13. sky/dashboard/out/clusters/[cluster].html +1 -1
  14. sky/dashboard/out/clusters.html +1 -1
  15. sky/dashboard/out/config.html +1 -1
  16. sky/dashboard/out/index.html +1 -1
  17. sky/dashboard/out/infra/[context].html +1 -1
  18. sky/dashboard/out/infra.html +1 -1
  19. sky/dashboard/out/jobs/[job].html +1 -1
  20. sky/dashboard/out/jobs.html +1 -1
  21. sky/dashboard/out/users.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/jobs/controller.py +98 -31
  26. sky/jobs/scheduler.py +37 -29
  27. sky/jobs/server/core.py +36 -3
  28. sky/jobs/state.py +69 -9
  29. sky/jobs/utils.py +11 -0
  30. sky/provision/__init__.py +1 -0
  31. sky/provision/scp/__init__.py +15 -0
  32. sky/provision/scp/config.py +93 -0
  33. sky/provision/scp/instance.py +528 -0
  34. sky/resources.py +164 -29
  35. sky/skylet/constants.py +39 -0
  36. sky/skylet/job_lib.py +8 -0
  37. sky/task.py +171 -21
  38. sky/templates/kubernetes-ray.yml.j2 +51 -4
  39. sky/templates/scp-ray.yml.j2 +3 -50
  40. sky/users/permission.py +19 -36
  41. sky/utils/command_runner.py +1 -1
  42. sky/utils/common_utils.py +16 -14
  43. sky/utils/context.py +1 -1
  44. sky/utils/controller_utils.py +12 -3
  45. sky/utils/dag_utils.py +17 -4
  46. sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
  47. sky/utils/schemas.py +43 -5
  48. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/METADATA +1 -1
  49. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/RECORD +54 -57
  50. sky/benchmark/__init__.py +0 -0
  51. sky/benchmark/benchmark_state.py +0 -295
  52. sky/benchmark/benchmark_utils.py +0 -641
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
  55. sky/skylet/providers/scp/__init__.py +0 -2
  56. sky/skylet/providers/scp/config.py +0 -149
  57. sky/skylet/providers/scp/node_provider.py +0 -578
  58. /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_ssgManifest.js +0 -0
  59. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/WHEEL +0 -0
  60. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/entry_points.txt +0 -0
  61. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/licenses/LICENSE +0 -0
  62. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/top_level.txt +0 -0
@@ -1,641 +0,0 @@
1
- """Benchmark utils: Utility functions to manage benchmarking process."""
2
- import copy
3
- import getpass
4
- import glob
5
- import json
6
- from multiprocessing import pool
7
- import os
8
- import subprocess
9
- import sys
10
- import tempfile
11
- import textwrap
12
- import time
13
- import typing
14
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
15
- import uuid
16
-
17
- import colorama
18
- import prettytable
19
- from rich import progress as rich_progress
20
-
21
- import sky
22
- from sky import backends
23
- from sky import clouds
24
- from sky import data
25
- from sky import global_user_state
26
- from sky import optimizer
27
- from sky import sky_logging
28
- from sky.backends import backend_utils
29
- from sky.benchmark import benchmark_state
30
- from sky.data import storage as storage_lib
31
- from sky.skylet import constants
32
- from sky.skylet import job_lib
33
- from sky.skylet import log_lib
34
- from sky.utils import common_utils
35
- from sky.utils import log_utils
36
- from sky.utils import rich_utils
37
- from sky.utils import status_lib
38
- from sky.utils import subprocess_utils
39
- from sky.utils import ux_utils
40
-
41
- if typing.TYPE_CHECKING:
42
- from sky import resources as resources_lib
43
-
44
- logger = sky_logging.init_logger(__name__)
45
-
46
- _SKY_LOCAL_BENCHMARK_DIR = os.path.expanduser('~/.sky/benchmarks')
47
- _SKY_REMOTE_BENCHMARK_DIR = '~/.sky/sky_benchmark_dir'
48
- # NOTE: This must be the same as _SKY_REMOTE_BENCHMARK_DIR
49
- # in sky/callbacks/sky_callback/base.py.
50
- _SKY_REMOTE_BENCHMARK_DIR_SYMLINK = '~/sky_benchmark_dir'
51
-
52
- # NOTE: This must be the same as _BENCHMARK_SUMMARY
53
- # in sky/callbacks/sky_callback/base.py.
54
- _BENCHMARK_SUMMARY = 'summary.json'
55
- _RUN_START = 'run_start.txt'
56
- _RUN_END = 'run_end.txt'
57
-
58
- _Config = Dict[str, Any]
59
-
60
-
61
- def _generate_cluster_names(benchmark: str, num_clusters: int) -> List[str]:
62
- if num_clusters == 1:
63
- names = [f'sky-bench-{benchmark}']
64
- else:
65
- names = [f'sky-bench-{benchmark}-{i}' for i in range(num_clusters)]
66
- for name in names:
67
- if global_user_state.get_cluster_from_name(name) is not None:
68
- with ux_utils.print_exception_no_traceback():
69
- raise ValueError(f'Cluster name {name} is taken. '
70
- 'Try using a different benchmark name.')
71
- return names
72
-
73
-
74
- def _make_script_with_timelogs(script: str, start_path: str,
75
- end_path: str) -> str:
76
- """Add prologue and epilogue that log the start and end times of the script.
77
-
78
- Using the logs, we can get the job status and duration even when the cluster
79
- has stopped or terminated. Note that the end time is only logged when the
80
- command finishes successfully.
81
- """
82
- return textwrap.dedent(f"""\
83
- echo $(date +%s.%N) > {start_path}
84
- {script}
85
- EXIT_CODE=$?
86
- if [ $EXIT_CODE -eq 0 ]; then
87
- echo $(date +%s.%N) > {end_path}
88
- fi
89
- exit $EXIT_CODE
90
- """)
91
-
92
-
93
- def _get_optimized_resources(
94
- candidate_configs: List[_Config]) -> List['resources_lib.Resources']:
95
- candidate_configs = copy.deepcopy(candidate_configs)
96
- optimized_resources = []
97
- for config in candidate_configs:
98
- with sky.Dag() as dag:
99
- resources = config.get('resources', None)
100
- resources = sky.Resources.from_yaml_config(resources)
101
- task = sky.Task()
102
- task.set_resources(resources)
103
-
104
- # Do not use `sky.optimize` here, as this should be called on the API
105
- # server side.
106
- dag = optimizer.Optimizer.optimize(dag, quiet=True)
107
- task = dag.tasks[0]
108
- optimized_resources.append(task.best_resources)
109
- return optimized_resources
110
-
111
-
112
- def _print_candidate_resources(
113
- benchmark: str, clusters: List[str], config: _Config,
114
- candidate_resources: List['resources_lib.Resources']) -> None:
115
- task_str = config.get('name', 'a task')
116
- num_nodes = config.get('num_nodes', 1)
117
- logger.info(f'{colorama.Style.BRIGHT}Benchmarking {task_str} '
118
- f'on candidate resources (benchmark name: {benchmark}):'
119
- f'{colorama.Style.RESET_ALL}')
120
-
121
- columns = [
122
- 'CLUSTER',
123
- 'CLOUD',
124
- '# NODES',
125
- 'INSTANCE',
126
- 'vCPUs',
127
- 'Mem(GB)',
128
- 'ACCELERATORS',
129
- 'PRICE ($/hr)',
130
- ]
131
- table_kwargs = {
132
- 'hrules': prettytable.FRAME,
133
- 'vrules': prettytable.NONE,
134
- 'border': True,
135
- }
136
- candidate_table = log_utils.create_table(columns, **table_kwargs)
137
-
138
- for cluster, resources in zip(clusters, candidate_resources):
139
- if resources.accelerators is None:
140
- accelerators = '-'
141
- else:
142
- accelerator, count = list(resources.accelerators.items())[0]
143
- accelerators = f'{accelerator}:{count}'
144
- cloud = resources.cloud
145
- vcpus, mem = cloud.get_vcpus_mem_from_instance_type(
146
- resources.instance_type)
147
-
148
- def format_number(x):
149
- if x is None:
150
- return '-'
151
- elif x.is_integer():
152
- return str(int(x))
153
- else:
154
- return f'{x:.1f}'
155
-
156
- vcpus = format_number(vcpus)
157
- mem = format_number(mem)
158
- cost = num_nodes * resources.get_cost(3600)
159
- spot = '[Spot]' if resources.use_spot else ''
160
- row = [
161
- cluster, cloud, num_nodes, resources.instance_type + spot, vcpus,
162
- mem, accelerators, f'{cost:.2f}'
163
- ]
164
- candidate_table.add_row(row)
165
- logger.info(f'{candidate_table}\n')
166
-
167
-
168
- def _create_benchmark_bucket() -> Tuple[str, str]:
169
- # Generate a bucket name.
170
- # TODO(woosuk): Use a more pleasant naming scheme.
171
- # TODO(woosuk): Ensure that the bucket name is globally unique.
172
- bucket_name = f'sky-bench-{uuid.uuid4().hex[:4]}-{getpass.getuser()}'
173
-
174
- # Select the bucket type.
175
- enabled_clouds = (
176
- storage_lib.get_cached_enabled_storage_cloud_names_or_refresh(
177
- raise_if_no_cloud_access=True))
178
- # Sky Benchmark only supports S3 (see _download_remote_dir and
179
- # _delete_remote_dir).
180
- enabled_clouds = [
181
- cloud for cloud in enabled_clouds if cloud in [str(clouds.AWS())]
182
- ]
183
- assert enabled_clouds, ('No enabled cloud storage found. Sky Benchmark '
184
- 'requires GCP or AWS to store logs.')
185
- bucket_type = data.StoreType.from_cloud(enabled_clouds[0]).value
186
-
187
- # Create a benchmark bucket.
188
- logger.info(f'Creating a bucket {bucket_name} to save the benchmark logs.')
189
- storage = data.Storage(bucket_name, source=None, persistent=True)
190
- storage.construct()
191
- storage.add_store(bucket_type)
192
-
193
- # Save the bucket name and type to the config.
194
- benchmark_state.set_benchmark_bucket(bucket_name, bucket_type)
195
- return bucket_name, bucket_type
196
-
197
-
198
- def _format_err_msg(msg: str):
199
- return f'{colorama.Fore.RED}{msg}{colorama.Style.RESET_ALL}'
200
-
201
-
202
- def _parallel_run_with_interrupt_handling(func: Callable,
203
- args: List[Any]) -> List[Any]:
204
- with pool.ThreadPool(processes=len(args)) as p:
205
- try:
206
- return list(p.imap(func, args))
207
- except KeyboardInterrupt:
208
- print()
209
- logger.error(_format_err_msg('Interrupted by user.'))
210
- subprocess_utils.run('sky status')
211
- sys.exit(1)
212
-
213
-
214
- def _launch_with_log_suppress_exception(
215
- cluster: str, cmd: List[str],
216
- log_dir: str) -> Union[Tuple[int, str], Exception]:
217
- """Executes `sky launch` in a subprocess and returns normally.
218
-
219
- This function does not propagate any error so that failures in a
220
- launch thread do not disrupt the other parallel launch threads.
221
- """
222
- prefix_color = colorama.Fore.MAGENTA
223
- prefix = f'{prefix_color}({cluster}){colorama.Style.RESET_ALL} '
224
- try:
225
- returncode, _, stderr = log_lib.run_with_log(
226
- cmd,
227
- log_path=os.path.join(log_dir, f'{cluster}.log'),
228
- stream_logs=True,
229
- streaming_prefix=prefix,
230
- start_streaming_at='Creating a new cluster: ',
231
- skip_lines=[
232
- 'Tip: to reuse an existing cluster, specify --cluster (-c).',
233
- ],
234
- end_streaming_at='Job submitted with Job ID: ',
235
- require_outputs=True,
236
- )
237
- # Report any error from the `sky launch` subprocess.
238
- return returncode, stderr
239
- except Exception as e: # pylint: disable=broad-except
240
- # FIXME(woosuk): Avoid using general Exception.
241
- # Report any error in executing and processing the outputs of
242
- # the `sky launch` subprocess.
243
- return e
244
-
245
-
246
- def _download_remote_dir(remote_dir: str, local_dir: str,
247
- bucket_type: data.StoreType) -> None:
248
- # FIXME(woosuk): Replace this function with bucket.download_remote_dir.
249
- if bucket_type == data.StoreType.S3:
250
- remote_dir = f's3://{remote_dir}'
251
- subprocess.run(
252
- ['aws', 's3', 'cp', '--recursive', remote_dir, local_dir],
253
- stdout=subprocess.DEVNULL,
254
- stderr=subprocess.DEVNULL,
255
- check=True)
256
- else:
257
- raise RuntimeError(f'{bucket_type} is not supported yet.')
258
-
259
-
260
- def _delete_remote_dir(remote_dir: str, bucket_type: data.StoreType) -> None:
261
- # FIXME(woosuk): Replace this function with bucket.delete_remote_dir.
262
- if bucket_type == data.StoreType.S3:
263
- remote_dir = f's3://{remote_dir}'
264
- subprocess.run(['aws', 's3', 'rm', '--recursive', remote_dir],
265
- stdout=subprocess.DEVNULL,
266
- stderr=subprocess.DEVNULL,
267
- check=True)
268
- else:
269
- raise RuntimeError(f'{bucket_type} is not supported yet.')
270
-
271
-
272
- def _read_timestamp(path: str) -> float:
273
- with open(path, 'r', encoding='utf-8') as f:
274
- timestamp = f.readlines()
275
- assert len(timestamp) == 1
276
- return float(timestamp[0].strip())
277
-
278
-
279
- def _update_benchmark_result(benchmark_result: Dict[str, Any]) -> Optional[str]:
280
- benchmark = benchmark_result['benchmark']
281
- benchmark_status = benchmark_result['status']
282
- cluster = benchmark_result['cluster']
283
- if benchmark_status.is_terminal():
284
- # No need to update.
285
- return
286
-
287
- # Get the start and end timestamps if exist.
288
- local_dir = os.path.join(_SKY_LOCAL_BENCHMARK_DIR, benchmark, cluster)
289
- run_start_path = os.path.join(local_dir, _RUN_START)
290
- start_time = None
291
- if os.path.exists(run_start_path):
292
- start_time = _read_timestamp(run_start_path)
293
- run_end_path = os.path.join(local_dir, _RUN_END)
294
- end_time = None
295
- if os.path.exists(run_end_path):
296
- # The job has terminated with a zero exit code. See
297
- # generate_benchmark_configs() which ensures the 'run' commands write
298
- # out end_time remotely on success; and the caller of this func which
299
- # downloads all benchmark log files including the end_time file to
300
- # local.
301
- end_time = _read_timestamp(run_end_path)
302
-
303
- # Get the status of the benchmarking cluster and job.
304
- record = global_user_state.get_cluster_from_name(cluster)
305
- cluster_status = None
306
- job_status = None
307
- if record is not None:
308
- cluster_status, handle = backend_utils.refresh_cluster_status_handle(
309
- cluster)
310
- if handle is not None:
311
- backend = backend_utils.get_backend_from_handle(handle)
312
- assert isinstance(backend, backends.CloudVmRayBackend)
313
-
314
- if cluster_status == status_lib.ClusterStatus.UP:
315
- # NOTE: The id of the benchmarking job must be 1.
316
- # TODO(woosuk): Handle exceptions.
317
- job_status = backend.get_job_status(handle,
318
- job_ids=[1],
319
- stream_logs=False)[1]
320
-
321
- logger.debug(f'Cluster {cluster}, cluster_status: {cluster_status}, '
322
- f'benchmark_status {benchmark_status}, job_status: '
323
- f'{job_status}, start_time {start_time}, end_time {end_time}')
324
-
325
- # Update the benchmark status.
326
- if end_time is not None:
327
- # The job has terminated with zero exit code.
328
- benchmark_status = benchmark_state.BenchmarkStatus.FINISHED
329
- elif cluster_status is None:
330
- # Candidate cluster: preempted or never successfully launched.
331
- #
332
- # Note that benchmark record is only inserted after all clusters
333
- # finished launch() (successful or not). See
334
- # launch_benchmark_clusters(). So this case doesn't include "just before
335
- # candidate cluster's launch() is called".
336
-
337
- # See above: if cluster_status is not UP, job_status is defined as None.
338
- assert job_status is None, job_status
339
- benchmark_status = benchmark_state.BenchmarkStatus.TERMINATED
340
- elif cluster_status == status_lib.ClusterStatus.INIT:
341
- # Candidate cluster's launch has something gone wrong, or is still
342
- # launching.
343
-
344
- # See above: if cluster_status is not UP, job_status is defined as None.
345
- assert job_status is None, job_status
346
- benchmark_status = benchmark_state.BenchmarkStatus.INIT
347
- elif cluster_status == status_lib.ClusterStatus.STOPPED:
348
- # Candidate cluster is auto-stopped, or user manually stops it at any
349
- # time. Also, end_time is None.
350
-
351
- # See above: if cluster_status is not UP, job_status is defined as None.
352
- assert job_status is None, job_status
353
- benchmark_status = benchmark_state.BenchmarkStatus.TERMINATED
354
- else:
355
- assert cluster_status == status_lib.ClusterStatus.UP, (
356
- 'ClusterStatus enum should have been handled')
357
- if job_status is None:
358
- benchmark_status = benchmark_state.BenchmarkStatus.INIT
359
- else:
360
- if job_status < job_lib.JobStatus.RUNNING:
361
- benchmark_status = benchmark_state.BenchmarkStatus.INIT
362
- elif job_status == job_lib.JobStatus.RUNNING:
363
- benchmark_status = benchmark_state.BenchmarkStatus.RUNNING
364
- else:
365
- assert job_status.is_terminal(), '> RUNNING means terminal'
366
- # Case: cluster_status UP, job_status.is_terminal()
367
- if job_status == job_lib.JobStatus.SUCCEEDED:
368
- # Since we download the benchmark logs before checking the
369
- # cluster status, there is a chance that the end timestamp
370
- # is saved and the cluster is stopped AFTER we download the
371
- # logs. In this case, we consider the current timestamp as
372
- # the end time.
373
- end_time = time.time()
374
- benchmark_status = benchmark_state.BenchmarkStatus.FINISHED
375
- else:
376
- benchmark_status = (
377
- benchmark_state.BenchmarkStatus.TERMINATED)
378
-
379
- callback_log_dirs = glob.glob(os.path.join(local_dir, 'sky-callback-*'))
380
- if callback_log_dirs:
381
- # There can be multiple logs if the cluster has executed multiple jobs.
382
- # Here, we consider the first log as the log of the benchmarking job.
383
- log_dir = sorted(callback_log_dirs)[0]
384
- summary_path = os.path.join(log_dir, _BENCHMARK_SUMMARY)
385
- else:
386
- summary_path = None
387
-
388
- message = None
389
- if summary_path is not None and os.path.exists(summary_path):
390
- # (1) SkyCallback has saved the summary.
391
- with open(summary_path, 'r', encoding='utf-8') as f:
392
- summary = json.load(f)
393
- if end_time is None:
394
- last_time = summary['last_step_time']
395
- else:
396
- last_time = end_time
397
- if last_time is None:
398
- if job_status == job_lib.JobStatus.RUNNING:
399
- last_time = time.time()
400
- else:
401
- message = (f'No duration information found for {cluster}. '
402
- 'Check if at least 1 step has finished.')
403
- record = benchmark_state.BenchmarkRecord(
404
- start_time=start_time,
405
- last_time=last_time,
406
- num_steps_so_far=summary['num_steps'],
407
- seconds_per_step=summary['time_per_step'],
408
- estimated_total_seconds=summary['estimated_total_time'],
409
- )
410
- elif end_time is not None:
411
- # (2) The benchmarking job has terminated normally
412
- # without SkyCallback logs.
413
- record = benchmark_state.BenchmarkRecord(start_time=start_time,
414
- last_time=end_time)
415
- elif job_status == job_lib.JobStatus.RUNNING:
416
- # (3) SkyCallback is not initialized yet or not used.
417
- message = ('SkyCallback is not initialized yet '
418
- f'or not used for {cluster}.')
419
- record = benchmark_state.BenchmarkRecord(start_time=start_time,
420
- last_time=time.time())
421
- elif benchmark_status == benchmark_state.BenchmarkStatus.TERMINATED:
422
- # (4) The benchmarking job has terminated abnormally.
423
- message = (f'The benchmarking job on {cluster} has terminated with '
424
- 'non-zero exit code.')
425
- record = benchmark_state.BenchmarkRecord(start_time=start_time,
426
- last_time=None)
427
- else:
428
- # (5) Otherwise (e.g., cluster_status is INIT).
429
- message = f'No benchmark logs found for {cluster}.'
430
- record = benchmark_state.BenchmarkRecord(start_time=None,
431
- last_time=None)
432
- benchmark_state.update_benchmark_result(benchmark, cluster,
433
- benchmark_status, record)
434
- return message
435
-
436
-
437
- def generate_benchmark_configs(
438
- benchmark: str,
439
- config: _Config,
440
- candidates: List[Dict[str, str]],
441
- ) -> Tuple[List[str], List[_Config]]:
442
- # Generate a config for each cluster.
443
- clusters = _generate_cluster_names(benchmark, len(candidates))
444
- candidate_configs = []
445
- # TODO(woosuk): Use a jinja template.
446
- for cluster, candidate in zip(clusters, candidates):
447
- # Re-override the config with each candidate config.
448
- candidate_config = copy.deepcopy(config)
449
- if 'resources' not in candidate_config:
450
- candidate_config['resources'] = {}
451
- if 'candidates' in candidate_config['resources']:
452
- del candidate_config['resources']['candidates']
453
- candidate_config['resources'].update(candidate)
454
-
455
- # Mount the benchmark bucket to SKY_BENCHMARK_DIR.
456
- if 'file_mounts' not in candidate_config:
457
- candidate_config['file_mounts'] = {}
458
- # The bucket name and type are specified at launch time.
459
- candidate_config['file_mounts'][_SKY_REMOTE_BENCHMARK_DIR] = {
460
- 'name': None,
461
- 'mode': 'MOUNT',
462
- 'store': None,
463
- }
464
-
465
- benchmark_dir = os.path.join(_SKY_REMOTE_BENCHMARK_DIR, benchmark,
466
- cluster)
467
- if 'setup' not in candidate_config:
468
- candidate_config['setup'] = ''
469
- # Create a symbolic link to a directory in the benchmark bucket.
470
- candidate_config['setup'] = textwrap.dedent(f"""\
471
- mkdir -p {benchmark_dir}
472
- ln -s {benchmark_dir} {_SKY_REMOTE_BENCHMARK_DIR_SYMLINK}
473
- {candidate_config['setup']}""")
474
-
475
- # Log the start and end time of the benchmarking task.
476
- if 'run' not in candidate_config:
477
- candidate_config['run'] = ''
478
- candidate_config['run'] = _make_script_with_timelogs(
479
- candidate_config['run'], os.path.join(benchmark_dir, _RUN_START),
480
- os.path.join(benchmark_dir, _RUN_END))
481
-
482
- candidate_configs.append(candidate_config)
483
- return clusters, candidate_configs
484
-
485
-
486
- def print_benchmark_clusters(benchmark: str, clusters: List[str],
487
- config: _Config,
488
- candidate_configs: List[_Config]) -> None:
489
- candidate_resources = _get_optimized_resources(candidate_configs)
490
- _print_candidate_resources(benchmark, clusters, config, candidate_resources)
491
-
492
-
493
- def launch_benchmark_clusters(benchmark: str, clusters: List[str],
494
- candidate_configs: List[_Config],
495
- commandline_args: List[Dict[str, Any]]) -> bool:
496
- # Use a Sky storage to save the benchmark logs.
497
- bucket_name, bucket_type = benchmark_state.get_benchmark_bucket()
498
- if bucket_name is not None:
499
- handle = global_user_state.get_handle_from_storage_name(bucket_name)
500
- if handle is not None:
501
- assert bucket_type is not None
502
-
503
- # If the bucket does not exist, create one.
504
- if bucket_name is None or handle is None:
505
- bucket_name, bucket_type = _create_benchmark_bucket()
506
-
507
- # Remove the previous benchmark logs if exist.
508
- remove_benchmark_logs(benchmark, bucket_name, data.StoreType[bucket_type])
509
-
510
- # The benchmark bucket is mounted to _SKY_REMOTE_BENCHMARK_DIR.
511
- for candidate_config in candidate_configs:
512
- bucket_config = candidate_config['file_mounts'][
513
- _SKY_REMOTE_BENCHMARK_DIR]
514
- bucket_config['name'] = bucket_name
515
- bucket_config['store'] = bucket_type
516
-
517
- # Generate a temporary yaml file for each cluster.
518
- yaml_fds = []
519
- for cluster, candidate_config in zip(clusters, candidate_configs):
520
- # pylint: disable=consider-using-with
521
- f = tempfile.NamedTemporaryFile('w',
522
- prefix=f'{cluster}-',
523
- suffix='.yaml')
524
- common_utils.dump_yaml(f.name, candidate_config)
525
- yaml_fds.append(f)
526
- logger.debug(f'Generated temporary yaml file: {f.name}')
527
-
528
- # Generate a common launch command.
529
- cmd = ['-d', '-y']
530
- for arg_name, arg in commandline_args.items():
531
- if isinstance(arg, list):
532
- # 'env' arguments.
533
- for v in arg:
534
- cmd += [f'--{arg_name}', str(v)]
535
- else:
536
- cmd += [f'--{arg_name}', str(arg)]
537
-
538
- # Generate a launch command for each cluster.
539
- launch_cmds = [['sky', 'launch', yaml_fd.name, '-c', cluster] + cmd
540
- for yaml_fd, cluster in zip(yaml_fds, clusters)]
541
-
542
- # Save stdout/stderr from cluster launches.
543
- run_timestamp = sky_logging.get_run_timestamp()
544
- log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp)
545
- log_dir = os.path.expanduser(log_dir)
546
- logger.info(
547
- f'{colorama.Fore.YELLOW}To view stdout/stderr from individual '
548
- f'cluster launches, check the logs in{colorama.Style.RESET_ALL} '
549
- f'{colorama.Style.BRIGHT}{log_dir}/{colorama.Style.RESET_ALL}')
550
-
551
- # Launch the benchmarking clusters in parallel.
552
- outputs = _parallel_run_with_interrupt_handling(
553
- lambda args: _launch_with_log_suppress_exception(*args, log_dir=log_dir
554
- ),
555
- list(zip(clusters, launch_cmds)))
556
-
557
- # Handle the errors raised during the cluster launch.
558
- for cluster, output in zip(clusters, outputs):
559
- if isinstance(output, Exception):
560
- logger.error(_format_err_msg(f'Launching {cluster} failed.'))
561
- logger.error(output)
562
- else:
563
- returncode, stderr = output
564
- if returncode != 0:
565
- message = _format_err_msg(
566
- f'Launching {cluster} failed with code {returncode}.')
567
- logger.error(message)
568
- logger.error(stderr)
569
-
570
- # Delete the temporary yaml files.
571
- for f in yaml_fds:
572
- f.close()
573
-
574
- # If at least one cluster has been provisioned (in whatever state),
575
- # add the benchmark to the state so that `sky bench down` can
576
- # terminate the launched clusters.
577
- benchmark_created = False
578
- for cluster in clusters:
579
- record = global_user_state.get_cluster_from_name(cluster)
580
- if record is not None:
581
- if not benchmark_created:
582
- task_name = candidate_configs[0].get('name', None)
583
- benchmark_state.add_benchmark(benchmark, task_name, bucket_name)
584
- benchmark_created = True
585
- benchmark_state.add_benchmark_result(benchmark, record['handle'])
586
- return benchmark_created
587
-
588
-
589
- def update_benchmark_state(benchmark: str) -> None:
590
- benchmark_results = benchmark_state.get_benchmark_results(benchmark)
591
- if all(result['status'].is_terminal() for result in benchmark_results):
592
- return
593
-
594
- bucket_name = benchmark_state.get_benchmark_from_name(benchmark)['bucket']
595
- handle = global_user_state.get_handle_from_storage_name(bucket_name)
596
- bucket_type = list(handle.sky_stores.keys())[0]
597
-
598
- # Download the benchmark logs from the benchmark bucket.
599
- # FIXME(woosuk): Do not download the logs if not necessary.
600
- remote_dir = os.path.join(bucket_name, benchmark)
601
- local_dir = os.path.join(_SKY_LOCAL_BENCHMARK_DIR, benchmark)
602
- os.makedirs(local_dir, exist_ok=True)
603
- with rich_utils.safe_status(
604
- ux_utils.spinner_message('Downloading benchmark logs')):
605
- _download_remote_dir(remote_dir, local_dir, bucket_type)
606
-
607
- # Update the benchmark results in parallel.
608
- num_candidates = len(benchmark_results)
609
- plural = 's' if num_candidates > 1 else ''
610
- progress = rich_progress.Progress(transient=True,
611
- redirect_stdout=False,
612
- redirect_stderr=False)
613
- task = progress.add_task(ux_utils.spinner_message(
614
- f'Processing {num_candidates} benchmark result{plural}'),
615
- total=num_candidates)
616
-
617
- def _update_with_progress_bar(arg: Any) -> None:
618
- message = _update_benchmark_result(arg)
619
- if message is None:
620
- progress.update(task, advance=1)
621
- else:
622
- progress.stop()
623
- logger.info(
624
- f'{colorama.Fore.YELLOW}{message}{colorama.Style.RESET_ALL}')
625
- progress.start()
626
-
627
- with progress:
628
- _parallel_run_with_interrupt_handling(_update_with_progress_bar,
629
- benchmark_results)
630
- progress.live.transient = False
631
- progress.refresh()
632
-
633
-
634
- def remove_benchmark_logs(benchmark: str, bucket_name: str,
635
- bucket_type: data.StoreType) -> None:
636
- # Delete logs in the benchmark bucket.
637
- remote_dir = os.path.join(bucket_name, benchmark)
638
- _delete_remote_dir(remote_dir, bucket_type)
639
- # Delete logs in the local storage.
640
- local_dir = os.path.join(_SKY_LOCAL_BENCHMARK_DIR, benchmark)
641
- subprocess.run(['rm', '-rf', local_dir], check=False)
@@ -1,6 +0,0 @@
1
- (self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[722],{8706:function(e,s,t){(window.__NEXT_P=window.__NEXT_P||[]).push(["/clusters/[cluster]",function(){return t(6996)}])},6639:function(e,s,t){"use strict";t.d(s,{Z:function(){return a}});/**
2
- * @license lucide-react v0.407.0 - ISC
3
- *
4
- * This source code is licensed under the ISC license.
5
- * See the LICENSE file in the root directory of this source tree.
6
- */let a=(0,t(998).Z)("ChevronRight",[["path",{d:"m9 18 6-6-6-6",key:"mthhwq"}]])},6996:function(e,s,t){"use strict";t.r(s);var a=t(5893),r=t(7294),l=t(8799),n=t(938),i=t(1163);t(9470);var c=t(1664),d=t.n(c),o=t(9037),m=t(9307),x=t(7673),u=t(3266),h=t(3626),j=t(282),f=t(8671),N=t(5895),y=t(6639),b=t(1272),v=t(6989),p=t(9284),g=t(3001),w=t(9008),k=t.n(w);let _=(e,s)=>{let t="",a="";return e>=0&&(t=e+"m",a=" "),s&&(t+="".concat(a,"(down)")),""===t&&(t="-"),t};function C(e){let{clusterData:s,clusterJobData:t,clusterJobsLoading:l,refreshClusterJobsOnly:i}=e,[c,o]=(0,r.useState)(!1),[u,h]=(0,r.useState)(!1),[p,g]=(0,r.useState)(!1),w=async()=>{try{let e=C(s.task_yaml);await navigator.clipboard.writeText(e),h(!0),setTimeout(()=>h(!1),2e3)}catch(e){console.error("Failed to copy YAML to clipboard:",e)}},k=async()=>{try{await navigator.clipboard.writeText(s.command),g(!0),setTimeout(()=>g(!1),2e3)}catch(e){console.error("Failed to copy command to clipboard:",e)}},C=e=>{if(!e)return"No YAML available";try{let s=b.ZP.load(e),t=b.ZP.dump(s,{lineWidth:-1,styles:{"!!str":"literal"},quotingType:"'",forceQuotes:!1,noRefs:!0,sortKeys:!1,condenseFlow:!1,indent:2}).split("\n"),a=[],r=-1;for(let e=0;e<t.length;e++){let s=t[e],l=s.search(/\S/);0===l&&r>=0&&e>0&&a.push(""),a.push(s),r=l}return a.join("\n").trim()}catch(s){return console.error("YAML formatting error:",s),e}},S=(null==s?void 0:s.command)||(null==s?void 0:s.task_yaml);return(0,a.jsxs)("div",{children:[(0,a.jsx)("div",{className:"mb-6",children:(0,a.jsxs)(x.Zb,{children:[(0,a.jsx)("div",{className:"flex items-center justify-between px-4 pt-4",children:(0,a.jsx)("h3",{className:"text-lg font-semibold",children:"Details"})}),(0,a.jsx)("div",{className:"p-4",children:(0,a.jsxs)("div",{className:"grid grid-cols-2 gap-6",children:[(0,a.jsxs)("div",{children:[(0,a.jsx)("div",{className:"text-gray-600 font-medium text-base",children:"Status"}),(0,a.jsx)("div",{className:"text-base mt-1",children:(0,a.jsx)(m.OE,{status:s.status})})]}),(0,a.jsxs)("div",{children:[(0,a.jsx)("div",{className:"text-gray-600 font-medium text-base",children:"Cluster"}),(0,a.jsx)("div",{className:"text-base mt-1",children:s.cluster})]}),(0,a.jsxs)("div",{children:[(0,a.jsx)("div",{className:"text-gray-600 font-medium text-base",children:"User"}),(0,a.jsx)("div",{className:"text-base mt-1",children:s.user})]}),(0,a.jsxs)("div",{children:[(0,a.jsx)("div",{className:"text-gray-600 font-medium text-base",children:"Infra"}),(0,a.jsx)("div",{className:"text-base mt-1",children:s.infra?(0,a.jsx)(v.Md,{content:s.full_infra||s.infra,className:"text-sm text-muted-foreground",children:(0,a.jsxs)("span",{children:[(0,a.jsx)(d(),{href:"/infra",className:"text-blue-600 hover:underline",children:s.cloud||s.infra.split("(")[0].trim()}),s.infra.includes("(")&&(0,a.jsx)("span",{children:" "+s.infra.substring(s.infra.indexOf("("))})]})}):"N/A"})]}),(0,a.jsxs)("div",{children:[(0,a.jsx)("div",{className:"text-gray-600 font-medium text-base",children:"Resources"}),(0,a.jsx)("div",{className:"text-base mt-1",children:s.resources_str_full||s.resources_str||"N/A"})]}),(0,a.jsxs)("div",{children:[(0,a.jsx)("div",{className:"text-gray-600 font-medium text-base",children:"Started"}),(0,a.jsx)("div",{className:"text-base mt-1",children:s.time?new Date(s.time).toLocaleString():"N/A"})]}),(0,a.jsxs)("div",{children:[(0,a.jsx)("div",{className:"text-gray-600 font-medium text-base",children:"Autostop"}),(0,a.jsx)("div",{className:"text-base mt-1",children:_(s.autostop,s.to_down)})]}),S&&(0,a.jsxs)("div",{className:"col-span-2",children:[(0,a.jsxs)("div",{className:"flex items-center",children:[(0,a.jsx)("div",{className:"text-gray-600 font-medium text-base",children:"Entrypoint"}),s.command&&(0,a.jsx)(v.WH,{content:p?"Copied!":"Copy command",className:"text-muted-foreground",children:(0,a.jsx)("button",{onClick:k,className:"flex items-center text-gray-500 hover:text-gray-700 transition-colors duration-200 p-1 ml-2",children:p?(0,a.jsx)(j.Z,{className:"w-4 h-4 text-green-600"}):(0,a.jsx)(f.Z,{className:"w-4 h-4"})})})]}),(0,a.jsxs)("div",{className:"space-y-4 mt-3",children:[s.command&&(0,a.jsx)("div",{children:(0,a.jsx)("div",{className:"bg-gray-50 border border-gray-200 rounded-md p-3",children:(0,a.jsx)("code",{className:"text-sm text-gray-800 font-mono break-all",children:s.command})})}),s.task_yaml&&"{}"!==s.task_yaml&&!s.cluster.startsWith("sky-jobs-controller-")&&!s.cluster.startsWith("sky-serve-controller-")&&(0,a.jsxs)("div",{children:[(0,a.jsxs)("div",{className:"flex items-center mb-2",children:[(0,a.jsxs)("button",{onClick:()=>{o(!c)},className:"flex items-center text-left focus:outline-none text-gray-700 hover:text-gray-900 transition-colors duration-200",children:[c?(0,a.jsx)(N.Z,{className:"w-4 h-4 mr-1"}):(0,a.jsx)(y.Z,{className:"w-4 h-4 mr-1"}),(0,a.jsx)("span",{className:"text-base",children:"Show SkyPilot YAML"})]}),(0,a.jsx)(v.WH,{content:u?"Copied!":"Copy YAML",className:"text-muted-foreground",children:(0,a.jsx)("button",{onClick:w,className:"flex items-center text-gray-500 hover:text-gray-700 transition-colors duration-200 p-1 ml-2",children:u?(0,a.jsx)(j.Z,{className:"w-4 h-4 text-green-600"}):(0,a.jsx)(f.Z,{className:"w-4 h-4"})})})]}),c&&(0,a.jsx)("div",{className:"bg-gray-50 border border-gray-200 rounded-md p-3 max-h-96 overflow-y-auto",children:(0,a.jsx)("pre",{className:"text-sm text-gray-800 font-mono whitespace-pre-wrap",children:C(s.task_yaml)})})]})]})]})]})})]})}),(0,a.jsx)("div",{className:"mb-8",children:(0,a.jsx)(n.ClusterJobs,{clusterName:s.cluster,clusterJobData:t,loading:l,refreshClusterJobsOnly:i})})]})}s.default=function(){let e=(0,i.useRouter)(),{cluster:s}=e.query,[t,n]=(0,r.useState)(!1),[c,m]=(0,r.useState)(!0),[x,j]=(0,r.useState)(!1),[f,N]=(0,r.useState)(!1),y=(0,g.X)(),{clusterData:b,clusterJobData:w,loading:_,clusterDetailsLoading:S,clusterJobsLoading:Z,refreshData:L,refreshClusterJobsOnly:O}=(0,u.QL)({cluster:s});r.useEffect(()=>{!S&&c&&m(!1)},[S,c]);let A=async()=>{n(!0),await L(),n(!1)};if(!e.isReady)return(0,a.jsx)("div",{children:"Loading..."});let D=s?"Cluster: ".concat(s," | SkyPilot Dashboard"):"Cluster Details | SkyPilot Dashboard";return(0,a.jsxs)(a.Fragment,{children:[(0,a.jsx)(k(),{children:(0,a.jsx)("title",{children:D})}),(0,a.jsxs)(a.Fragment,{children:[(0,a.jsxs)("div",{className:"flex items-center justify-between mb-4 h-5",children:[(0,a.jsxs)("div",{className:"text-base flex items-center",children:[(0,a.jsx)(d(),{href:"/clusters",className:"text-sky-blue hover:underline",children:"Sky Clusters"}),(0,a.jsx)("span",{className:"mx-2 text-gray-500",children:"›"}),(0,a.jsx)(d(),{href:"/clusters/".concat(s),className:"text-sky-blue hover:underline",children:s})]}),(0,a.jsx)("div",{className:"text-sm flex items-center",children:(0,a.jsxs)("div",{className:"text-sm flex items-center",children:[(S||t)&&(0,a.jsxs)("div",{className:"flex items-center mr-4",children:[(0,a.jsx)(l.Z,{size:15,className:"mt-0"}),(0,a.jsx)("span",{className:"ml-2 text-gray-500",children:"Loading..."})]}),b&&(0,a.jsxs)("div",{className:"flex items-center space-x-4",children:[(0,a.jsx)(v.WH,{content:"Refresh",className:"text-sm text-muted-foreground",children:(0,a.jsxs)("button",{onClick:A,disabled:S||t,className:"text-sky-blue hover:text-sky-blue-bright font-medium inline-flex items-center",children:[(0,a.jsx)(h.Z,{className:"w-4 h-4 mr-1.5"}),!y&&(0,a.jsx)("span",{children:"Refresh"})]})}),(0,a.jsx)(o.Status2Actions,{withLabel:!0,cluster:b.cluster,status:b.status,onOpenSSHModal:()=>{j(!0)},onOpenVSCodeModal:()=>{N(!0)}})]})]})})]}),S&&c?(0,a.jsxs)("div",{className:"flex justify-center items-center py-12",children:[(0,a.jsx)(l.Z,{size:24,className:"mr-2"}),(0,a.jsx)("span",{className:"text-gray-500",children:"Loading cluster details..."})]}):b?(0,a.jsx)(C,{clusterData:b,clusterJobData:w,clusterJobsLoading:Z,refreshClusterJobsOnly:O}):null,(0,a.jsx)(p.Oh,{isOpen:x,onClose:()=>j(!1),cluster:s}),(0,a.jsx)(p._R,{isOpen:f,onClose:()=>N(!1),cluster:s})]})]})}},9008:function(e,s,t){e.exports=t(7219)}},function(e){e.O(0,[616,760,799,804,664,798,947,470,901,969,856,973,938,37,888,774,179],function(){return e(e.s=8706)}),_N_E=e.O()}]);