skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +448 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +129 -24
- sky/jobs/utils.py +109 -51
- sky/provision/nebius/constants.py +3 -0
- sky/py.typed +0 -0
- sky/resources.py +16 -12
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +18 -1
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +24 -19
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +72 -68
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → oKqDxFQ88cquF4nQGE_0w}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py
CHANGED
|
@@ -272,6 +272,65 @@ def _merge_env_vars(env_dict: Optional[Dict[str, str]],
|
|
|
272
272
|
return list(env_dict.items())
|
|
273
273
|
|
|
274
274
|
|
|
275
|
+
def _format_job_ids_str(job_ids: List[int], max_length: int = 30) -> str:
|
|
276
|
+
"""Format job IDs string with ellipsis if too long.
|
|
277
|
+
|
|
278
|
+
Args:
|
|
279
|
+
job_ids: List of job IDs to format.
|
|
280
|
+
max_length: Maximum length of the output string.
|
|
281
|
+
|
|
282
|
+
Returns:
|
|
283
|
+
Formatted string like "11,12,...,2017,2018" if truncated,
|
|
284
|
+
or the full string if it fits within max_length.
|
|
285
|
+
"""
|
|
286
|
+
if not job_ids:
|
|
287
|
+
return ''
|
|
288
|
+
|
|
289
|
+
# Convert all to strings
|
|
290
|
+
job_strs = [str(job_id) for job_id in job_ids]
|
|
291
|
+
full_str = ','.join(job_strs)
|
|
292
|
+
|
|
293
|
+
# If it fits, return as is
|
|
294
|
+
if len(full_str) <= max_length:
|
|
295
|
+
return full_str
|
|
296
|
+
|
|
297
|
+
if len(job_strs) <= 2:
|
|
298
|
+
return full_str # Can't truncate further
|
|
299
|
+
|
|
300
|
+
# Need to truncate with ellipsis
|
|
301
|
+
ellipsis = '...'
|
|
302
|
+
|
|
303
|
+
# Start with minimum: first and last
|
|
304
|
+
start_count = 1
|
|
305
|
+
end_count = 1
|
|
306
|
+
|
|
307
|
+
while start_count + end_count < len(job_strs):
|
|
308
|
+
# Try adding one more to start
|
|
309
|
+
if start_count + 1 + end_count < len(job_strs):
|
|
310
|
+
start_part = ','.join(job_strs[:start_count + 1])
|
|
311
|
+
end_part = ','.join(job_strs[-end_count:])
|
|
312
|
+
candidate = f'{start_part},{ellipsis},{end_part}'
|
|
313
|
+
if len(candidate) <= max_length:
|
|
314
|
+
start_count += 1
|
|
315
|
+
continue
|
|
316
|
+
|
|
317
|
+
# Try adding one more to end
|
|
318
|
+
if start_count + end_count + 1 < len(job_strs):
|
|
319
|
+
start_part = ','.join(job_strs[:start_count])
|
|
320
|
+
end_part = ','.join(job_strs[-(end_count + 1):])
|
|
321
|
+
candidate = f'{start_part},{ellipsis},{end_part}'
|
|
322
|
+
if len(candidate) <= max_length:
|
|
323
|
+
end_count += 1
|
|
324
|
+
continue
|
|
325
|
+
|
|
326
|
+
# Can't add more
|
|
327
|
+
break
|
|
328
|
+
|
|
329
|
+
start_part = ','.join(job_strs[:start_count])
|
|
330
|
+
end_part = ','.join(job_strs[-end_count:])
|
|
331
|
+
return f'{start_part},{ellipsis},{end_part}'
|
|
332
|
+
|
|
333
|
+
|
|
275
334
|
def _complete_cluster_name(ctx: click.Context, param: click.Parameter,
|
|
276
335
|
incomplete: str) -> List[str]:
|
|
277
336
|
"""Handle shell completion for cluster names."""
|
|
@@ -1428,17 +1487,20 @@ def _handle_jobs_queue_request(
|
|
|
1428
1487
|
|
|
1429
1488
|
|
|
1430
1489
|
def _handle_services_request(
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1490
|
+
request_id: str,
|
|
1491
|
+
service_names: Optional[List[str]],
|
|
1492
|
+
show_all: bool,
|
|
1493
|
+
show_endpoint: bool,
|
|
1494
|
+
pool: bool = False, # pylint: disable=redefined-outer-name
|
|
1495
|
+
is_called_by_user: bool = False
|
|
1496
|
+
) -> Tuple[Optional[int], str]:
|
|
1436
1497
|
"""Get service statuses.
|
|
1437
1498
|
|
|
1438
1499
|
Args:
|
|
1439
1500
|
service_names: If not None, only show the statuses of these services.
|
|
1440
1501
|
show_all: Show all information of each service.
|
|
1441
1502
|
show_endpoint: If True, only show the endpoint of the service.
|
|
1503
|
+
pool: If True, the request is for a pool. Otherwise for a service.
|
|
1442
1504
|
is_called_by_user: If this function is called by user directly, or an
|
|
1443
1505
|
internal call.
|
|
1444
1506
|
|
|
@@ -1447,6 +1509,7 @@ def _handle_services_request(
|
|
|
1447
1509
|
is an error when querying the services. In this case, msg contains the
|
|
1448
1510
|
error message. Otherwise, msg contains the formatted service table.
|
|
1449
1511
|
"""
|
|
1512
|
+
noun = 'pool' if pool else 'service'
|
|
1450
1513
|
num_services = None
|
|
1451
1514
|
try:
|
|
1452
1515
|
if not is_called_by_user:
|
|
@@ -1483,11 +1546,11 @@ def _handle_services_request(
|
|
|
1483
1546
|
# print the original error.
|
|
1484
1547
|
pass
|
|
1485
1548
|
if not msg:
|
|
1486
|
-
msg = ('Failed to fetch
|
|
1549
|
+
msg = (f'Failed to fetch {noun} statuses due to connection issues. '
|
|
1487
1550
|
'Please try again later. Details: '
|
|
1488
1551
|
f'{common_utils.format_exception(e, use_bracket=True)}')
|
|
1489
1552
|
except Exception as e: # pylint: disable=broad-except
|
|
1490
|
-
msg = ('Failed to fetch
|
|
1553
|
+
msg = (f'Failed to fetch {noun} statuses: '
|
|
1491
1554
|
f'{common_utils.format_exception(e, use_bracket=True)}')
|
|
1492
1555
|
else:
|
|
1493
1556
|
if show_endpoint:
|
|
@@ -1502,14 +1565,16 @@ def _handle_services_request(
|
|
|
1502
1565
|
endpoint = service_records[0]['endpoint']
|
|
1503
1566
|
msg = '-' if endpoint is None else endpoint
|
|
1504
1567
|
else:
|
|
1505
|
-
msg = serve_lib.format_service_table(service_records, show_all
|
|
1568
|
+
msg = serve_lib.format_service_table(service_records, show_all,
|
|
1569
|
+
pool)
|
|
1506
1570
|
service_not_found_msg = ''
|
|
1507
1571
|
if service_names is not None:
|
|
1508
1572
|
for service_name in service_names:
|
|
1509
1573
|
if not any(service_name == record['name']
|
|
1510
1574
|
for record in service_records):
|
|
1511
1575
|
service_not_found_msg += (
|
|
1512
|
-
f'\
|
|
1576
|
+
f'\n{noun.capitalize()} '
|
|
1577
|
+
f'{service_name!r} not found.')
|
|
1513
1578
|
if service_not_found_msg:
|
|
1514
1579
|
msg += f'\n{service_not_found_msg}'
|
|
1515
1580
|
return num_services, msg
|
|
@@ -1665,6 +1730,11 @@ def _show_enabled_infra(active_workspace: str, show_workspace: bool):
|
|
|
1665
1730
|
is_flag=True,
|
|
1666
1731
|
required=False,
|
|
1667
1732
|
help='Also show sky serve services, if any.')
|
|
1733
|
+
@click.option('--show-pools/--no-show-pools',
|
|
1734
|
+
default=True,
|
|
1735
|
+
is_flag=True,
|
|
1736
|
+
required=False,
|
|
1737
|
+
help='Also show cluster pools, if any.')
|
|
1668
1738
|
@click.option(
|
|
1669
1739
|
'--kubernetes',
|
|
1670
1740
|
'--k8s',
|
|
@@ -1684,8 +1754,8 @@ def _show_enabled_infra(active_workspace: str, show_workspace: bool):
|
|
|
1684
1754
|
# pylint: disable=redefined-builtin
|
|
1685
1755
|
def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1686
1756
|
endpoint: Optional[int], show_managed_jobs: bool,
|
|
1687
|
-
show_services: bool,
|
|
1688
|
-
all_users: bool):
|
|
1757
|
+
show_services: bool, show_pools: bool, kubernetes: bool,
|
|
1758
|
+
clusters: List[str], all_users: bool):
|
|
1689
1759
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
1690
1760
|
"""Show clusters.
|
|
1691
1761
|
|
|
@@ -1807,6 +1877,9 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1807
1877
|
def submit_services() -> Optional[str]:
|
|
1808
1878
|
return serve_lib.status(service_names=None)
|
|
1809
1879
|
|
|
1880
|
+
def submit_pools() -> Optional[str]:
|
|
1881
|
+
return managed_jobs.pool_status(pool_names=None)
|
|
1882
|
+
|
|
1810
1883
|
def submit_workspace() -> Optional[str]:
|
|
1811
1884
|
try:
|
|
1812
1885
|
return sdk.workspaces()
|
|
@@ -1823,6 +1896,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1823
1896
|
managed_jobs_queue_request_id = None
|
|
1824
1897
|
service_status_request_id = None
|
|
1825
1898
|
workspace_request_id = None
|
|
1899
|
+
pool_status_request_id = None
|
|
1826
1900
|
|
|
1827
1901
|
# Submit all requests in parallel
|
|
1828
1902
|
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
|
@@ -1830,6 +1904,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1830
1904
|
managed_jobs_request_future = executor.submit(submit_managed_jobs)
|
|
1831
1905
|
if show_services:
|
|
1832
1906
|
services_request_future = executor.submit(submit_services)
|
|
1907
|
+
if show_pools:
|
|
1908
|
+
pools_request_future = executor.submit(submit_pools)
|
|
1833
1909
|
if not (ip or show_endpoints):
|
|
1834
1910
|
workspace_request_future = executor.submit(submit_workspace)
|
|
1835
1911
|
|
|
@@ -1838,13 +1914,17 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1838
1914
|
managed_jobs_queue_request_id = managed_jobs_request_future.result()
|
|
1839
1915
|
if show_services:
|
|
1840
1916
|
service_status_request_id = services_request_future.result()
|
|
1917
|
+
if show_pools:
|
|
1918
|
+
pool_status_request_id = pools_request_future.result()
|
|
1841
1919
|
if not (ip or show_endpoints):
|
|
1842
1920
|
workspace_request_id = workspace_request_future.result()
|
|
1843
1921
|
|
|
1844
|
-
managed_jobs_queue_request_id = '' if not managed_jobs_queue_request_id
|
|
1845
|
-
|
|
1846
|
-
service_status_request_id = '' if not service_status_request_id
|
|
1847
|
-
|
|
1922
|
+
managed_jobs_queue_request_id = ('' if not managed_jobs_queue_request_id
|
|
1923
|
+
else managed_jobs_queue_request_id)
|
|
1924
|
+
service_status_request_id = ('' if not service_status_request_id else
|
|
1925
|
+
service_status_request_id)
|
|
1926
|
+
pool_status_request_id = ('' if not pool_status_request_id else
|
|
1927
|
+
pool_status_request_id)
|
|
1848
1928
|
|
|
1849
1929
|
# Phase 3: Get cluster records and handle special cases
|
|
1850
1930
|
cluster_records = _get_cluster_records_and_set_ssh_config(
|
|
@@ -1919,7 +1999,34 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1919
1999
|
job_info += '. '
|
|
1920
2000
|
hints.append(
|
|
1921
2001
|
controller_utils.Controllers.JOBS_CONTROLLER.value.
|
|
1922
|
-
in_progress_hint.format(job_info=job_info))
|
|
2002
|
+
in_progress_hint(False).format(job_info=job_info))
|
|
2003
|
+
|
|
2004
|
+
if show_pools:
|
|
2005
|
+
num_pools = None
|
|
2006
|
+
if managed_jobs_query_interrupted:
|
|
2007
|
+
msg = 'KeyboardInterrupt'
|
|
2008
|
+
else:
|
|
2009
|
+
with rich_utils.client_status('[cyan]Checking pools[/]'):
|
|
2010
|
+
try:
|
|
2011
|
+
num_pools, msg = _handle_services_request(
|
|
2012
|
+
pool_status_request_id,
|
|
2013
|
+
service_names=None,
|
|
2014
|
+
show_all=False,
|
|
2015
|
+
show_endpoint=False,
|
|
2016
|
+
pool=True,
|
|
2017
|
+
is_called_by_user=False)
|
|
2018
|
+
except KeyboardInterrupt:
|
|
2019
|
+
sdk.api_cancel(pool_status_request_id, silent=True)
|
|
2020
|
+
num_pools = -1
|
|
2021
|
+
msg = 'KeyboardInterrupt'
|
|
2022
|
+
if num_pools is not None:
|
|
2023
|
+
if num_pools > 0:
|
|
2024
|
+
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
|
2025
|
+
f'Pools{colorama.Style.RESET_ALL}')
|
|
2026
|
+
click.echo(msg)
|
|
2027
|
+
hints.append(
|
|
2028
|
+
controller_utils.Controllers.SKY_SERVE_CONTROLLER.value.
|
|
2029
|
+
in_progress_hint(True))
|
|
1923
2030
|
|
|
1924
2031
|
if show_services:
|
|
1925
2032
|
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
|
@@ -1942,8 +2049,9 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1942
2049
|
msg = 'KeyboardInterrupt'
|
|
1943
2050
|
click.echo(msg)
|
|
1944
2051
|
if num_services is not None:
|
|
1945
|
-
hints.append(
|
|
1946
|
-
|
|
2052
|
+
hints.append(
|
|
2053
|
+
controller_utils.Controllers.SKY_SERVE_CONTROLLER.value.
|
|
2054
|
+
in_progress_hint(False))
|
|
1947
2055
|
|
|
1948
2056
|
if num_pending_autostop > 0 and not refresh:
|
|
1949
2057
|
# Don't print this hint if there's no pending autostop or user has
|
|
@@ -4193,6 +4301,17 @@ def jobs():
|
|
|
4193
4301
|
is_flag=True,
|
|
4194
4302
|
help=('If True, as soon as a job is submitted, return from this call '
|
|
4195
4303
|
'and do not stream execution logs.'))
|
|
4304
|
+
@click.option('--pool',
|
|
4305
|
+
'-p',
|
|
4306
|
+
default=None,
|
|
4307
|
+
type=str,
|
|
4308
|
+
required=False,
|
|
4309
|
+
help='(Experimental; optional) Pool to use for jobs submission.')
|
|
4310
|
+
@click.option('--num-jobs',
|
|
4311
|
+
default=None,
|
|
4312
|
+
type=int,
|
|
4313
|
+
required=False,
|
|
4314
|
+
help='Number of jobs to submit.')
|
|
4196
4315
|
@click.option('--git-url', type=str, help='Git repository URL.')
|
|
4197
4316
|
@click.option('--git-ref',
|
|
4198
4317
|
type=str,
|
|
@@ -4226,6 +4345,8 @@ def jobs_launch(
|
|
|
4226
4345
|
ports: Tuple[str],
|
|
4227
4346
|
detach_run: bool,
|
|
4228
4347
|
yes: bool,
|
|
4348
|
+
pool: Optional[str], # pylint: disable=redefined-outer-name
|
|
4349
|
+
num_jobs: Optional[int],
|
|
4229
4350
|
async_call: bool,
|
|
4230
4351
|
config_override: Optional[Dict[str, Any]] = None,
|
|
4231
4352
|
git_url: Optional[str] = None,
|
|
@@ -4245,6 +4366,9 @@ def jobs_launch(
|
|
|
4245
4366
|
|
|
4246
4367
|
sky jobs launch 'echo hello!'
|
|
4247
4368
|
"""
|
|
4369
|
+
if pool is None and num_jobs is not None:
|
|
4370
|
+
raise click.UsageError('Cannot specify --num-jobs without --pool.')
|
|
4371
|
+
|
|
4248
4372
|
if cluster is not None:
|
|
4249
4373
|
if name is not None and name != cluster:
|
|
4250
4374
|
raise click.UsageError('Cannot specify both --name and --cluster. '
|
|
@@ -4295,22 +4419,63 @@ def jobs_launch(
|
|
|
4295
4419
|
|
|
4296
4420
|
common_utils.check_cluster_name_is_valid(name)
|
|
4297
4421
|
|
|
4422
|
+
if pool is not None:
|
|
4423
|
+
num_job_int = num_jobs if num_jobs is not None else 1
|
|
4424
|
+
plural = '' if num_job_int == 1 else 's'
|
|
4425
|
+
click.secho(f'Submitting to pool {colorama.Fore.CYAN}{pool!r}'
|
|
4426
|
+
f'{colorama.Style.RESET_ALL} with {colorama.Fore.CYAN}'
|
|
4427
|
+
f'{num_job_int}{colorama.Style.RESET_ALL} job{plural}.')
|
|
4428
|
+
print_setup_fm_warning = False
|
|
4429
|
+
for task_ in dag.tasks:
|
|
4430
|
+
if (task_.setup is not None or task_.file_mounts or
|
|
4431
|
+
task_.storage_mounts):
|
|
4432
|
+
print_setup_fm_warning = True
|
|
4433
|
+
break
|
|
4434
|
+
if print_setup_fm_warning:
|
|
4435
|
+
click.secho(
|
|
4436
|
+
f'{colorama.Fore.YELLOW}setup/file_mounts/storage_mounts'
|
|
4437
|
+
' will be ignored in pool. To update a pool, please '
|
|
4438
|
+
f'use `sky pool apply {pool} pool.yaml`. '
|
|
4439
|
+
f'{colorama.Style.RESET_ALL}')
|
|
4440
|
+
|
|
4298
4441
|
# Optimize info is only show if _need_confirmation.
|
|
4299
4442
|
if not yes:
|
|
4300
4443
|
click.secho(
|
|
4301
4444
|
f'Managed job {dag.name!r} will be launched on (estimated):',
|
|
4302
4445
|
fg='yellow')
|
|
4303
4446
|
|
|
4304
|
-
request_id = managed_jobs.launch(dag,
|
|
4447
|
+
request_id = managed_jobs.launch(dag,
|
|
4448
|
+
name,
|
|
4449
|
+
pool,
|
|
4450
|
+
num_jobs,
|
|
4451
|
+
_need_confirmation=not yes)
|
|
4305
4452
|
job_id_handle = _async_call_or_wait(request_id, async_call,
|
|
4306
4453
|
'sky.jobs.launch')
|
|
4454
|
+
|
|
4307
4455
|
if not async_call and not detach_run:
|
|
4308
|
-
|
|
4309
|
-
|
|
4310
|
-
|
|
4311
|
-
|
|
4312
|
-
|
|
4313
|
-
|
|
4456
|
+
job_ids = job_id_handle[0]
|
|
4457
|
+
if isinstance(job_ids, int) or len(job_ids) == 1:
|
|
4458
|
+
job_id = job_ids if isinstance(job_ids, int) else job_ids[0]
|
|
4459
|
+
returncode = managed_jobs.tail_logs(name=None,
|
|
4460
|
+
job_id=job_id,
|
|
4461
|
+
follow=True,
|
|
4462
|
+
controller=False)
|
|
4463
|
+
sys.exit(returncode)
|
|
4464
|
+
else:
|
|
4465
|
+
job_ids_str = _format_job_ids_str(job_ids)
|
|
4466
|
+
click.secho(
|
|
4467
|
+
f'Jobs submitted with IDs: {colorama.Fore.CYAN}'
|
|
4468
|
+
f'{job_ids_str}{colorama.Style.RESET_ALL}.'
|
|
4469
|
+
f'\n📋 Useful Commands'
|
|
4470
|
+
f'\n{ux_utils.INDENT_SYMBOL}To stream job logs:\t\t\t'
|
|
4471
|
+
f'{ux_utils.BOLD}sky jobs logs <job-id>'
|
|
4472
|
+
f'{ux_utils.RESET_BOLD}'
|
|
4473
|
+
f'\n{ux_utils.INDENT_SYMBOL}To stream controller logs:\t\t'
|
|
4474
|
+
f'{ux_utils.BOLD}sky jobs logs --controller <job-id>'
|
|
4475
|
+
f'{ux_utils.RESET_BOLD}'
|
|
4476
|
+
f'\n{ux_utils.INDENT_LAST_SYMBOL}To cancel all jobs on the '
|
|
4477
|
+
f'pool:\t{ux_utils.BOLD}sky jobs cancel --pool {pool}'
|
|
4478
|
+
f'{ux_utils.RESET_BOLD}')
|
|
4314
4479
|
|
|
4315
4480
|
|
|
4316
4481
|
@jobs.command('queue', cls=_DocumentedCodeCommand)
|
|
@@ -4420,14 +4585,25 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
|
|
|
4420
4585
|
required=False,
|
|
4421
4586
|
type=str,
|
|
4422
4587
|
help='Managed job name to cancel.')
|
|
4588
|
+
@click.option('--pool',
|
|
4589
|
+
'-p',
|
|
4590
|
+
required=False,
|
|
4591
|
+
type=str,
|
|
4592
|
+
help='Pool name to cancel.')
|
|
4423
4593
|
@click.argument('job_ids', default=None, type=int, required=False, nargs=-1)
|
|
4424
4594
|
@flags.all_option('Cancel all managed jobs for the current user.')
|
|
4425
4595
|
@flags.yes_option()
|
|
4426
4596
|
@flags.all_users_option('Cancel all managed jobs from all users.')
|
|
4427
4597
|
@usage_lib.entrypoint
|
|
4428
4598
|
# pylint: disable=redefined-builtin
|
|
4429
|
-
def jobs_cancel(
|
|
4430
|
-
|
|
4599
|
+
def jobs_cancel(
|
|
4600
|
+
name: Optional[str],
|
|
4601
|
+
pool: Optional[str], # pylint: disable=redefined-outer-name
|
|
4602
|
+
job_ids: Tuple[int],
|
|
4603
|
+
all: bool,
|
|
4604
|
+
yes: bool,
|
|
4605
|
+
all_users: bool,
|
|
4606
|
+
):
|
|
4431
4607
|
"""Cancel managed jobs.
|
|
4432
4608
|
|
|
4433
4609
|
You can provide either a job name or a list of job IDs to be cancelled.
|
|
@@ -4442,22 +4618,29 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
|
|
|
4442
4618
|
\b
|
|
4443
4619
|
# Cancel managed jobs with IDs 1, 2, 3
|
|
4444
4620
|
$ sky jobs cancel 1 2 3
|
|
4621
|
+
\b
|
|
4622
|
+
# Cancel all managed jobs in pool 'my-pool'
|
|
4623
|
+
$ sky jobs cancel -p my-pool
|
|
4445
4624
|
"""
|
|
4446
4625
|
job_id_str = ','.join(map(str, job_ids))
|
|
4447
|
-
if sum([
|
|
4626
|
+
if sum([
|
|
4627
|
+
bool(job_ids), name is not None, pool is not None, all or all_users
|
|
4628
|
+
]) != 1:
|
|
4448
4629
|
arguments = []
|
|
4449
4630
|
arguments += [f'--job-ids {job_id_str}'] if job_ids else []
|
|
4450
4631
|
arguments += [f'--name {name}'] if name is not None else []
|
|
4632
|
+
arguments += [f'--pool {pool}'] if pool is not None else []
|
|
4451
4633
|
arguments += ['--all'] if all else []
|
|
4452
4634
|
arguments += ['--all-users'] if all_users else []
|
|
4453
4635
|
raise click.UsageError(
|
|
4454
|
-
'Can only specify one of JOB_IDS, --name, or
|
|
4455
|
-
f'Provided {" ".join(arguments)!r}.')
|
|
4636
|
+
'Can only specify one of JOB_IDS, --name, --pool, or '
|
|
4637
|
+
f'--all/--all-users. Provided {" ".join(arguments)!r}.')
|
|
4456
4638
|
|
|
4457
4639
|
if not yes:
|
|
4458
4640
|
plural = 's' if len(job_ids) > 1 else ''
|
|
4459
4641
|
job_identity_str = (f'managed job{plural} with ID{plural} {job_id_str}'
|
|
4460
|
-
if job_ids else
|
|
4642
|
+
if job_ids else f'{name!r}' if name is not None else
|
|
4643
|
+
f'managed jobs in pool {pool!r}')
|
|
4461
4644
|
if all_users:
|
|
4462
4645
|
job_identity_str = 'all managed jobs FOR ALL USERS'
|
|
4463
4646
|
elif all:
|
|
@@ -4470,6 +4653,7 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
|
|
|
4470
4653
|
sdk.stream_and_get(
|
|
4471
4654
|
managed_jobs.cancel(job_ids=job_ids,
|
|
4472
4655
|
name=name,
|
|
4656
|
+
pool=pool,
|
|
4473
4657
|
all=all,
|
|
4474
4658
|
all_users=all_users))
|
|
4475
4659
|
|
|
@@ -4547,24 +4731,47 @@ def jobs_dashboard():
|
|
|
4547
4731
|
sdk.dashboard(starting_page='jobs')
|
|
4548
4732
|
|
|
4549
4733
|
|
|
4550
|
-
@
|
|
4551
|
-
|
|
4552
|
-
|
|
4553
|
-
def dashboard() -> None:
|
|
4554
|
-
"""Starts the dashboard for skypilot."""
|
|
4555
|
-
sdk.dashboard()
|
|
4556
|
-
|
|
4557
|
-
|
|
4558
|
-
@cli.group(cls=_NaturalOrderGroup)
|
|
4559
|
-
def serve():
|
|
4560
|
-
"""SkyServe CLI (multi-region, multi-cloud serving)."""
|
|
4734
|
+
@jobs.group(cls=_NaturalOrderGroup)
|
|
4735
|
+
def pool():
|
|
4736
|
+
"""(Experimental) Pool management commands."""
|
|
4561
4737
|
pass
|
|
4562
4738
|
|
|
4563
4739
|
|
|
4564
|
-
|
|
4565
|
-
|
|
4566
|
-
|
|
4740
|
+
# TODO(MaoZiming): Update Doc.
|
|
4741
|
+
# TODO(MaoZiming): Expose mix replica traffic option to user.
|
|
4742
|
+
# Currently, we do not mix traffic from old and new replicas.
|
|
4743
|
+
@pool.command('apply', cls=_DocumentedCodeCommand)
|
|
4744
|
+
@flags.config_option(expose_value=False)
|
|
4745
|
+
@click.argument('pool_yaml',
|
|
4746
|
+
required=True,
|
|
4747
|
+
type=str,
|
|
4748
|
+
nargs=-1,
|
|
4749
|
+
**_get_shell_complete_args(_complete_file_name))
|
|
4750
|
+
@click.option('--pool-name',
|
|
4751
|
+
'-p',
|
|
4752
|
+
default=None,
|
|
4753
|
+
type=str,
|
|
4754
|
+
help='A pool name. Unique for each pool. If not provided, '
|
|
4755
|
+
'a unique name is autogenerated.')
|
|
4756
|
+
@click.option('--mode',
|
|
4757
|
+
default=serve_lib.DEFAULT_UPDATE_MODE.value,
|
|
4758
|
+
type=click.Choice([m.value for m in serve_lib.UpdateMode],
|
|
4759
|
+
case_sensitive=False),
|
|
4760
|
+
required=False,
|
|
4761
|
+
help=('Update mode. If "rolling", cluster pool will be updated '
|
|
4762
|
+
'with rolling update. If "blue_green", cluster pool will '
|
|
4763
|
+
'be updated with blue-green update. This option is only '
|
|
4764
|
+
'valid when the pool is already running.'))
|
|
4765
|
+
@_add_click_options(flags.TASK_OPTIONS + flags.EXTRA_RESOURCES_OPTIONS +
|
|
4766
|
+
flags.COMMON_OPTIONS)
|
|
4767
|
+
@flags.yes_option()
|
|
4768
|
+
@timeline.event
|
|
4769
|
+
@usage_lib.entrypoint
|
|
4770
|
+
def jobs_pool_apply(
|
|
4771
|
+
pool_yaml: Tuple[str, ...],
|
|
4772
|
+
pool_name: Optional[str],
|
|
4567
4773
|
workdir: Optional[str],
|
|
4774
|
+
infra: Optional[str],
|
|
4568
4775
|
cloud: Optional[str],
|
|
4569
4776
|
region: Optional[str],
|
|
4570
4777
|
zone: Optional[str],
|
|
@@ -4573,21 +4780,196 @@ def _generate_task_with_service(
|
|
|
4573
4780
|
image_id: Optional[str],
|
|
4574
4781
|
env_file: Optional[Dict[str, str]],
|
|
4575
4782
|
env: List[Tuple[str, str]],
|
|
4576
|
-
secret:
|
|
4783
|
+
secret: List[Tuple[str, str]],
|
|
4577
4784
|
gpus: Optional[str],
|
|
4578
4785
|
instance_type: Optional[str],
|
|
4579
|
-
ports:
|
|
4786
|
+
ports: Tuple[str],
|
|
4580
4787
|
cpus: Optional[str],
|
|
4581
4788
|
memory: Optional[str],
|
|
4582
4789
|
disk_size: Optional[int],
|
|
4583
4790
|
disk_tier: Optional[str],
|
|
4584
4791
|
network_tier: Optional[str],
|
|
4585
|
-
|
|
4792
|
+
mode: str,
|
|
4793
|
+
yes: bool,
|
|
4794
|
+
async_call: bool,
|
|
4795
|
+
):
|
|
4796
|
+
"""Apply a config to a cluster pool for managed jobs submission.
|
|
4797
|
+
|
|
4798
|
+
If the pool is already running, the config will be applied to the pool.
|
|
4799
|
+
Otherwise, a new pool will be created.
|
|
4800
|
+
|
|
4801
|
+
POOL_YAML must point to a valid YAML file.
|
|
4802
|
+
"""
|
|
4803
|
+
cloud, region, zone = _handle_infra_cloud_region_zone_options(
|
|
4804
|
+
infra, cloud, region, zone)
|
|
4805
|
+
if pool_name is None:
|
|
4806
|
+
pool_name = serve_lib.generate_service_name(pool=True)
|
|
4807
|
+
|
|
4808
|
+
task = _generate_task_with_service(
|
|
4809
|
+
service_name=pool_name,
|
|
4810
|
+
service_yaml_args=pool_yaml,
|
|
4811
|
+
workdir=workdir,
|
|
4812
|
+
cloud=cloud,
|
|
4813
|
+
region=region,
|
|
4814
|
+
zone=zone,
|
|
4815
|
+
gpus=gpus,
|
|
4816
|
+
cpus=cpus,
|
|
4817
|
+
memory=memory,
|
|
4818
|
+
instance_type=instance_type,
|
|
4819
|
+
num_nodes=num_nodes,
|
|
4820
|
+
use_spot=use_spot,
|
|
4821
|
+
image_id=image_id,
|
|
4822
|
+
env_file=env_file,
|
|
4823
|
+
env=env,
|
|
4824
|
+
secret=secret,
|
|
4825
|
+
disk_size=disk_size,
|
|
4826
|
+
disk_tier=disk_tier,
|
|
4827
|
+
network_tier=network_tier,
|
|
4828
|
+
ports=ports,
|
|
4829
|
+
not_supported_cmd='sky jobs pool up',
|
|
4830
|
+
pool=True,
|
|
4831
|
+
)
|
|
4832
|
+
assert task.service is not None
|
|
4833
|
+
if not task.service.pool:
|
|
4834
|
+
raise click.UsageError('The YAML file needs a `pool` section.')
|
|
4835
|
+
click.secho('Pool spec:', fg='cyan')
|
|
4836
|
+
click.echo(task.service)
|
|
4837
|
+
serve_lib.validate_service_task(task, pool=True)
|
|
4838
|
+
|
|
4839
|
+
click.secho(
|
|
4840
|
+
'Each pool worker will use the following resources (estimated):',
|
|
4841
|
+
fg='cyan')
|
|
4842
|
+
with sky.Dag() as dag:
|
|
4843
|
+
dag.add(task)
|
|
4844
|
+
|
|
4845
|
+
request_id = managed_jobs.pool_apply(task,
|
|
4846
|
+
pool_name,
|
|
4847
|
+
mode=serve_lib.UpdateMode(mode),
|
|
4848
|
+
_need_confirmation=not yes)
|
|
4849
|
+
_async_call_or_wait(request_id, async_call, 'sky.jobs.pool_apply')
|
|
4850
|
+
|
|
4851
|
+
|
|
4852
|
+
@pool.command('status', cls=_DocumentedCodeCommand)
|
|
4853
|
+
@flags.config_option(expose_value=False)
|
|
4854
|
+
@flags.verbose_option()
|
|
4855
|
+
@click.argument('pool_names', required=False, type=str, nargs=-1)
|
|
4856
|
+
@usage_lib.entrypoint
|
|
4857
|
+
# pylint: disable=redefined-builtin
|
|
4858
|
+
def jobs_pool_status(verbose: bool, pool_names: List[str]):
|
|
4859
|
+
"""Show statuses of cluster pools.
|
|
4860
|
+
|
|
4861
|
+
Show detailed statuses of one or more pools. If POOL_NAME is not
|
|
4862
|
+
provided, show all pools' status.
|
|
4863
|
+
"""
|
|
4864
|
+
pool_names_to_query: Optional[List[str]] = pool_names
|
|
4865
|
+
if not pool_names:
|
|
4866
|
+
pool_names_to_query = None
|
|
4867
|
+
with rich_utils.client_status('[cyan]Checking pools[/]'):
|
|
4868
|
+
pool_status_request_id = managed_jobs.pool_status(pool_names_to_query)
|
|
4869
|
+
_, msg = _handle_services_request(pool_status_request_id,
|
|
4870
|
+
service_names=pool_names_to_query,
|
|
4871
|
+
show_all=verbose,
|
|
4872
|
+
show_endpoint=False,
|
|
4873
|
+
pool=True,
|
|
4874
|
+
is_called_by_user=True)
|
|
4875
|
+
|
|
4876
|
+
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
|
4877
|
+
f'Pools{colorama.Style.RESET_ALL}')
|
|
4878
|
+
click.echo(msg)
|
|
4879
|
+
|
|
4880
|
+
|
|
4881
|
+
@pool.command('down', cls=_DocumentedCodeCommand)
|
|
4882
|
+
@flags.config_option(expose_value=False)
|
|
4883
|
+
@click.argument('pool_names', required=False, type=str, nargs=-1)
|
|
4884
|
+
@flags.all_option('Delete all pools.')
|
|
4885
|
+
@click.option('--purge',
|
|
4886
|
+
'-p',
|
|
4887
|
+
default=False,
|
|
4888
|
+
is_flag=True,
|
|
4889
|
+
help='Tear down pools in failed status.')
|
|
4890
|
+
@flags.yes_option()
|
|
4891
|
+
@_add_click_options(flags.COMMON_OPTIONS)
|
|
4892
|
+
@usage_lib.entrypoint
|
|
4893
|
+
# pylint: disable=redefined-builtin
|
|
4894
|
+
def jobs_pool_down(
|
|
4895
|
+
pool_names: List[str],
|
|
4896
|
+
all: bool,
|
|
4897
|
+
purge: bool,
|
|
4898
|
+
yes: bool,
|
|
4899
|
+
async_call: bool,
|
|
4900
|
+
) -> None:
|
|
4901
|
+
"""Delete pool(s).
|
|
4902
|
+
|
|
4903
|
+
POOL_NAMES is the name of the pool (or glob pattern) to delete. If
|
|
4904
|
+
both POOL_NAMES and ``--all`` are supplied, the latter takes precedence.
|
|
4905
|
+
|
|
4906
|
+
Deleting a pool will delete all of its workers and associated resources.
|
|
4907
|
+
"""
|
|
4908
|
+
if sum([bool(pool_names), all]) != 1:
|
|
4909
|
+
argument_str = (f'POOL_NAMES={",".join(pool_names)}'
|
|
4910
|
+
if pool_names else '')
|
|
4911
|
+
argument_str += ' --all' if all else ''
|
|
4912
|
+
raise click.UsageError('Can only specify one of POOL_NAMES or --all. '
|
|
4913
|
+
f'Provided {argument_str!r}.')
|
|
4914
|
+
|
|
4915
|
+
if not yes:
|
|
4916
|
+
quoted_pool_names = [f'{name!r}' for name in pool_names]
|
|
4917
|
+
list_pool_str = ', '.join(quoted_pool_names)
|
|
4918
|
+
pool_identity_str = f'pool(s) {list_pool_str}'
|
|
4919
|
+
if all:
|
|
4920
|
+
pool_identity_str = 'all pools'
|
|
4921
|
+
click.confirm(f'Terminating {pool_identity_str}. Proceed?',
|
|
4922
|
+
default=True,
|
|
4923
|
+
abort=True,
|
|
4924
|
+
show_default=True)
|
|
4925
|
+
|
|
4926
|
+
request_id = managed_jobs.pool_down(pool_names, all=all, purge=purge)
|
|
4927
|
+
_async_call_or_wait(request_id, async_call, 'sky.jobs.pool_down')
|
|
4928
|
+
|
|
4929
|
+
|
|
@cli.command(cls=_DocumentedCodeCommand)
@flags.config_option(expose_value=False)
@usage_lib.entrypoint
def dashboard() -> None:
    """Starts the dashboard for skypilot."""
    # Thin CLI wrapper: all of the work is delegated to the SDK entry point.
    sdk.dashboard()
4936
|
+
|
|
4937
|
+
|
|
@cli.group(cls=_NaturalOrderGroup)
def serve():
    """SkyServe CLI (multi-region, multi-cloud serving)."""
    # Group container only — subcommands attach themselves via
    # @serve.command(...) decorators elsewhere in this file.  The docstring
    # doubles as the group's --help text, so it must stay user-facing.
    pass
4942
|
+
|
|
4943
|
+
|
|
4944
|
+
def _generate_task_with_service(
|
|
4945
|
+
service_name: str,
|
|
4946
|
+
service_yaml_args: Tuple[str, ...],
|
|
4947
|
+
workdir: Optional[str],
|
|
4948
|
+
cloud: Optional[str],
|
|
4949
|
+
region: Optional[str],
|
|
4950
|
+
zone: Optional[str],
|
|
4951
|
+
num_nodes: Optional[int],
|
|
4952
|
+
use_spot: Optional[bool],
|
|
4953
|
+
image_id: Optional[str],
|
|
4954
|
+
env_file: Optional[Dict[str, str]],
|
|
4955
|
+
env: List[Tuple[str, str]],
|
|
4956
|
+
secret: Optional[List[Tuple[str, str]]],
|
|
4957
|
+
gpus: Optional[str],
|
|
4958
|
+
instance_type: Optional[str],
|
|
4959
|
+
ports: Optional[Tuple[str]],
|
|
4960
|
+
cpus: Optional[str],
|
|
4961
|
+
memory: Optional[str],
|
|
4962
|
+
disk_size: Optional[int],
|
|
4963
|
+
disk_tier: Optional[str],
|
|
4964
|
+
network_tier: Optional[str],
|
|
4965
|
+
not_supported_cmd: str,
|
|
4966
|
+
pool: bool, # pylint: disable=redefined-outer-name
|
|
4586
4967
|
) -> sky.Task:
|
|
4587
4968
|
"""Generate a task with service section from a service YAML file."""
|
|
4588
4969
|
is_yaml, _ = _check_yaml(''.join(service_yaml_args))
|
|
4970
|
+
yaml_name = 'SERVICE_YAML' if not pool else 'POOL_YAML'
|
|
4589
4971
|
if not is_yaml:
|
|
4590
|
-
raise click.UsageError('
|
|
4972
|
+
raise click.UsageError(f'{yaml_name} must be a valid YAML file.')
|
|
4591
4973
|
env = _merge_env_vars(env_file, env)
|
|
4592
4974
|
# We keep nargs=-1 in service_yaml argument to reuse this function.
|
|
4593
4975
|
task = _make_task_or_dag_from_entrypoint_with_overrides(
|
|
@@ -4617,9 +4999,17 @@ def _generate_task_with_service(
|
|
|
4617
4999
|
_DAG_NOT_SUPPORTED_MESSAGE.format(command=not_supported_cmd))
|
|
4618
5000
|
|
|
4619
5001
|
if task.service is None:
|
|
5002
|
+
field_name = 'service' if not pool else 'pool'
|
|
4620
5003
|
with ux_utils.print_exception_no_traceback():
|
|
4621
|
-
raise ValueError('
|
|
4622
|
-
'To fix, add a valid
|
|
5004
|
+
raise ValueError(f'{field_name.capitalize()} section not found '
|
|
5005
|
+
'in the YAML file. To fix, add a valid '
|
|
5006
|
+
f'`{field_name}` field.')
|
|
5007
|
+
|
|
5008
|
+
if task.service.pool:
|
|
5009
|
+
if task.service.ports is not None or ports:
|
|
5010
|
+
with ux_utils.print_exception_no_traceback():
|
|
5011
|
+
raise ValueError('Cannot specify ports in a cluster pool.')
|
|
5012
|
+
return task
|
|
4623
5013
|
|
|
4624
5014
|
# NOTE(yi): we only allow one service port now.
|
|
4625
5015
|
service_port: Optional[int] = int(
|
|
@@ -4779,10 +5169,14 @@ def serve_up(
|
|
|
4779
5169
|
network_tier=network_tier,
|
|
4780
5170
|
ports=ports,
|
|
4781
5171
|
not_supported_cmd='sky serve up',
|
|
5172
|
+
pool=False,
|
|
4782
5173
|
)
|
|
5174
|
+
assert task.service is not None
|
|
5175
|
+
if task.service.pool:
|
|
5176
|
+
raise click.UsageError('The YAML file needs a `service` section.')
|
|
4783
5177
|
click.secho('Service spec:', fg='cyan')
|
|
4784
5178
|
click.echo(task.service)
|
|
4785
|
-
serve_lib.validate_service_task(task)
|
|
5179
|
+
serve_lib.validate_service_task(task, pool=False)
|
|
4786
5180
|
|
|
4787
5181
|
click.secho('Each replica will use the following resources (estimated):',
|
|
4788
5182
|
fg='cyan')
|
|
@@ -4881,10 +5275,11 @@ def serve_update(
|
|
|
4881
5275
|
network_tier=network_tier,
|
|
4882
5276
|
ports=ports,
|
|
4883
5277
|
not_supported_cmd='sky serve update',
|
|
5278
|
+
pool=False,
|
|
4884
5279
|
)
|
|
4885
5280
|
click.secho('Service spec:', fg='cyan')
|
|
4886
5281
|
click.echo(task.service)
|
|
4887
|
-
serve_lib.validate_service_task(task)
|
|
5282
|
+
serve_lib.validate_service_task(task, pool=False)
|
|
4888
5283
|
|
|
4889
5284
|
click.secho('New replica will use the following resources (estimated):',
|
|
4890
5285
|
fg='cyan')
|