skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +448 -60
- sky/client/common.py +12 -9
- sky/clouds/nebius.py +1 -1
- sky/clouds/utils/gcp_utils.py +1 -1
- sky/clouds/vast.py +1 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
- sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
- sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
- sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
- sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
- sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
- sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
- sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
- sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
- sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
- sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +25 -0
- sky/data/storage.py +1219 -1775
- sky/global_user_state.py +18 -8
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +164 -31
- sky/jobs/utils.py +144 -68
- sky/logs/aws.py +4 -2
- sky/provision/kubernetes/utils.py +6 -4
- sky/provision/nebius/constants.py +3 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vast/utils.py +9 -6
- sky/py.typed +0 -0
- sky/resources.py +24 -14
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +18 -1
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/server/server.py +6 -1
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +24 -19
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
- sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
- sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
- sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
- sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
- sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
- sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
- sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
- sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
- sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
- sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
- sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
- sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
- sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
- sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
- sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
- /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py
CHANGED
@@ -272,6 +272,65 @@ def _merge_env_vars(env_dict: Optional[Dict[str, str]],
     return list(env_dict.items())
 
 
+def _format_job_ids_str(job_ids: List[int], max_length: int = 30) -> str:
+    """Format job IDs string with ellipsis if too long.
+
+    Args:
+        job_ids: List of job IDs to format.
+        max_length: Maximum length of the output string.
+
+    Returns:
+        Formatted string like "11,12,...,2017,2018" if truncated,
+        or the full string if it fits within max_length.
+    """
+    if not job_ids:
+        return ''
+
+    # Convert all to strings
+    job_strs = [str(job_id) for job_id in job_ids]
+    full_str = ','.join(job_strs)
+
+    # If it fits, return as is
+    if len(full_str) <= max_length:
+        return full_str
+
+    if len(job_strs) <= 2:
+        return full_str  # Can't truncate further
+
+    # Need to truncate with ellipsis
+    ellipsis = '...'
+
+    # Start with minimum: first and last
+    start_count = 1
+    end_count = 1
+
+    while start_count + end_count < len(job_strs):
+        # Try adding one more to start
+        if start_count + 1 + end_count < len(job_strs):
+            start_part = ','.join(job_strs[:start_count + 1])
+            end_part = ','.join(job_strs[-end_count:])
+            candidate = f'{start_part},{ellipsis},{end_part}'
+            if len(candidate) <= max_length:
+                start_count += 1
+                continue
+
+        # Try adding one more to end
+        if start_count + end_count + 1 < len(job_strs):
+            start_part = ','.join(job_strs[:start_count])
+            end_part = ','.join(job_strs[-(end_count + 1):])
+            candidate = f'{start_part},{ellipsis},{end_part}'
+            if len(candidate) <= max_length:
+                end_count += 1
+                continue
+
+        # Can't add more
+        break
+
+    start_part = ','.join(job_strs[:start_count])
+    end_part = ','.join(job_strs[-end_count:])
+    return f'{start_part},{ellipsis},{end_part}'
+
+
 def _complete_cluster_name(ctx: click.Context, param: click.Parameter,
                            incomplete: str) -> List[str]:
     """Handle shell completion for cluster names."""
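For reference, a minimal sketch of how the new helper behaves (the import path follows the file above; the outputs follow the greedy truncation loop shown in the hunk):

    from sky.client.cli.command import _format_job_ids_str

    # Short lists are returned verbatim.
    print(_format_job_ids_str([1, 2, 3]))  # '1,2,3'

    # Long lists keep leading IDs greedily, then trailing IDs, until the
    # next addition would exceed max_length (30 by default).
    print(_format_job_ids_str(list(range(11, 2019))))
    # '11,12,13,14,15,16,17,...,2018'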
@@ -1428,17 +1487,20 @@ def _handle_jobs_queue_request(
 
 
 def _handle_services_request(
-
-
-
-
-
+        request_id: str,
+        service_names: Optional[List[str]],
+        show_all: bool,
+        show_endpoint: bool,
+        pool: bool = False,  # pylint: disable=redefined-outer-name
+        is_called_by_user: bool = False
+) -> Tuple[Optional[int], str]:
    """Get service statuses.
 
    Args:
        service_names: If not None, only show the statuses of these services.
        show_all: Show all information of each service.
        show_endpoint: If True, only show the endpoint of the service.
+       pool: If True, the request is for a pool. Otherwise for a service.
        is_called_by_user: If this function is called by user directly, or an
            internal call.
 
@@ -1447,6 +1509,7 @@ def _handle_services_request(
    is an error when querying the services. In this case, msg contains the
    error message. Otherwise, msg contains the formatted service table.
    """
+    noun = 'pool' if pool else 'service'
    num_services = None
    try:
        if not is_called_by_user:
@@ -1483,11 +1546,11 @@ def _handle_services_request(
                # print the original error.
                pass
        if not msg:
-            msg = ('Failed to fetch
+            msg = (f'Failed to fetch {noun} statuses due to connection issues. '
                   'Please try again later. Details: '
                   f'{common_utils.format_exception(e, use_bracket=True)}')
    except Exception as e:  # pylint: disable=broad-except
-        msg = ('Failed to fetch
+        msg = (f'Failed to fetch {noun} statuses: '
               f'{common_utils.format_exception(e, use_bracket=True)}')
    else:
        if show_endpoint:
@@ -1502,14 +1565,16 @@ def _handle_services_request(
            endpoint = service_records[0]['endpoint']
            msg = '-' if endpoint is None else endpoint
        else:
-            msg = serve_lib.format_service_table(service_records, show_all
+            msg = serve_lib.format_service_table(service_records, show_all,
+                                                 pool)
            service_not_found_msg = ''
            if service_names is not None:
                for service_name in service_names:
                    if not any(service_name == record['name']
                               for record in service_records):
                        service_not_found_msg += (
-                            f'\
+                            f'\n{noun.capitalize()} '
+                            f'{service_name!r} not found.')
            if service_not_found_msg:
                msg += f'\n{service_not_found_msg}'
    return num_services, msg
@@ -1665,6 +1730,11 @@ def _show_enabled_infra(active_workspace: str, show_workspace: bool):
              is_flag=True,
              required=False,
              help='Also show sky serve services, if any.')
+@click.option('--show-pools/--no-show-pools',
+              default=True,
+              is_flag=True,
+              required=False,
+              help='Also show cluster pools, if any.')
 @click.option(
     '--kubernetes',
     '--k8s',
@@ -1684,8 +1754,8 @@ def _show_enabled_infra(active_workspace: str, show_workspace: bool):
 # pylint: disable=redefined-builtin
 def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
            endpoint: Optional[int], show_managed_jobs: bool,
-           show_services: bool,
-           all_users: bool):
+           show_services: bool, show_pools: bool, kubernetes: bool,
+           clusters: List[str], all_users: bool):
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Show clusters.
 
@@ -1807,6 +1877,9 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
    def submit_services() -> Optional[str]:
        return serve_lib.status(service_names=None)
 
+    def submit_pools() -> Optional[str]:
+        return managed_jobs.pool_status(pool_names=None)
+
    def submit_workspace() -> Optional[str]:
        try:
            return sdk.workspaces()
@@ -1823,6 +1896,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
    managed_jobs_queue_request_id = None
    service_status_request_id = None
    workspace_request_id = None
+    pool_status_request_id = None
 
    # Submit all requests in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
@@ -1830,6 +1904,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
        managed_jobs_request_future = executor.submit(submit_managed_jobs)
        if show_services:
            services_request_future = executor.submit(submit_services)
+        if show_pools:
+            pools_request_future = executor.submit(submit_pools)
        if not (ip or show_endpoints):
            workspace_request_future = executor.submit(submit_workspace)
 
@@ -1838,13 +1914,17 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
        managed_jobs_queue_request_id = managed_jobs_request_future.result()
        if show_services:
            service_status_request_id = services_request_future.result()
+        if show_pools:
+            pool_status_request_id = pools_request_future.result()
        if not (ip or show_endpoints):
            workspace_request_id = workspace_request_future.result()
 
-    managed_jobs_queue_request_id = '' if not managed_jobs_queue_request_id
-
-    service_status_request_id = '' if not service_status_request_id
-
+    managed_jobs_queue_request_id = ('' if not managed_jobs_queue_request_id
+                                     else managed_jobs_queue_request_id)
+    service_status_request_id = ('' if not service_status_request_id else
+                                 service_status_request_id)
+    pool_status_request_id = ('' if not pool_status_request_id else
+                              pool_status_request_id)
 
    # Phase 3: Get cluster records and handle special cases
    cluster_records = _get_cluster_records_and_set_ssh_config(
@@ -1919,7 +1999,34 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
                job_info += '. '
            hints.append(
                controller_utils.Controllers.JOBS_CONTROLLER.value.
-                in_progress_hint.format(job_info=job_info))
+                in_progress_hint(False).format(job_info=job_info))
+
+    if show_pools:
+        num_pools = None
+        if managed_jobs_query_interrupted:
+            msg = 'KeyboardInterrupt'
+        else:
+            with rich_utils.client_status('[cyan]Checking pools[/]'):
+                try:
+                    num_pools, msg = _handle_services_request(
+                        pool_status_request_id,
+                        service_names=None,
+                        show_all=False,
+                        show_endpoint=False,
+                        pool=True,
+                        is_called_by_user=False)
+                except KeyboardInterrupt:
+                    sdk.api_cancel(pool_status_request_id, silent=True)
+                    num_pools = -1
+                    msg = 'KeyboardInterrupt'
+        if num_pools is not None:
+            if num_pools > 0:
+                click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                           f'Pools{colorama.Style.RESET_ALL}')
+                click.echo(msg)
+            hints.append(
+                controller_utils.Controllers.SKY_SERVE_CONTROLLER.value.
+                in_progress_hint(True))
 
    if show_services:
        click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
@@ -1942,8 +2049,9 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
                msg = 'KeyboardInterrupt'
        click.echo(msg)
        if num_services is not None:
-            hints.append(
-
+            hints.append(
+                controller_utils.Controllers.SKY_SERVE_CONTROLLER.value.
+                in_progress_hint(False))
 
    if num_pending_autostop > 0 and not refresh:
        # Don't print this hint if there's no pending autostop or user has
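Taken together, the `status` hunks above wire pools into `sky status` alongside managed jobs and services; a hypothetical invocation of the new flag (flag names taken from the option added above) would look like:

    $ sky status                   # pools are queried by default (--show-pools)
    $ sky status --no-show-pools   # skip the extra pool-status request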
@@ -3427,13 +3535,6 @@ def show_gpus(
    num_filtered_contexts = 0
 
    if realtime_gpu_availability_lists:
-        if len(realtime_gpu_availability_lists[0]) != 2:
-            # TODO(kyuds): for backwards compatibility, as we add new
-            # context to the API server response in #5362. Remove this after
-            # 0.10.0.
-            realtime_gpu_availability_lists = [
-                (context, realtime_gpu_availability_lists)
-            ]
        for (ctx, availability_list) in realtime_gpu_availability_lists:
            if not _filter_ctx(ctx):
                continue
@@ -4200,6 +4301,17 @@ def jobs():
              is_flag=True,
              help=('If True, as soon as a job is submitted, return from this call '
                    'and do not stream execution logs.'))
+@click.option('--pool',
+              '-p',
+              default=None,
+              type=str,
+              required=False,
+              help='(Experimental; optional) Pool to use for jobs submission.')
+@click.option('--num-jobs',
+              default=None,
+              type=int,
+              required=False,
+              help='Number of jobs to submit.')
 @click.option('--git-url', type=str, help='Git repository URL.')
 @click.option('--git-ref',
              type=str,
@@ -4233,6 +4345,8 @@ def jobs_launch(
    ports: Tuple[str],
    detach_run: bool,
    yes: bool,
+    pool: Optional[str],  # pylint: disable=redefined-outer-name
+    num_jobs: Optional[int],
    async_call: bool,
    config_override: Optional[Dict[str, Any]] = None,
    git_url: Optional[str] = None,
@@ -4252,6 +4366,9 @@ def jobs_launch(
 
      sky jobs launch 'echo hello!'
    """
+    if pool is None and num_jobs is not None:
+        raise click.UsageError('Cannot specify --num-jobs without --pool.')
+
    if cluster is not None:
        if name is not None and name != cluster:
            raise click.UsageError('Cannot specify both --name and --cluster. '
@@ -4302,22 +4419,63 @@ def jobs_launch(
 
        common_utils.check_cluster_name_is_valid(name)
 
+    if pool is not None:
+        num_job_int = num_jobs if num_jobs is not None else 1
+        plural = '' if num_job_int == 1 else 's'
+        click.secho(f'Submitting to pool {colorama.Fore.CYAN}{pool!r}'
+                    f'{colorama.Style.RESET_ALL} with {colorama.Fore.CYAN}'
+                    f'{num_job_int}{colorama.Style.RESET_ALL} job{plural}.')
+        print_setup_fm_warning = False
+        for task_ in dag.tasks:
+            if (task_.setup is not None or task_.file_mounts or
+                    task_.storage_mounts):
+                print_setup_fm_warning = True
+                break
+        if print_setup_fm_warning:
+            click.secho(
+                f'{colorama.Fore.YELLOW}setup/file_mounts/storage_mounts'
+                ' will be ignored in pool. To update a pool, please '
+                f'use `sky pool apply {pool} pool.yaml`. '
+                f'{colorama.Style.RESET_ALL}')
+
    # Optimize info is only show if _need_confirmation.
    if not yes:
        click.secho(
            f'Managed job {dag.name!r} will be launched on (estimated):',
            fg='yellow')
 
-    request_id = managed_jobs.launch(dag,
+    request_id = managed_jobs.launch(dag,
+                                     name,
+                                     pool,
+                                     num_jobs,
+                                     _need_confirmation=not yes)
    job_id_handle = _async_call_or_wait(request_id, async_call,
                                        'sky.jobs.launch')
+
    if not async_call and not detach_run:
-
-
-
-
-
-
+        job_ids = job_id_handle[0]
+        if isinstance(job_ids, int) or len(job_ids) == 1:
+            job_id = job_ids if isinstance(job_ids, int) else job_ids[0]
+            returncode = managed_jobs.tail_logs(name=None,
+                                                job_id=job_id,
+                                                follow=True,
+                                                controller=False)
+            sys.exit(returncode)
+        else:
+            job_ids_str = _format_job_ids_str(job_ids)
+            click.secho(
+                f'Jobs submitted with IDs: {colorama.Fore.CYAN}'
+                f'{job_ids_str}{colorama.Style.RESET_ALL}.'
+                f'\n📋 Useful Commands'
+                f'\n{ux_utils.INDENT_SYMBOL}To stream job logs:\t\t\t'
+                f'{ux_utils.BOLD}sky jobs logs <job-id>'
+                f'{ux_utils.RESET_BOLD}'
+                f'\n{ux_utils.INDENT_SYMBOL}To stream controller logs:\t\t'
+                f'{ux_utils.BOLD}sky jobs logs --controller <job-id>'
+                f'{ux_utils.RESET_BOLD}'
+                f'\n{ux_utils.INDENT_LAST_SYMBOL}To cancel all jobs on the '
+                f'pool:\t{ux_utils.BOLD}sky jobs cancel --pool {pool}'
+                f'{ux_utils.RESET_BOLD}')
 
 
 @jobs.command('queue', cls=_DocumentedCodeCommand)
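A hypothetical launch session using the two new options (the YAML filename is illustrative; `--num-jobs` without `--pool` is rejected by the check added above):

    $ sky jobs launch --pool my-pool --num-jobs 8 -y my-task.yaml
    $ sky jobs launch --num-jobs 8 my-task.yaml
    # -> Error: Cannot specify --num-jobs without --pool.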
@@ -4427,14 +4585,25 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
              required=False,
              type=str,
              help='Managed job name to cancel.')
+@click.option('--pool',
+              '-p',
+              required=False,
+              type=str,
+              help='Pool name to cancel.')
 @click.argument('job_ids', default=None, type=int, required=False, nargs=-1)
 @flags.all_option('Cancel all managed jobs for the current user.')
 @flags.yes_option()
 @flags.all_users_option('Cancel all managed jobs from all users.')
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def jobs_cancel(
-
+def jobs_cancel(
+    name: Optional[str],
+    pool: Optional[str],  # pylint: disable=redefined-outer-name
+    job_ids: Tuple[int],
+    all: bool,
+    yes: bool,
+    all_users: bool,
+):
    """Cancel managed jobs.
 
    You can provide either a job name or a list of job IDs to be cancelled.
@@ -4449,22 +4618,29 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
    \b
    # Cancel managed jobs with IDs 1, 2, 3
    $ sky jobs cancel 1 2 3
+    \b
+    # Cancel all managed jobs in pool 'my-pool'
+    $ sky jobs cancel -p my-pool
    """
    job_id_str = ','.join(map(str, job_ids))
-    if sum([
+    if sum([
+            bool(job_ids), name is not None, pool is not None, all or all_users
+    ]) != 1:
        arguments = []
        arguments += [f'--job-ids {job_id_str}'] if job_ids else []
        arguments += [f'--name {name}'] if name is not None else []
+        arguments += [f'--pool {pool}'] if pool is not None else []
        arguments += ['--all'] if all else []
        arguments += ['--all-users'] if all_users else []
        raise click.UsageError(
-            'Can only specify one of JOB_IDS, --name, or
-            f'Provided {" ".join(arguments)!r}.')
+            'Can only specify one of JOB_IDS, --name, --pool, or '
+            f'--all/--all-users. Provided {" ".join(arguments)!r}.')
 
    if not yes:
        plural = 's' if len(job_ids) > 1 else ''
        job_identity_str = (f'managed job{plural} with ID{plural} {job_id_str}'
-                            if job_ids else
+                            if job_ids else f'{name!r}' if name is not None else
+                            f'managed jobs in pool {pool!r}')
        if all_users:
            job_identity_str = 'all managed jobs FOR ALL USERS'
        elif all:
@@ -4477,6 +4653,7 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
        sdk.stream_and_get(
            managed_jobs.cancel(job_ids=job_ids,
                                name=name,
+                                pool=pool,
                                all=all,
                                all_users=all_users))
 
@@ -4554,24 +4731,47 @@ def jobs_dashboard():
    sdk.dashboard(starting_page='jobs')
 
 
-@
-
-
-def dashboard() -> None:
-    """Starts the dashboard for skypilot."""
-    sdk.dashboard()
-
-
-@cli.group(cls=_NaturalOrderGroup)
-def serve():
-    """SkyServe CLI (multi-region, multi-cloud serving)."""
+@jobs.group(cls=_NaturalOrderGroup)
+def pool():
+    """(Experimental) Pool management commands."""
    pass
 
 
-
-
-
+# TODO(MaoZiming): Update Doc.
+# TODO(MaoZiming): Expose mix replica traffic option to user.
+# Currently, we do not mix traffic from old and new replicas.
+@pool.command('apply', cls=_DocumentedCodeCommand)
+@flags.config_option(expose_value=False)
+@click.argument('pool_yaml',
+                required=True,
+                type=str,
+                nargs=-1,
+                **_get_shell_complete_args(_complete_file_name))
+@click.option('--pool-name',
+              '-p',
+              default=None,
+              type=str,
+              help='A pool name. Unique for each pool. If not provided, '
+              'a unique name is autogenerated.')
+@click.option('--mode',
+              default=serve_lib.DEFAULT_UPDATE_MODE.value,
+              type=click.Choice([m.value for m in serve_lib.UpdateMode],
+                                case_sensitive=False),
+              required=False,
+              help=('Update mode. If "rolling", cluster pool will be updated '
+                    'with rolling update. If "blue_green", cluster pool will '
+                    'be updated with blue-green update. This option is only '
+                    'valid when the pool is already running.'))
+@_add_click_options(flags.TASK_OPTIONS + flags.EXTRA_RESOURCES_OPTIONS +
+                    flags.COMMON_OPTIONS)
+@flags.yes_option()
+@timeline.event
+@usage_lib.entrypoint
+def jobs_pool_apply(
+    pool_yaml: Tuple[str, ...],
+    pool_name: Optional[str],
    workdir: Optional[str],
+    infra: Optional[str],
    cloud: Optional[str],
    region: Optional[str],
    zone: Optional[str],
@@ -4580,21 +4780,196 @@ def _generate_task_with_service(
    image_id: Optional[str],
    env_file: Optional[Dict[str, str]],
    env: List[Tuple[str, str]],
-    secret:
+    secret: List[Tuple[str, str]],
    gpus: Optional[str],
    instance_type: Optional[str],
-    ports:
+    ports: Tuple[str],
    cpus: Optional[str],
    memory: Optional[str],
    disk_size: Optional[int],
    disk_tier: Optional[str],
    network_tier: Optional[str],
-
+    mode: str,
+    yes: bool,
+    async_call: bool,
+):
+    """Apply a config to a cluster pool for managed jobs submission.
+
+    If the pool is already running, the config will be applied to the pool.
+    Otherwise, a new pool will be created.
+
+    POOL_YAML must point to a valid YAML file.
+    """
+    cloud, region, zone = _handle_infra_cloud_region_zone_options(
+        infra, cloud, region, zone)
+    if pool_name is None:
+        pool_name = serve_lib.generate_service_name(pool=True)
+
+    task = _generate_task_with_service(
+        service_name=pool_name,
+        service_yaml_args=pool_yaml,
+        workdir=workdir,
+        cloud=cloud,
+        region=region,
+        zone=zone,
+        gpus=gpus,
+        cpus=cpus,
+        memory=memory,
+        instance_type=instance_type,
+        num_nodes=num_nodes,
+        use_spot=use_spot,
+        image_id=image_id,
+        env_file=env_file,
+        env=env,
+        secret=secret,
+        disk_size=disk_size,
+        disk_tier=disk_tier,
+        network_tier=network_tier,
+        ports=ports,
+        not_supported_cmd='sky jobs pool up',
+        pool=True,
+    )
+    assert task.service is not None
+    if not task.service.pool:
+        raise click.UsageError('The YAML file needs a `pool` section.')
+    click.secho('Pool spec:', fg='cyan')
+    click.echo(task.service)
+    serve_lib.validate_service_task(task, pool=True)
+
+    click.secho(
+        'Each pool worker will use the following resources (estimated):',
+        fg='cyan')
+    with sky.Dag() as dag:
+        dag.add(task)
+
+    request_id = managed_jobs.pool_apply(task,
+                                         pool_name,
+                                         mode=serve_lib.UpdateMode(mode),
+                                         _need_confirmation=not yes)
+    _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_apply')
+
+
+@pool.command('status', cls=_DocumentedCodeCommand)
+@flags.config_option(expose_value=False)
+@flags.verbose_option()
+@click.argument('pool_names', required=False, type=str, nargs=-1)
+@usage_lib.entrypoint
+# pylint: disable=redefined-builtin
+def jobs_pool_status(verbose: bool, pool_names: List[str]):
+    """Show statuses of cluster pools.
+
+    Show detailed statuses of one or more pools. If POOL_NAME is not
+    provided, show all pools' status.
+    """
+    pool_names_to_query: Optional[List[str]] = pool_names
+    if not pool_names:
+        pool_names_to_query = None
+    with rich_utils.client_status('[cyan]Checking pools[/]'):
+        pool_status_request_id = managed_jobs.pool_status(pool_names_to_query)
+        _, msg = _handle_services_request(pool_status_request_id,
+                                          service_names=pool_names_to_query,
+                                          show_all=verbose,
+                                          show_endpoint=False,
+                                          pool=True,
+                                          is_called_by_user=True)
+
+    click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+               f'Pools{colorama.Style.RESET_ALL}')
+    click.echo(msg)
+
+
+@pool.command('down', cls=_DocumentedCodeCommand)
+@flags.config_option(expose_value=False)
+@click.argument('pool_names', required=False, type=str, nargs=-1)
+@flags.all_option('Delete all pools.')
+@click.option('--purge',
+              '-p',
+              default=False,
+              is_flag=True,
+              help='Tear down pools in failed status.')
+@flags.yes_option()
+@_add_click_options(flags.COMMON_OPTIONS)
+@usage_lib.entrypoint
+# pylint: disable=redefined-builtin
+def jobs_pool_down(
+    pool_names: List[str],
+    all: bool,
+    purge: bool,
+    yes: bool,
+    async_call: bool,
+) -> None:
+    """Delete pool(s).
+
+    POOL_NAMES is the name of the pool (or glob pattern) to delete. If
+    both POOL_NAMES and ``--all`` are supplied, the latter takes precedence.
+
+    Deleting a pool will delete all of its workers and associated resources.
+    """
+    if sum([bool(pool_names), all]) != 1:
+        argument_str = (f'POOL_NAMES={",".join(pool_names)}'
+                        if pool_names else '')
+        argument_str += ' --all' if all else ''
+        raise click.UsageError('Can only specify one of POOL_NAMES or --all. '
+                               f'Provided {argument_str!r}.')
+
+    if not yes:
+        quoted_pool_names = [f'{name!r}' for name in pool_names]
+        list_pool_str = ', '.join(quoted_pool_names)
+        pool_identity_str = f'pool(s) {list_pool_str}'
+        if all:
+            pool_identity_str = 'all pools'
+        click.confirm(f'Terminating {pool_identity_str}. Proceed?',
+                      default=True,
+                      abort=True,
+                      show_default=True)
+
+    request_id = managed_jobs.pool_down(pool_names, all=all, purge=purge)
+    _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_down')
+
+
+@cli.command(cls=_DocumentedCodeCommand)
+@flags.config_option(expose_value=False)
+@usage_lib.entrypoint
+def dashboard() -> None:
+    """Starts the dashboard for skypilot."""
+    sdk.dashboard()
+
+
+@cli.group(cls=_NaturalOrderGroup)
+def serve():
+    """SkyServe CLI (multi-region, multi-cloud serving)."""
+    pass
+
+
+def _generate_task_with_service(
+    service_name: str,
+    service_yaml_args: Tuple[str, ...],
+    workdir: Optional[str],
+    cloud: Optional[str],
+    region: Optional[str],
+    zone: Optional[str],
+    num_nodes: Optional[int],
+    use_spot: Optional[bool],
+    image_id: Optional[str],
+    env_file: Optional[Dict[str, str]],
+    env: List[Tuple[str, str]],
+    secret: Optional[List[Tuple[str, str]]],
+    gpus: Optional[str],
+    instance_type: Optional[str],
+    ports: Optional[Tuple[str]],
+    cpus: Optional[str],
+    memory: Optional[str],
+    disk_size: Optional[int],
+    disk_tier: Optional[str],
+    network_tier: Optional[str],
+    not_supported_cmd: str,
+    pool: bool,  # pylint: disable=redefined-outer-name
 ) -> sky.Task:
    """Generate a task with service section from a service YAML file."""
    is_yaml, _ = _check_yaml(''.join(service_yaml_args))
+    yaml_name = 'SERVICE_YAML' if not pool else 'POOL_YAML'
    if not is_yaml:
-        raise click.UsageError('
+        raise click.UsageError(f'{yaml_name} must be a valid YAML file.')
    env = _merge_env_vars(env_file, env)
    # We keep nargs=-1 in service_yaml argument to reuse this function.
    task = _make_task_or_dag_from_entrypoint_with_overrides(
@@ -4624,9 +4999,17 @@ def _generate_task_with_service(
            _DAG_NOT_SUPPORTED_MESSAGE.format(command=not_supported_cmd))
 
    if task.service is None:
+        field_name = 'service' if not pool else 'pool'
        with ux_utils.print_exception_no_traceback():
-            raise ValueError('
-                             'To fix, add a valid
+            raise ValueError(f'{field_name.capitalize()} section not found '
+                             'in the YAML file. To fix, add a valid '
+                             f'`{field_name}` field.')
+
+    if task.service.pool:
+        if task.service.ports is not None or ports:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify ports in a cluster pool.')
+        return task
 
    # NOTE(yi): we only allow one service port now.
    service_port: Optional[int] = int(
@@ -4786,10 +5169,14 @@ def serve_up(
        network_tier=network_tier,
        ports=ports,
        not_supported_cmd='sky serve up',
+        pool=False,
    )
+    assert task.service is not None
+    if task.service.pool:
+        raise click.UsageError('The YAML file needs a `service` section.')
    click.secho('Service spec:', fg='cyan')
    click.echo(task.service)
-    serve_lib.validate_service_task(task)
+    serve_lib.validate_service_task(task, pool=False)
 
    click.secho('Each replica will use the following resources (estimated):',
                fg='cyan')
@@ -4888,10 +5275,11 @@ def serve_update(
        network_tier=network_tier,
        ports=ports,
        not_supported_cmd='sky serve update',
+        pool=False,
    )
    click.secho('Service spec:', fg='cyan')
    click.echo(task.service)
-    serve_lib.validate_service_task(task)
+    serve_lib.validate_service_task(task, pool=False)
 
    click.secho('New replica will use the following resources (estimated):',
                fg='cyan')