skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +448 -60
- sky/client/common.py +12 -9
- sky/clouds/nebius.py +1 -1
- sky/clouds/utils/gcp_utils.py +1 -1
- sky/clouds/vast.py +1 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
- sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
- sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
- sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
- sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
- sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
- sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
- sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
- sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
- sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
- sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +25 -0
- sky/data/storage.py +1219 -1775
- sky/global_user_state.py +18 -8
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +164 -31
- sky/jobs/utils.py +144 -68
- sky/logs/aws.py +4 -2
- sky/provision/kubernetes/utils.py +6 -4
- sky/provision/nebius/constants.py +3 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vast/utils.py +9 -6
- sky/py.typed +0 -0
- sky/resources.py +24 -14
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +18 -1
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/server/server.py +6 -1
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +24 -19
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
- sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
- sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
- sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
- sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
- sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
- sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
- sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
- sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
- sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
- sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
- sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
- sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
- sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
- sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
- sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
- /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
@@ -30,7 +30,6 @@ from sky.backends import backend_utils
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
-from sky.server import common as server_common
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
@@ -39,7 +38,6 @@ from sky.utils import annotations
 from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import controller_utils
-from sky.utils import env_options
 from sky.utils import infra_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
@@ -136,12 +134,6 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
 def _validate_consolidation_mode_config(
         current_is_consolidation_mode: bool) -> None:
     """Validate the consolidation mode config."""
-    if (current_is_consolidation_mode and
-            not env_options.Options.IS_DEVELOPER.get() and
-            server_common.is_api_server_local()):
-        with ux_utils.print_exception_no_traceback():
-            raise exceptions.NotSupportedError(
-                'Consolidation mode is not supported when running locally.')
     # Check whether the consolidation mode config is changed.
     if current_is_consolidation_mode:
         controller_cn = (
@@ -239,8 +231,8 @@ def ha_recovery_for_consolidation_mode():
         f.write(f'Total recovery time: {time.time() - start} seconds\n')


-def get_job_status(backend: 'backends.CloudVmRayBackend',
-                   cluster_name: str) -> Optional['job_lib.JobStatus']:
+def get_job_status(backend: 'backends.CloudVmRayBackend', cluster_name: str,
+                   job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.

     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
@@ -253,10 +245,13 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
         logger.info(f'Cluster {cluster_name} not found.')
         return None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
+    job_ids = None if job_id is None else [job_id]
     for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
         try:
             logger.info('=== Checking the job status... ===')
-            statuses = backend.get_job_status(handle, stream_logs=False)
+            statuses = backend.get_job_status(handle,
+                                              job_ids=job_ids,
+                                              stream_logs=False)
             status = list(statuses.values())[0]
             if status is None:
                 logger.info('No job found.')
@@ -323,13 +318,20 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
         error_msg = None
         tasks = managed_job_state.get_managed_jobs(job_id)
         for task in tasks:
-            task_name = task['job_name']
-            cluster_name = generate_managed_job_cluster_name(task_name, job_id)
+            pool = task.get('pool', None)
+            if pool is None:
+                task_name = task['job_name']
+                cluster_name = generate_managed_job_cluster_name(
+                    task_name, job_id)
+            else:
+                cluster_name, _ = (
+                    managed_job_state.get_pool_submit_info(job_id))
             handle = global_user_state.get_handle_from_cluster_name(
                 cluster_name)
             if handle is not None:
                 try:
-                    terminate_cluster(cluster_name)
+                    if pool is None:
+                        terminate_cluster(cluster_name)
                 except Exception as e:  # pylint: disable=broad-except
                     error_msg = (
                         f'Failed to terminate cluster {cluster_name}: '
@@ -510,10 +512,10 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):


 def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
-                      get_end_time: bool) -> float:
+                      job_id: Optional[int], get_end_time: bool) -> float:
     """Get the submitted/ended time of the job."""
     code = job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
-        job_id=None, get_ended_time=get_end_time)
+        job_id=job_id, get_ended_time=get_end_time)
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     returncode, stdout, stderr = backend.run_on_head(handle,
                                                      code,
@@ -527,14 +529,17 @@ def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,


 def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
-                            cluster_name: str) -> float:
+                            cluster_name: str, job_id: Optional[int]) -> float:
     """Try to get the end time of the job.

     If the job is preempted or we can't connect to the instance for whatever
     reason, fall back to the current time.
     """
     try:
-        return get_job_timestamp(backend, cluster_name, get_end_time=True)
+        return get_job_timestamp(backend,
+                                 cluster_name,
+                                 job_id=job_id,
+                                 get_end_time=True)
     except exceptions.CommandError as e:
         if e.returncode == 255:
             # Failed to connect - probably the instance was preempted since the
@@ -556,8 +561,12 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
        if event_callback is None or task is None:
            return
        event_callback = event_callback.strip()
-        cluster_name = generate_managed_job_cluster_name(
-            task.name, job_id) if task.name else None
+        pool = managed_job_state.get_pool_from_job_id(job_id)
+        if pool is not None:
+            cluster_name, _ = (managed_job_state.get_pool_submit_info(job_id))
+        else:
+            cluster_name = generate_managed_job_cluster_name(
+                task.name, job_id) if task.name else None
        logger.info(f'=== START: event callback for {status!r} ===')
        log_path = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                'managed_job_event',
@@ -684,6 +693,15 @@ def cancel_job_by_name(job_name: str,
     return f'{job_name!r} {msg}'


+def cancel_jobs_by_pool(pool_name: str,
+                        current_workspace: Optional[str] = None) -> str:
+    """Cancel all jobs in a pool."""
+    job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(pool_name)
+    if not job_ids:
+        return f'No running job found in pool {pool_name!r}.'
+    return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
+
+
 def stream_logs_by_id(job_id: int,
                       follow: bool = True,
                       tail: Optional[int] = None) -> Tuple[str, int]:
@@ -716,23 +734,41 @@ def stream_logs_by_id(job_id: int,
        if managed_job_status.is_failed():
            job_msg = ('\nFailure reason: '
                       f'{managed_job_state.get_failure_reason(job_id)}')
-        … (17 removed lines not shown in the source diff view)
+        log_file_exists = False
+        task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
+            job_id)
+        num_tasks = len(task_info)
+        for task_id, task_name, task_status, log_file in task_info:
+            if log_file:
+                log_file_exists = True
+                task_str = (f'Task {task_name}({task_id})'
+                            if task_name else f'Task {task_id}')
+                if num_tasks > 1:
+                    print(f'=== {task_str} ===')
+                with open(os.path.expanduser(log_file),
+                          'r',
+                          encoding='utf-8') as f:
+                    # Stream the logs to the console without reading the
+                    # whole file into memory.
+                    start_streaming = False
+                    read_from: Union[TextIO, Deque[str]] = f
+                    if tail is not None:
+                        assert tail > 0
+                        # Read only the last 'tail' lines using deque
+                        read_from = collections.deque(f, maxlen=tail)
+                    for line in read_from:
+                        if log_lib.LOG_FILE_START_STREAMING_AT in line:
+                            start_streaming = True
+                        if start_streaming:
+                            print(line, end='', flush=True)
+                if num_tasks > 1:
+                    # Add the "Task finished" message for terminal states
+                    if task_status.is_terminal():
+                        print(ux_utils.finishing_message(
+                            f'{task_str} finished '
+                            f'(status: {task_status.value}).'),
+                              flush=True)
+        if log_file_exists:
            # Add the "Job finished" message for terminal states
            if managed_job_status.is_terminal():
                print(ux_utils.finishing_message(
@@ -759,12 +795,19 @@ def stream_logs_by_id(job_id: int,

    while should_keep_logging(managed_job_status):
        handle = None
+        job_id_to_tail = None
        if task_id is not None:
-            task_name = managed_job_state.get_task_name(job_id, task_id)
-            cluster_name = generate_managed_job_cluster_name(
-                task_name, job_id)
-            handle = global_user_state.get_handle_from_cluster_name(
-                cluster_name)
+            pool = managed_job_state.get_pool_from_job_id(job_id)
+            if pool is not None:
+                cluster_name, job_id_to_tail = (
+                    managed_job_state.get_pool_submit_info(job_id))
+            else:
+                task_name = managed_job_state.get_task_name(job_id, task_id)
+                cluster_name = generate_managed_job_cluster_name(
+                    task_name, job_id)
+            if cluster_name is not None:
+                handle = global_user_state.get_handle_from_cluster_name(
+                    cluster_name)

        # Check the handle: The cluster can be preempted and removed from
        # the table before the managed job state is updated by the
@@ -796,7 +839,7 @@ def stream_logs_by_id(job_id: int,
            status_display.stop()
            tail_param = tail if tail is not None else 0
            returncode = backend.tail_logs(handle,
-                                           job_id=None,
+                                           job_id=job_id_to_tail,
                                           managed_job_id=job_id,
                                           follow=follow,
                                           tail=tail_param)
@@ -1114,9 +1157,15 @@ def dump_managed_job_queue() -> str:
        job['status'] = job['status'].value
        job['schedule_state'] = job['schedule_state'].value

-        cluster_name = generate_managed_job_cluster_name(
-            job['task_name'], job['job_id'])
-        handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+        pool = managed_job_state.get_pool_from_job_id(job['job_id'])
+        if pool is not None:
+            cluster_name, _ = managed_job_state.get_pool_submit_info(
+                job['job_id'])
+        else:
+            cluster_name = generate_managed_job_cluster_name(
+                job['task_name'], job['job_id'])
+        handle = global_user_state.get_handle_from_cluster_name(
+            cluster_name) if cluster_name is not None else None
        if isinstance(handle, backends.CloudVmRayResourceHandle):
            resources_str = resources_utils.get_readable_resources_repr(
                handle, simplify=True)
@@ -1127,6 +1176,11 @@ def dump_managed_job_queue() -> str:
            job['cloud'] = str(handle.launched_resources.cloud)
            job['region'] = handle.launched_resources.region
            job['zone'] = handle.launched_resources.zone
+            job['infra'] = infra_utils.InfraInfo(
+                str(handle.launched_resources.cloud),
+                handle.launched_resources.region,
+                handle.launched_resources.zone).formatted_str()
+            job['accelerators'] = handle.launched_resources.accelerators
        else:
            # FIXME(zongheng): display the last cached values for these.
            job['cluster_resources'] = '-'
@@ -1134,6 +1188,7 @@ def dump_managed_job_queue() -> str:
            job['cloud'] = '-'
            job['region'] = '-'
            job['zone'] = '-'
+            job['infra'] = '-'

        # Add details about schedule state / backoff.
        state_details = None
@@ -1274,10 +1329,13 @@ def format_job_table(
        'JOB DURATION',
        '#RECOVERIES',
        'STATUS',
+        'WORKER_POOL',
    ]
    if show_all:
        # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
        columns += [
+            'WORKER_CLUSTER',
+            'WORKER_JOB_ID',
            'STARTED',
            'INFRA',
            'RESOURCES',
@@ -1387,11 +1445,14 @@ def format_job_table(
                job_duration,
                recovery_cnt,
                status_str,
+                job_tasks[0].get('pool', '-'),
            ]
            if show_all:
                details = job_tasks[current_task_id].get('details')
                failure_reason = job_tasks[current_task_id]['failure_reason']
                job_values.extend([
+                    '-',
+                    '-',
                    '-',
                    '-',
                    '-',
@@ -1427,37 +1488,43 @@ def format_job_table(
                job_duration,
                task['recovery_count'],
                task['status'].colored_str(),
+                task.get('pool', '-'),
            ]
            if show_all:
                # schedule_state is only set at the job level, so if we have
                # more than one task, only display on the aggregated row.
                schedule_state = (task['schedule_state']
                                  if len(job_tasks) == 1 else '-')
-                …
-                if …
-                … (19 more removed lines not shown in the source diff view)
+                infra_str = task.get('infra')
+                if infra_str is None:
+                    cloud = task.get('cloud')
+                    if cloud is None:
+                        # Backward compatibility for old jobs controller without
+                        # cloud info returned, we parse it from the cluster
+                        # resources
+                        # TODO(zhwu): remove this after 0.12.0
+                        cloud = task['cluster_resources'].split('(')[0].split(
+                            'x')[-1]
+                        task['cluster_resources'] = task[
+                            'cluster_resources'].replace(f'{cloud}(',
+                                                         '(').replace(
+                                                             'x ', 'x')
+                    region = task['region']
+                    zone = task.get('zone')
+                    if cloud == '-':
+                        cloud = None
+                    if region == '-':
+                        region = None
+                    if zone == '-':
+                        zone = None
+                    infra_str = infra_utils.InfraInfo(cloud, region,
+                                                      zone).formatted_str()
                values.extend([
+                    task.get('current_cluster_name', '-'),
+                    task.get('job_id_on_pool_cluster', '-'),
                    # STARTED
                    log_utils.readable_time_duration(task['start_at']),
-                    …
+                    infra_str,
                    task['cluster_resources'],
                    schedule_state,
                    generate_details(task.get('details'),
@@ -1549,6 +1616,15 @@ class ManagedJobCodeGen:
        """)
        return cls._build(code)

+    @classmethod
+    def cancel_jobs_by_pool(cls, pool_name: str) -> str:
+        active_workspace = skypilot_config.get_active_workspace()
+        code = textwrap.dedent(f"""\
+        msg = utils.cancel_jobs_by_pool({pool_name!r}, {active_workspace!r})
+        print(msg, end="", flush=True)
+        """)
+        return cls._build(code)
+
    @classmethod
    def get_version_and_job_table(cls) -> str:
        """Generate code to get controller version and raw job table."""
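The change above applies one recurring lookup: a managed job either owns a dedicated cluster (the legacy path) or was submitted to a worker in a cluster pool, in which case the pool submit info records the worker cluster and the job id assigned on that cluster. The sketch below condenses that pattern; the wrapper function and its signature are illustrative only (not SkyPilot API), but the helpers it calls are the ones used in the diff.

from typing import Optional, Tuple

from sky.jobs import state as managed_job_state
from sky.jobs import utils as managed_job_utils


def resolve_worker_cluster(job_id: int,
                           task_name: Optional[str]
                          ) -> Tuple[Optional[str], Optional[int]]:
    """Return (cluster_name, job_id_on_that_cluster) for a managed job."""
    pool = managed_job_state.get_pool_from_job_id(job_id)
    if pool is not None:
        # Pool-backed job: reuse the worker cluster recorded at submit time,
        # along with the job id it was given on that cluster.
        return managed_job_state.get_pool_submit_info(job_id)
    # Legacy path: each managed job launches its own cluster whose name is
    # derived from the task name and the managed job id.
    if task_name is None:
        return None, None
    cluster_name = managed_job_utils.generate_managed_job_cluster_name(
        task_name, job_id)
    return cluster_name, None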
sky/logs/aws.py
CHANGED
@@ -9,6 +9,8 @@ from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import resources_utils

+EC2_MD_URL = '"${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"'
+

 class _CloudwatchLoggingConfig(pydantic.BaseModel):
     """Configuration for AWS CloudWatch logging agent."""
@@ -109,8 +111,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
         # Check if we're running on EC2 with an IAM role or if
         # AWS credentials are available in the environment
         pre_cmd = (
-            'if ! curl -s -m 1 …
-            '…
+            f'if ! curl -s -m 1 {EC2_MD_URL}'
+            'latest/meta-data/iam/security-credentials/ > /dev/null; '
             'then '
             # failed EC2 check, look for env vars
             'if [ -z "$AWS_ACCESS_KEY_ID" ] || '
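For reference, the new EC2_MD_URL constant is spliced into the generated shell check by plain f-string concatenation, so the instance-metadata endpoint can be overridden through AWS_EC2_METADATA_SERVICE_ENDPOINT and otherwise falls back to 169.254.169.254. A small stand-alone sketch (not SkyPilot code) of how the fragment expands:

EC2_MD_URL = '"${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"'

pre_cmd_head = (f'if ! curl -s -m 1 {EC2_MD_URL}'
                'latest/meta-data/iam/security-credentials/ > /dev/null; ')
print(pre_cmd_head)
# -> if ! curl -s -m 1 "${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"latest/meta-data/iam/security-credentials/ > /dev/null;
# Note: the default URL's trailing slash joins directly onto the metadata path.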
sky/provision/kubernetes/utils.py
CHANGED
@@ -3179,10 +3179,12 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
     return pods


-def is_tpu_on_gke(accelerator: str) -> bool:
+def is_tpu_on_gke(accelerator: str, normalize: bool = True) -> bool:
     """Determines if the given accelerator is a TPU supported on GKE."""
-    normalized, _ = normalize_tpu_accelerator_name(accelerator)
-    return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
+    if normalize:
+        normalized, _ = normalize_tpu_accelerator_name(accelerator)
+        return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
+    return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION


 def get_node_accelerator_count(context: Optional[str],
@@ -3384,7 +3386,7 @@ def process_skypilot_pods(

 def _gpu_resource_key_helper(context: Optional[str]) -> str:
     """Helper function to get the GPU resource key."""
-    gpu_resource_key = SUPPORTED_GPU_RESOURCE_KEYS['…
+    gpu_resource_key = SUPPORTED_GPU_RESOURCE_KEYS['nvidia']
     try:
         nodes = kubernetes.core_api(context).list_node().items
         for gpu_key in SUPPORTED_GPU_RESOURCE_KEYS.values():
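The new normalize flag on is_tpu_on_gke() lets callers choose between normalizing the accelerator name before the GKE TPU lookup (the previous, still-default behavior) and matching the raw name verbatim; sky/resources.py in this release passes normalize=False. A usage sketch, with a placeholder accelerator name:

from sky.provision.kubernetes import utils as kubernetes_utils

acc = 'tpu-v5litepod-4'  # placeholder name, for illustration only
# Default: normalize the name first, then check the GKE TPU table.
kubernetes_utils.is_tpu_on_gke(acc)
# New option: check the raw name exactly as given.
kubernetes_utils.is_tpu_on_gke(acc, normalize=False)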
sky/provision/nebius/constants.py
CHANGED
@@ -15,6 +15,9 @@ INFINIBAND_ENV_VARS = {
         'mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1')
 }

+# pylint: disable=line-too-long
+INFINIBAND_IMAGE_ID = 'docker:cr.eu-north1.nebius.cloud/nebius-benchmarks/nccl-tests:2.23.4-ubu22.04-cu12.4'
+
 # Docker run options for InfiniBand support
 INFINIBAND_DOCKER_OPTIONS = ['--device=/dev/infiniband', '--cap-add=IPC_LOCK']

sky/provision/vast/instance.py
CHANGED
@@ -97,7 +97,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                 region=region,
                 disk_size=config.node_config['DiskSize'],
                 preemptible=config.node_config['Preemptible'],
-                image_name=config.node_config['ImageId'])
+                image_name=config.node_config['ImageId'],
+                ports=config.ports_to_open_on_launch)
     except Exception as e:  # pylint: disable=broad-except
         logger.warning(f'run_instances error: {e}')
         raise
sky/provision/vast/utils.py
CHANGED
@@ -5,7 +5,7 @@
 # python sdk.
 #
 """Vast library wrapper for SkyPilot."""
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 from sky import sky_logging
 from sky.adaptors import vast
@@ -34,7 +34,8 @@ def list_instances() -> Dict[str, Dict[str, Any]]:


 def launch(name: str, instance_type: str, region: str, disk_size: int,
-           image_name: str, preemptible: bool) -> str:
+           image_name: str, ports: Optional[List[int]],
+           preemptible: bool) -> str:
     """Launches an instance with the given parameters.

     Converts the instance_type to the Vast GPU name, finds the specs for the
@@ -58,6 +59,8 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
       The disk size {xx} GB is not exactly matched the requested
       size {yy} GB. It is possible to charge extra cost on disk.

+    * `ports`: This is a feature flag to expose ports to the internet.
+
     * `geolocation`: Geolocation on Vast can be as specific as the
       host chooses to be. They can say, for instance, "Yutakachō,
       Shinagawa District, Tokyo, JP." Such a specific geolocation
@@ -79,9 +82,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,

     * Vast instance types are an invention for skypilot. Refer to
       catalog/vast_catalog.py for the current construction
-      of the type.
-
-    """
+      of the type."""
     cpu_ram = float(instance_type.split('-')[-1]) / 1024
     gpu_name = instance_type.split('-')[1].replace('_', ' ')
     num_gpus = int(instance_type.split('-')[0].replace('x', ''))
@@ -104,11 +105,13 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,

     instance_touse = instance_list[0]

+    port_map = ' '.join([f'-p {p}:{p}' for p in ports]) if ports else ''
+
     launch_params = {
         'id': instance_touse['id'],
         'direct': True,
         'ssh': True,
-        'env': '-e __SOURCE=skypilot',
+        'env': f'-e __SOURCE=skypilot {port_map}',
         'onstart_cmd': ';'.join([
             'touch ~/.no_auto_tmux',
             f'echo "{vast.vast().api_key_access}" > ~/.vast_api_key',
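Taken together, the two Vast changes thread ports from config.ports_to_open_on_launch into launch(), which folds them into the instance's env string as explicit -p mappings. A minimal, self-contained sketch of just that string construction (the helper name here is made up; the expressions mirror the added lines above):

from typing import List, Optional


def build_env_flags(ports: Optional[List[int]]) -> str:
    # Mirrors the port_map/env lines added to launch() above.
    port_map = ' '.join([f'-p {p}:{p}' for p in ports]) if ports else ''
    return f'-e __SOURCE=skypilot {port_map}'


assert build_env_flags([8080, 443]) == '-e __SOURCE=skypilot -p 8080:8080 -p 443:443'
assert build_env_flags(None) == '-e __SOURCE=skypilot '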
sky/py.typed
ADDED
File without changes (empty marker file)
sky/resources.py
CHANGED
@@ -19,6 +19,7 @@ from sky.clouds import cloud as sky_cloud
 from sky.provision import docker_utils
 from sky.provision.gcp import constants as gcp_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.provision.nebius import constants as nebius_constants
 from sky.skylet import constants
 from sky.utils import accelerator_registry
 from sky.utils import annotations
@@ -797,8 +798,13 @@ class Resources:

         acc, _ = list(accelerators.items())[0]
         if 'tpu' in acc.lower():
+            # TODO(syang): GCP TPU names are supported on both GCP and
+            # kubernetes (GKE), but this logic automatically assumes
+            # GCP TPUs can only be used on GCP.
+            # Fix the logic such that GCP TPU names can failover between
+            # GCP and kubernetes.
             if self.cloud is None:
-                if kubernetes_utils.is_tpu_on_gke(acc):
+                if kubernetes_utils.is_tpu_on_gke(acc, normalize=False):
                     self._cloud = clouds.Kubernetes()
                 else:
                     self._cloud = clouds.GCP()
@@ -813,7 +819,8 @@ class Resources:

             use_tpu_vm = accelerator_args.get('tpu_vm', True)
             if (self.cloud.is_same_cloud(clouds.GCP()) and
-                    not kubernetes_utils.is_tpu_on_gke(acc)):
+                    not kubernetes_utils.is_tpu_on_gke(acc,
+                                                       normalize=False)):
                 if 'runtime_version' not in accelerator_args:

                     def _get_default_runtime_version() -> str:
@@ -1254,15 +1261,19 @@ class Resources:
             ValueError: if the attribute is invalid.
         """

-        if …
-        … (8 more removed lines not shown in the source diff view)
+        if self._network_tier == resources_utils.NetworkTier.BEST:
+            if isinstance(self._cloud, clouds.GCP):
+                # Handle GPU Direct TCPX requirement for docker images
+                if self._image_id is None:
+                    self._image_id = {
+                        self._region: gcp_constants.GCP_GPU_DIRECT_IMAGE_ID
+                    }
+            elif isinstance(self._cloud, clouds.Nebius):
+                if self._image_id is None:
+                    self._image_id = {
+                        self._region: nebius_constants.INFINIBAND_IMAGE_ID
+                    }
+            elif self._image_id:
                # Custom image specified - validate it's a docker image
                # Check if any of the specified images are not docker images
                non_docker_images = []
@@ -1274,14 +1285,13 @@ class Resources:
                if non_docker_images:
                    with ux_utils.print_exception_no_traceback():
                        raise ValueError(
-                            f'When using network_tier=BEST…
+                            f'When using network_tier=BEST, image_id '
                            f'must be a docker image. '
                            f'Found non-docker images: '
                            f'{", ".join(non_docker_images)}. '
                            f'Please either: (1) use a docker image '
                            f'(prefix with "docker:"), or '
-                            f'(2) leave image_id empty to use the default…
-                            f'GPU Direct TCPX image.')
+                            f'(2) leave image_id empty to use the default')

        if self._image_id is None:
            return
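The network-tier validation now picks a cloud-specific default docker image when none is given: GCP keeps the GPU Direct TCPX image, and Nebius gets the InfiniBand-enabled NCCL image introduced in sky/provision/nebius/constants.py. A condensed restatement of that branch follows; the standalone function is illustrative only, not SkyPilot API, though the constants come from the diff.

from typing import Dict, Optional

from sky.provision.gcp import constants as gcp_constants
from sky.provision.nebius import constants as nebius_constants


def default_image_for_best_network_tier(
        cloud: str, region: Optional[str]) -> Optional[Dict[Optional[str], str]]:
    # Same mapping as the new Resources validation branch above.
    if cloud == 'gcp':
        return {region: gcp_constants.GCP_GPU_DIRECT_IMAGE_ID}
    if cloud == 'nebius':
        return {region: nebius_constants.INFINIBAND_IMAGE_ID}
    return None  # other clouds fall through to the docker-image check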
sky/schemas/db/spot_jobs/002_cluster_pool.py
ADDED
@@ -0,0 +1,42 @@
+"""Columns for cluster pool.
+
+Revision ID: 002
+Revises: 001
+Create Date: 2025-07-18
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '002'
+down_revision: Union[str, Sequence[str], None] = '001'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add columns for cluster pool."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'pool',
+                                             sa.Text(),
+                                             server_default=None)
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'current_cluster_name',
+                                             sa.Text(),
+                                             server_default=None)
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'job_id_on_pool_cluster',
+                                             sa.Integer(),
+                                             server_default=None)
+
+
+def downgrade():
+    """Remove columns for cluster pool."""
+    pass
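The migration only adds three nullable columns to job_info. On a plain SQLite spot-jobs database the equivalent DDL is sketched below; the database path is an assumption, and already-existing columns are skipped rather than failing.

import sqlite3

conn = sqlite3.connect('spot_jobs.db')  # hypothetical local path
for ddl in (
        'ALTER TABLE job_info ADD COLUMN pool TEXT',
        'ALTER TABLE job_info ADD COLUMN current_cluster_name TEXT',
        'ALTER TABLE job_info ADD COLUMN job_id_on_pool_cluster INTEGER',
):
    try:
        conn.execute(ddl)
    except sqlite3.OperationalError:
        pass  # column already exists
conn.commit()
conn.close()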
sky/serve/autoscalers.py
CHANGED
@@ -175,6 +175,14 @@ class Autoscaler:
         """Collect request information from aggregator for autoscaling."""
         raise NotImplementedError

+    def info(self) -> Dict[str, Any]:
+        """Get information about the autoscaler."""
+        return {
+            'target_num_replicas': self.target_num_replicas,
+            'min_replicas': self.min_replicas,
+            'max_replicas': self.max_replicas,
+        }
+
     def _generate_scaling_decisions(
         self,
         replica_infos: List['replica_managers.ReplicaInfo'],