skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250910__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -4
- sky/backends/backend_utils.py +35 -1
- sky/backends/cloud_vm_ray_backend.py +2 -2
- sky/client/sdk.py +20 -0
- sky/client/sdk_async.py +18 -16
- sky/clouds/aws.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-1d7e11230da3ca89.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +5 -1
- sky/execution.py +21 -14
- sky/jobs/constants.py +3 -0
- sky/jobs/controller.py +732 -310
- sky/jobs/recovery_strategy.py +251 -129
- sky/jobs/scheduler.py +247 -174
- sky/jobs/server/core.py +20 -4
- sky/jobs/server/utils.py +2 -2
- sky/jobs/state.py +702 -511
- sky/jobs/utils.py +94 -39
- sky/provision/aws/config.py +4 -1
- sky/provision/gcp/config.py +6 -1
- sky/provision/kubernetes/utils.py +17 -8
- sky/provision/provisioner.py +1 -0
- sky/serve/replica_managers.py +0 -7
- sky/serve/serve_utils.py +5 -0
- sky/serve/server/impl.py +1 -2
- sky/serve/service.py +0 -2
- sky/server/common.py +8 -3
- sky/server/config.py +43 -24
- sky/server/constants.py +1 -0
- sky/server/daemons.py +7 -11
- sky/server/requests/serializers/encoders.py +1 -1
- sky/server/server.py +8 -1
- sky/setup_files/dependencies.py +4 -2
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/constants.py +3 -1
- sky/skylet/events.py +2 -10
- sky/utils/command_runner.pyi +3 -3
- sky/utils/common_utils.py +11 -1
- sky/utils/controller_utils.py +5 -0
- sky/utils/db/db_utils.py +31 -2
- sky/utils/rich_utils.py +3 -1
- sky/utils/subprocess_utils.py +9 -0
- sky/volumes/volume.py +2 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/METADATA +39 -37
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/RECORD +67 -67
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/top_level.txt +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1d7e11230da3ca89.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7598c33a746cdc91.js" defer=""></script><script src="/dashboard/_next/static/3SYxqNGnvvPS8h3gdD2T7/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/3SYxqNGnvvPS8h3gdD2T7/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"3SYxqNGnvvPS8h3gdD2T7","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/data/storage.py
CHANGED
|
@@ -2700,7 +2700,11 @@ class AzureBlobStore(AbstractStore):
|
|
|
2700
2700
|
name=override_args.get('name', metadata.name),
|
|
2701
2701
|
storage_account_name=override_args.get(
|
|
2702
2702
|
'storage_account', metadata.storage_account_name),
|
|
2703
|
-
|
|
2703
|
+
# TODO(cooperc): fix the types for mypy 1.16
|
|
2704
|
+
# Azure store expects a string path; metadata.source may be a Path
|
|
2705
|
+
# or List[Path].
|
|
2706
|
+
source=override_args.get('source',
|
|
2707
|
+
metadata.source), # type: ignore[arg-type]
|
|
2704
2708
|
region=override_args.get('region', metadata.region),
|
|
2705
2709
|
is_sky_managed=override_args.get('is_sky_managed',
|
|
2706
2710
|
metadata.is_sky_managed),
|
sky/execution.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
See `Stage` for a Task's life cycle.
|
|
4
4
|
"""
|
|
5
5
|
import enum
|
|
6
|
+
import logging
|
|
6
7
|
import typing
|
|
7
8
|
from typing import List, Optional, Tuple, Union
|
|
8
9
|
|
|
@@ -120,6 +121,7 @@ def _execute(
|
|
|
120
121
|
_quiet_optimizer: bool = False,
|
|
121
122
|
_is_launched_by_jobs_controller: bool = False,
|
|
122
123
|
_is_launched_by_sky_serve_controller: bool = False,
|
|
124
|
+
job_logger: logging.Logger = logger,
|
|
123
125
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
124
126
|
"""Execute an entrypoint.
|
|
125
127
|
|
|
@@ -221,7 +223,8 @@ def _execute(
|
|
|
221
223
|
_quiet_optimizer=_quiet_optimizer,
|
|
222
224
|
_is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
|
|
223
225
|
_is_launched_by_sky_serve_controller=
|
|
224
|
-
_is_launched_by_sky_serve_controller
|
|
226
|
+
_is_launched_by_sky_serve_controller,
|
|
227
|
+
job_logger=job_logger)
|
|
225
228
|
|
|
226
229
|
|
|
227
230
|
def _execute_dag(
|
|
@@ -243,6 +246,7 @@ def _execute_dag(
|
|
|
243
246
|
_quiet_optimizer: bool,
|
|
244
247
|
_is_launched_by_jobs_controller: bool,
|
|
245
248
|
_is_launched_by_sky_serve_controller: bool,
|
|
249
|
+
job_logger: logging.Logger = logger,
|
|
246
250
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
247
251
|
"""Execute a DAG.
|
|
248
252
|
|
|
@@ -253,7 +257,7 @@ def _execute_dag(
|
|
|
253
257
|
task = dag.tasks[0]
|
|
254
258
|
|
|
255
259
|
if any(r.job_recovery is not None for r in task.resources):
|
|
256
|
-
|
|
260
|
+
job_logger.warning(
|
|
257
261
|
f'{colorama.Style.DIM}The task has `job_recovery` specified, '
|
|
258
262
|
'but is launched as an unmanaged job. It will be ignored.'
|
|
259
263
|
'To enable job recovery, use managed jobs: sky jobs launch.'
|
|
@@ -334,10 +338,10 @@ def _execute_dag(
|
|
|
334
338
|
# itself have no task running and start the auto{stop,down}
|
|
335
339
|
# process, before the task is submitted in the EXEC stage.
|
|
336
340
|
verb = 'torn down' if down else 'stopped'
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
+
job_logger.info(f'{colorama.Style.DIM}The cluster will '
|
|
342
|
+
f'be {verb} after 1 minutes of idleness '
|
|
343
|
+
'(after all jobs finish).'
|
|
344
|
+
f'{colorama.Style.RESET_ALL}')
|
|
341
345
|
idle_minutes_to_autostop = 1
|
|
342
346
|
if Stage.DOWN in stages:
|
|
343
347
|
stages.remove(Stage.DOWN)
|
|
@@ -366,7 +370,7 @@ def _execute_dag(
|
|
|
366
370
|
yellow = colorama.Fore.YELLOW
|
|
367
371
|
bold = colorama.Style.BRIGHT
|
|
368
372
|
reset = colorama.Style.RESET_ALL
|
|
369
|
-
|
|
373
|
+
job_logger.info(
|
|
370
374
|
f'{yellow}Launching a spot job that does not '
|
|
371
375
|
f'automatically recover from preemptions. To '
|
|
372
376
|
'get automatic recovery, use managed job instead: '
|
|
@@ -385,7 +389,7 @@ def _execute_dag(
|
|
|
385
389
|
controller = controller_utils.Controllers.from_name(
|
|
386
390
|
cluster_name)
|
|
387
391
|
if controller is not None:
|
|
388
|
-
|
|
392
|
+
job_logger.info(
|
|
389
393
|
f'Choosing resources for {controller.value.name}...'
|
|
390
394
|
)
|
|
391
395
|
dag = optimizer.Optimizer.optimize(dag,
|
|
@@ -427,7 +431,7 @@ def _execute_dag(
|
|
|
427
431
|
if handle is None:
|
|
428
432
|
assert dryrun, ('If not dryrun, handle must be set or '
|
|
429
433
|
'Stage.PROVISION must be included in stages.')
|
|
430
|
-
|
|
434
|
+
job_logger.info('Dryrun finished.')
|
|
431
435
|
return None, None
|
|
432
436
|
|
|
433
437
|
do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
|
|
@@ -436,7 +440,7 @@ def _execute_dag(
|
|
|
436
440
|
(task.file_mounts is not None or
|
|
437
441
|
task.storage_mounts is not None))
|
|
438
442
|
if do_workdir or do_file_mounts:
|
|
439
|
-
|
|
443
|
+
job_logger.info(ux_utils.starting_message('Syncing files.'))
|
|
440
444
|
|
|
441
445
|
if do_workdir:
|
|
442
446
|
if cluster_name is not None:
|
|
@@ -456,11 +460,11 @@ def _execute_dag(
|
|
|
456
460
|
task.storage_mounts)
|
|
457
461
|
|
|
458
462
|
if no_setup:
|
|
459
|
-
|
|
463
|
+
job_logger.info('Setup commands skipped.')
|
|
460
464
|
elif Stage.SETUP in stages and not dryrun:
|
|
461
465
|
if skip_unnecessary_provisioning and provisioning_skipped:
|
|
462
|
-
|
|
463
|
-
|
|
466
|
+
job_logger.debug('Unnecessary provisioning was skipped, so '
|
|
467
|
+
'skipping setup as well.')
|
|
464
468
|
else:
|
|
465
469
|
if cluster_name is not None:
|
|
466
470
|
global_user_state.add_cluster_event(
|
|
@@ -521,6 +525,7 @@ def launch(
|
|
|
521
525
|
_is_launched_by_jobs_controller: bool = False,
|
|
522
526
|
_is_launched_by_sky_serve_controller: bool = False,
|
|
523
527
|
_disable_controller_check: bool = False,
|
|
528
|
+
job_logger: logging.Logger = logger,
|
|
524
529
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
525
530
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
526
531
|
"""Launches a cluster or task.
|
|
@@ -688,7 +693,7 @@ def launch(
|
|
|
688
693
|
_is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
|
|
689
694
|
_is_launched_by_sky_serve_controller=
|
|
690
695
|
_is_launched_by_sky_serve_controller,
|
|
691
|
-
|
|
696
|
+
job_logger=job_logger)
|
|
692
697
|
|
|
693
698
|
|
|
694
699
|
@usage_lib.entrypoint
|
|
@@ -699,6 +704,7 @@ def exec( # pylint: disable=redefined-builtin
|
|
|
699
704
|
down: bool = False,
|
|
700
705
|
stream_logs: bool = True,
|
|
701
706
|
backend: Optional[backends.Backend] = None,
|
|
707
|
+
job_logger: logging.Logger = logger,
|
|
702
708
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
703
709
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
704
710
|
"""Executes a task on an existing cluster.
|
|
@@ -774,4 +780,5 @@ def exec( # pylint: disable=redefined-builtin
|
|
|
774
780
|
],
|
|
775
781
|
cluster_name=cluster_name,
|
|
776
782
|
detach_run=True,
|
|
783
|
+
job_logger=job_logger,
|
|
777
784
|
)
|
sky/jobs/constants.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Constants used for Managed Jobs."""
|
|
2
|
+
import os
|
|
2
3
|
from typing import Any, Dict, Union
|
|
3
4
|
|
|
4
5
|
from sky.skylet import constants as skylet_constants
|
|
@@ -9,6 +10,8 @@ JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
|
|
|
9
10
|
|
|
10
11
|
JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
|
|
11
12
|
|
|
13
|
+
CONSOLIDATED_SIGNAL_PATH = os.path.expanduser('~/.sky/signals/')
|
|
14
|
+
SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
|
|
12
15
|
# Resources as a dict for the jobs controller.
|
|
13
16
|
# Use smaller CPU instance type for jobs controller, but with more memory, i.e.
|
|
14
17
|
# r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
|