skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250910__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. More details are linked from the original registry page.

Files changed (67):
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -4
  3. sky/backends/backend_utils.py +35 -1
  4. sky/backends/cloud_vm_ray_backend.py +2 -2
  5. sky/client/sdk.py +20 -0
  6. sky/client/sdk_async.py +18 -16
  7. sky/clouds/aws.py +3 -1
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-1d7e11230da3ca89.js} +1 -1
  10. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  11. sky/dashboard/out/clusters/[cluster].html +1 -1
  12. sky/dashboard/out/clusters.html +1 -1
  13. sky/dashboard/out/config.html +1 -1
  14. sky/dashboard/out/index.html +1 -1
  15. sky/dashboard/out/infra/[context].html +1 -1
  16. sky/dashboard/out/infra.html +1 -1
  17. sky/dashboard/out/jobs/[job].html +1 -1
  18. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/data/storage.py +5 -1
  26. sky/execution.py +21 -14
  27. sky/jobs/constants.py +3 -0
  28. sky/jobs/controller.py +732 -310
  29. sky/jobs/recovery_strategy.py +251 -129
  30. sky/jobs/scheduler.py +247 -174
  31. sky/jobs/server/core.py +20 -4
  32. sky/jobs/server/utils.py +2 -2
  33. sky/jobs/state.py +702 -511
  34. sky/jobs/utils.py +94 -39
  35. sky/provision/aws/config.py +4 -1
  36. sky/provision/gcp/config.py +6 -1
  37. sky/provision/kubernetes/utils.py +17 -8
  38. sky/provision/provisioner.py +1 -0
  39. sky/serve/replica_managers.py +0 -7
  40. sky/serve/serve_utils.py +5 -0
  41. sky/serve/server/impl.py +1 -2
  42. sky/serve/service.py +0 -2
  43. sky/server/common.py +8 -3
  44. sky/server/config.py +43 -24
  45. sky/server/constants.py +1 -0
  46. sky/server/daemons.py +7 -11
  47. sky/server/requests/serializers/encoders.py +1 -1
  48. sky/server/server.py +8 -1
  49. sky/setup_files/dependencies.py +4 -2
  50. sky/skylet/attempt_skylet.py +1 -0
  51. sky/skylet/constants.py +3 -1
  52. sky/skylet/events.py +2 -10
  53. sky/utils/command_runner.pyi +3 -3
  54. sky/utils/common_utils.py +11 -1
  55. sky/utils/controller_utils.py +5 -0
  56. sky/utils/db/db_utils.py +31 -2
  57. sky/utils/rich_utils.py +3 -1
  58. sky/utils/subprocess_utils.py +9 -0
  59. sky/volumes/volume.py +2 -0
  60. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/METADATA +39 -37
  61. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/RECORD +67 -67
  62. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_buildManifest.js +0 -0
  63. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_ssgManifest.js +0 -0
  64. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/WHEEL +0 -0
  65. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/entry_points.txt +0 -0
  66. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/licenses/LICENSE +0 -0
  67. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-d4fabc08788e14af.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7598c33a746cdc91.js" defer=""></script><script src="/dashboard/_next/static/eWytLgin5zvayQw3Xk46m/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/eWytLgin5zvayQw3Xk46m/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"eWytLgin5zvayQw3Xk46m","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1d7e11230da3ca89.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7598c33a746cdc91.js" defer=""></script><script src="/dashboard/_next/static/3SYxqNGnvvPS8h3gdD2T7/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/3SYxqNGnvvPS8h3gdD2T7/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"3SYxqNGnvvPS8h3gdD2T7","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/data/storage.py CHANGED
@@ -2700,7 +2700,11 @@ class AzureBlobStore(AbstractStore):
2700
2700
  name=override_args.get('name', metadata.name),
2701
2701
  storage_account_name=override_args.get(
2702
2702
  'storage_account', metadata.storage_account_name),
2703
- source=override_args.get('source', metadata.source),
2703
+ # TODO(cooperc): fix the types for mypy 1.16
2704
+ # Azure store expects a string path; metadata.source may be a Path
2705
+ # or List[Path].
2706
+ source=override_args.get('source',
2707
+ metadata.source), # type: ignore[arg-type]
2704
2708
  region=override_args.get('region', metadata.region),
2705
2709
  is_sky_managed=override_args.get('is_sky_managed',
2706
2710
  metadata.is_sky_managed),
sky/execution.py CHANGED
@@ -3,6 +3,7 @@
3
3
  See `Stage` for a Task's life cycle.
4
4
  """
5
5
  import enum
6
+ import logging
6
7
  import typing
7
8
  from typing import List, Optional, Tuple, Union
8
9
 
@@ -120,6 +121,7 @@ def _execute(
120
121
  _quiet_optimizer: bool = False,
121
122
  _is_launched_by_jobs_controller: bool = False,
122
123
  _is_launched_by_sky_serve_controller: bool = False,
124
+ job_logger: logging.Logger = logger,
123
125
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
124
126
  """Execute an entrypoint.
125
127
 
@@ -221,7 +223,8 @@ def _execute(
221
223
  _quiet_optimizer=_quiet_optimizer,
222
224
  _is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
223
225
  _is_launched_by_sky_serve_controller=
224
- _is_launched_by_sky_serve_controller)
226
+ _is_launched_by_sky_serve_controller,
227
+ job_logger=job_logger)
225
228
 
226
229
 
227
230
  def _execute_dag(
@@ -243,6 +246,7 @@ def _execute_dag(
243
246
  _quiet_optimizer: bool,
244
247
  _is_launched_by_jobs_controller: bool,
245
248
  _is_launched_by_sky_serve_controller: bool,
249
+ job_logger: logging.Logger = logger,
246
250
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
247
251
  """Execute a DAG.
248
252
 
@@ -253,7 +257,7 @@ def _execute_dag(
253
257
  task = dag.tasks[0]
254
258
 
255
259
  if any(r.job_recovery is not None for r in task.resources):
256
- logger.warning(
260
+ job_logger.warning(
257
261
  f'{colorama.Style.DIM}The task has `job_recovery` specified, '
258
262
  'but is launched as an unmanaged job. It will be ignored.'
259
263
  'To enable job recovery, use managed jobs: sky jobs launch.'
@@ -334,10 +338,10 @@ def _execute_dag(
334
338
  # itself have no task running and start the auto{stop,down}
335
339
  # process, before the task is submitted in the EXEC stage.
336
340
  verb = 'torn down' if down else 'stopped'
337
- logger.info(f'{colorama.Style.DIM}The cluster will '
338
- f'be {verb} after 1 minutes of idleness '
339
- '(after all jobs finish).'
340
- f'{colorama.Style.RESET_ALL}')
341
+ job_logger.info(f'{colorama.Style.DIM}The cluster will '
342
+ f'be {verb} after 1 minutes of idleness '
343
+ '(after all jobs finish).'
344
+ f'{colorama.Style.RESET_ALL}')
341
345
  idle_minutes_to_autostop = 1
342
346
  if Stage.DOWN in stages:
343
347
  stages.remove(Stage.DOWN)
@@ -366,7 +370,7 @@ def _execute_dag(
366
370
  yellow = colorama.Fore.YELLOW
367
371
  bold = colorama.Style.BRIGHT
368
372
  reset = colorama.Style.RESET_ALL
369
- logger.info(
373
+ job_logger.info(
370
374
  f'{yellow}Launching a spot job that does not '
371
375
  f'automatically recover from preemptions. To '
372
376
  'get automatic recovery, use managed job instead: '
@@ -385,7 +389,7 @@ def _execute_dag(
385
389
  controller = controller_utils.Controllers.from_name(
386
390
  cluster_name)
387
391
  if controller is not None:
388
- logger.info(
392
+ job_logger.info(
389
393
  f'Choosing resources for {controller.value.name}...'
390
394
  )
391
395
  dag = optimizer.Optimizer.optimize(dag,
@@ -427,7 +431,7 @@ def _execute_dag(
427
431
  if handle is None:
428
432
  assert dryrun, ('If not dryrun, handle must be set or '
429
433
  'Stage.PROVISION must be included in stages.')
430
- logger.info('Dryrun finished.')
434
+ job_logger.info('Dryrun finished.')
431
435
  return None, None
432
436
 
433
437
  do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
@@ -436,7 +440,7 @@ def _execute_dag(
436
440
  (task.file_mounts is not None or
437
441
  task.storage_mounts is not None))
438
442
  if do_workdir or do_file_mounts:
439
- logger.info(ux_utils.starting_message('Syncing files.'))
443
+ job_logger.info(ux_utils.starting_message('Syncing files.'))
440
444
 
441
445
  if do_workdir:
442
446
  if cluster_name is not None:
@@ -456,11 +460,11 @@ def _execute_dag(
456
460
  task.storage_mounts)
457
461
 
458
462
  if no_setup:
459
- logger.info('Setup commands skipped.')
463
+ job_logger.info('Setup commands skipped.')
460
464
  elif Stage.SETUP in stages and not dryrun:
461
465
  if skip_unnecessary_provisioning and provisioning_skipped:
462
- logger.debug('Unnecessary provisioning was skipped, so '
463
- 'skipping setup as well.')
466
+ job_logger.debug('Unnecessary provisioning was skipped, so '
467
+ 'skipping setup as well.')
464
468
  else:
465
469
  if cluster_name is not None:
466
470
  global_user_state.add_cluster_event(
@@ -521,6 +525,7 @@ def launch(
521
525
  _is_launched_by_jobs_controller: bool = False,
522
526
  _is_launched_by_sky_serve_controller: bool = False,
523
527
  _disable_controller_check: bool = False,
528
+ job_logger: logging.Logger = logger,
524
529
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
525
530
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
526
531
  """Launches a cluster or task.
@@ -688,7 +693,7 @@ def launch(
688
693
  _is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
689
694
  _is_launched_by_sky_serve_controller=
690
695
  _is_launched_by_sky_serve_controller,
691
- )
696
+ job_logger=job_logger)
692
697
 
693
698
 
694
699
  @usage_lib.entrypoint
@@ -699,6 +704,7 @@ def exec( # pylint: disable=redefined-builtin
699
704
  down: bool = False,
700
705
  stream_logs: bool = True,
701
706
  backend: Optional[backends.Backend] = None,
707
+ job_logger: logging.Logger = logger,
702
708
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
703
709
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
704
710
  """Executes a task on an existing cluster.
@@ -774,4 +780,5 @@ def exec( # pylint: disable=redefined-builtin
774
780
  ],
775
781
  cluster_name=cluster_name,
776
782
  detach_run=True,
783
+ job_logger=job_logger,
777
784
  )
sky/jobs/constants.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """Constants used for Managed Jobs."""
2
+ import os
2
3
  from typing import Any, Dict, Union
3
4
 
4
5
  from sky.skylet import constants as skylet_constants
@@ -9,6 +10,8 @@ JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
9
10
 
10
11
  JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
11
12
 
13
+ CONSOLIDATED_SIGNAL_PATH = os.path.expanduser('~/.sky/signals/')
14
+ SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
12
15
  # Resources as a dict for the jobs controller.
13
16
  # Use smaller CPU instance type for jobs controller, but with more memory, i.e.
14
17
  # r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,