skypilot-nightly 1.0.0.dev20250826__py3-none-any.whl → 1.0.0.dev20250828__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Files changed (83)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +11 -10
  3. sky/authentication.py +4 -10
  4. sky/backends/backend.py +3 -5
  5. sky/backends/backend_utils.py +41 -56
  6. sky/backends/cloud_vm_ray_backend.py +13 -24
  7. sky/backends/local_docker_backend.py +3 -8
  8. sky/client/cli/command.py +43 -10
  9. sky/client/common.py +41 -14
  10. sky/client/sdk.py +24 -9
  11. sky/client/sdk_async.py +6 -2
  12. sky/clouds/aws.py +1 -1
  13. sky/clouds/cloud.py +15 -0
  14. sky/clouds/kubernetes.py +27 -0
  15. sky/clouds/ssh.py +2 -3
  16. sky/core.py +1 -4
  17. sky/dashboard/out/404.html +1 -1
  18. sky/dashboard/out/_next/static/chunks/{webpack-6e76f636a048e145.js → webpack-6dae1cd599a34def.js} +1 -1
  19. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  20. sky/dashboard/out/clusters/[cluster].html +1 -1
  21. sky/dashboard/out/clusters.html +1 -1
  22. sky/dashboard/out/config.html +1 -1
  23. sky/dashboard/out/index.html +1 -1
  24. sky/dashboard/out/infra/[context].html +1 -1
  25. sky/dashboard/out/infra.html +1 -1
  26. sky/dashboard/out/jobs/[job].html +1 -1
  27. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  28. sky/dashboard/out/jobs.html +1 -1
  29. sky/dashboard/out/users.html +1 -1
  30. sky/dashboard/out/volumes.html +1 -1
  31. sky/dashboard/out/workspace/new.html +1 -1
  32. sky/dashboard/out/workspaces/[name].html +1 -1
  33. sky/dashboard/out/workspaces.html +1 -1
  34. sky/global_user_state.py +127 -23
  35. sky/jobs/client/sdk.py +5 -2
  36. sky/jobs/recovery_strategy.py +9 -4
  37. sky/logs/agent.py +2 -2
  38. sky/logs/aws.py +6 -3
  39. sky/provision/do/utils.py +2 -1
  40. sky/provision/kubernetes/config.py +2 -8
  41. sky/provision/kubernetes/instance.py +58 -8
  42. sky/provision/kubernetes/network_utils.py +3 -4
  43. sky/provision/kubernetes/utils.py +8 -7
  44. sky/provision/nebius/utils.py +51 -9
  45. sky/provision/vsphere/vsphere_utils.py +2 -8
  46. sky/schemas/api/responses.py +7 -0
  47. sky/serve/client/impl.py +5 -4
  48. sky/serve/replica_managers.py +4 -3
  49. sky/serve/serve_utils.py +4 -4
  50. sky/serve/server/impl.py +3 -2
  51. sky/serve/service_spec.py +2 -8
  52. sky/server/auth/authn.py +4 -0
  53. sky/server/auth/oauth2_proxy.py +10 -4
  54. sky/server/common.py +10 -3
  55. sky/server/daemons.py +10 -5
  56. sky/server/requests/executor.py +6 -1
  57. sky/server/requests/requests.py +21 -0
  58. sky/server/server.py +34 -33
  59. sky/server/uvicorn.py +33 -0
  60. sky/setup_files/dependencies.py +1 -0
  61. sky/sky_logging.py +4 -1
  62. sky/skylet/events.py +4 -5
  63. sky/skypilot_config.py +14 -12
  64. sky/ssh_node_pools/core.py +3 -1
  65. sky/task.py +4 -10
  66. sky/templates/nebius-ray.yml.j2 +4 -8
  67. sky/usage/usage_lib.py +3 -2
  68. sky/users/server.py +6 -6
  69. sky/utils/common_utils.py +0 -71
  70. sky/utils/controller_utils.py +4 -3
  71. sky/utils/dag_utils.py +4 -4
  72. sky/utils/kubernetes/config_map_utils.py +3 -3
  73. sky/utils/schemas.py +3 -0
  74. sky/utils/yaml_utils.py +102 -0
  75. sky/volumes/volume.py +8 -3
  76. {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/METADATA +2 -1
  77. {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/RECORD +83 -82
  78. /sky/dashboard/out/_next/static/{TPMkEeuj85tHTmIW7Gu3S → 9DW6d9jaP2kZt0NcgIfFa}/_buildManifest.js +0 -0
  79. /sky/dashboard/out/_next/static/{TPMkEeuj85tHTmIW7Gu3S → 9DW6d9jaP2kZt0NcgIfFa}/_ssgManifest.js +0 -0
  80. {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/WHEEL +0 -0
  81. {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/entry_points.txt +0 -0
  82. {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/licenses/LICENSE +0 -0
  83. {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/top_level.txt +0 -0
sky/dashboard/out/workspace/new.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/TPMkEeuj85tHTmIW7Gu3S/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/TPMkEeuj85tHTmIW7Gu3S/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"TPMkEeuj85tHTmIW7Gu3S","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6dae1cd599a34def.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/9DW6d9jaP2kZt0NcgIfFa/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/9DW6d9jaP2kZt0NcgIfFa/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"9DW6d9jaP2kZt0NcgIfFa","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspaces/[name].html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7205-88191679e7988c57.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4a6f1a928fb6d370.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-8afcf719ea87debc.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/3015-6c9c09593b1e67b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-de06e613e20bc977.js" defer=""></script><script src="/dashboard/_next/static/TPMkEeuj85tHTmIW7Gu3S/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/TPMkEeuj85tHTmIW7Gu3S/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"TPMkEeuj85tHTmIW7Gu3S","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6dae1cd599a34def.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7205-88191679e7988c57.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4a6f1a928fb6d370.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-8afcf719ea87debc.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/3015-6c9c09593b1e67b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-de06e613e20bc977.js" defer=""></script><script src="/dashboard/_next/static/9DW6d9jaP2kZt0NcgIfFa/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/9DW6d9jaP2kZt0NcgIfFa/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"9DW6d9jaP2kZt0NcgIfFa","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspaces.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-be35b22e2046564c.js" defer=""></script><script src="/dashboard/_next/static/TPMkEeuj85tHTmIW7Gu3S/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/TPMkEeuj85tHTmIW7Gu3S/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"TPMkEeuj85tHTmIW7Gu3S","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6dae1cd599a34def.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-be35b22e2046564c.js" defer=""></script><script src="/dashboard/_next/static/9DW6d9jaP2kZt0NcgIfFa/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/9DW6d9jaP2kZt0NcgIfFa/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"9DW6d9jaP2kZt0NcgIfFa","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/global_user_state.py CHANGED
@@ -25,7 +25,6 @@ from sqlalchemy import orm
  from sqlalchemy.dialects import postgresql
  from sqlalchemy.dialects import sqlite
  from sqlalchemy.ext import declarative
- import yaml

  from sky import models
  from sky import sky_logging
@@ -35,6 +34,7 @@ from sky.utils import common_utils
  from sky.utils import context_utils
  from sky.utils import registry
  from sky.utils import status_lib
+ from sky.utils import yaml_utils
  from sky.utils.db import db_utils
  from sky.utils.db import migration_utils

@@ -53,6 +53,7 @@ _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
  _SQLALCHEMY_ENGINE_LOCK = threading.Lock()

  DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
+ DEBUG_CLUSTER_EVENT_RETENTION_HOURS = 30 * 24.0
  MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS = 3600

  _UNIQUE_CONSTRAINT_FAILED_ERROR_MSGS = [
@@ -433,6 +434,20 @@ def get_user(user_id: str) -> Optional[models.User]:
                           created_at=row.created_at)


+ @_init_db
+ def _get_users(user_ids: Set[str]) -> Dict[str, models.User]:
+     assert _SQLALCHEMY_ENGINE is not None
+     with orm.Session(_SQLALCHEMY_ENGINE) as session:
+         rows = session.query(user_table).filter(
+             user_table.c.id.in_(user_ids)).all()
+         return {
+             row.id: models.User(id=row.id,
+                                 name=row.name,
+                                 password=row.password,
+                                 created_at=row.created_at) for row in rows
+         }
+
+
  @_init_db
  def get_user_by_name(username: str) -> List[models.User]:
      with orm.Session(_SQLALCHEMY_ENGINE) as session:
@@ -581,7 +596,7 @@ def add_or_update_cluster(cluster_name: str,
          if (is_launch and not cluster_row or
                  cluster_row.status != status_lib.ClusterStatus.UP.value):
              conditional_values.update({
-                 'last_creation_yaml': common_utils.dump_yaml_str(task_config)
+                 'last_creation_yaml': yaml_utils.dump_yaml_str(task_config)
                                        if task_config else None,
                  'last_creation_command': last_use,
              })
@@ -767,12 +782,41 @@ def get_last_cluster_event(cluster_hash: str,
          return row.reason


- def cleanup_cluster_events_with_retention(retention_hours: float) -> None:
+ def _get_last_cluster_event_multiple(
+         cluster_hashes: Set[str],
+         event_type: ClusterEventType) -> Dict[str, str]:
+     assert _SQLALCHEMY_ENGINE is not None
+     with orm.Session(_SQLALCHEMY_ENGINE) as session:
+         # Use a subquery to get the latest event for each cluster_hash
+         latest_events = session.query(
+             cluster_event_table.c.cluster_hash,
+             sqlalchemy.func.max(cluster_event_table.c.transitioned_at).label(
+                 'max_time')).filter(
+                     cluster_event_table.c.cluster_hash.in_(cluster_hashes),
+                     cluster_event_table.c.type == event_type.value).group_by(
+                         cluster_event_table.c.cluster_hash).subquery()
+
+         # Join with original table to get the full event details
+         rows = session.query(cluster_event_table).join(
+             latest_events,
+             sqlalchemy.and_(
+                 cluster_event_table.c.cluster_hash ==
+                 latest_events.c.cluster_hash,
+                 cluster_event_table.c.transitioned_at ==
+                 latest_events.c.max_time)).all()
+
+         return {row.cluster_hash: row.reason for row in rows}
+
+
+ def cleanup_cluster_events_with_retention(retention_hours: float,
+                                           event_type: ClusterEventType) -> None:
      assert _SQLALCHEMY_ENGINE is not None
+     # Once for events with type STATUS_CHANGE.
      with orm.Session(_SQLALCHEMY_ENGINE) as session:
          query = session.query(cluster_event_table).filter(
-             cluster_event_table.c.transitioned_at < time.time() -
-             retention_hours * 3600)
+             cluster_event_table.c.transitioned_at <
+             time.time() - retention_hours * 3600,
+             cluster_event_table.c.type == event_type.value)
          logger.debug(f'Deleting {query.count()} cluster events.')
          query.delete()
          session.commit()
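Reviewer note: `_get_last_cluster_event_multiple` is the classic greatest-n-per-group query. A self-contained sketch of the same SQLAlchemy pattern, with illustrative table and column names (not SkyPilot's actual schema):

```python
import sqlalchemy
from sqlalchemy import orm

# Illustrative stand-in for cluster_event_table; the real table has more
# columns (type, starting/ending status, ...).
metadata = sqlalchemy.MetaData()
events = sqlalchemy.Table(
    'events', metadata,
    sqlalchemy.Column('cluster_hash', sqlalchemy.String),
    sqlalchemy.Column('transitioned_at', sqlalchemy.Integer),
    sqlalchemy.Column('reason', sqlalchemy.String))


def latest_reason_per_cluster(session: orm.Session, hashes: set) -> dict:
    # Step 1: per-group maximum timestamp (the "greatest" per cluster_hash).
    latest = session.query(
        events.c.cluster_hash,
        sqlalchemy.func.max(events.c.transitioned_at).label('max_time')
    ).filter(events.c.cluster_hash.in_(hashes)).group_by(
        events.c.cluster_hash).subquery()
    # Step 2: join back to recover the full row behind each per-group maximum.
    rows = session.query(events).join(
        latest,
        sqlalchemy.and_(events.c.cluster_hash == latest.c.cluster_hash,
                        events.c.transitioned_at == latest.c.max_time)).all()
    return {row.cluster_hash: row.reason for row in rows}
```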
@@ -787,9 +831,20 @@ async def cluster_event_retention_daemon():
          retention_hours = skypilot_config.get_nested(
              ('api_server', 'cluster_event_retention_hours'),
              DEFAULT_CLUSTER_EVENT_RETENTION_HOURS)
+         debug_retention_hours = skypilot_config.get_nested(
+             ('api_server', 'cluster_debug_event_retention_hours'),
+             DEBUG_CLUSTER_EVENT_RETENTION_HOURS)
          try:
              if retention_hours >= 0:
-                 cleanup_cluster_events_with_retention(retention_hours)
+                 logger.debug('Cleaning up cluster events with retention '
+                              f'{retention_hours} hours.')
+                 cleanup_cluster_events_with_retention(
+                     retention_hours, ClusterEventType.STATUS_CHANGE)
+             if debug_retention_hours >= 0:
+                 logger.debug('Cleaning up debug cluster events with retention '
+                              f'{debug_retention_hours} hours.')
+                 cleanup_cluster_events_with_retention(debug_retention_hours,
+                                                       ClusterEventType.DEBUG)
          except asyncio.CancelledError:
              logger.info('Cluster event retention daemon cancelled')
              break
@@ -797,8 +852,9 @@ async def cluster_event_retention_daemon():
              logger.error(f'Error running cluster event retention daemon: {e}')

          # Run daemon at most once every hour to avoid too frequent cleanup.
-         sleep_amount = max(retention_hours * 3600,
-                            MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)
+         sleep_amount = max(
+             min(retention_hours * 3600, debug_retention_hours * 3600),
+             MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)
          await asyncio.sleep(sleep_amount)

@@ -864,8 +920,7 @@ def update_last_use(cluster_name: str):


  @_init_db
- def remove_cluster(cluster_name: str, terminate: bool,
-                    remove_events: bool) -> None:
+ def remove_cluster(cluster_name: str, terminate: bool) -> None:
      """Removes cluster_name mapping."""
      assert _SQLALCHEMY_ENGINE is not None
      cluster_hash = _get_hash_for_existing_cluster(cluster_name)
@@ -893,9 +948,6 @@ def remove_cluster(cluster_name: str, terminate: bool,

          if terminate:
              session.query(cluster_table).filter_by(name=cluster_name).delete()
-             if remove_events:
-                 session.query(cluster_event_table).filter_by(
-                     cluster_hash=cluster_hash).delete()
          else:
              handle = get_handle_from_cluster_name(cluster_name)
              if handle is None:
@@ -1266,18 +1318,70 @@ def get_cluster_from_name(


  @_init_db
- def get_clusters() -> List[Dict[str, Any]]:
-     assert _SQLALCHEMY_ENGINE is not None
-     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-         rows = session.query(cluster_table).order_by(
-             sqlalchemy.desc(cluster_table.c.launched_at)).all()
+ def get_clusters(
+         *,  # keyword-only separator
+         exclude_managed_clusters: bool = False,
+         workspaces_filter: Optional[Set[str]] = None,
+         user_hashes_filter: Optional[Set[str]] = None,
+ ) -> List[Dict[str, Any]]:
+     """Get clusters from the database.
+
+     Args:
+         exclude_managed_clusters: If True, exclude clusters that have the
+             is_managed field set to True.
+         workspaces_filter: If specified, only include clusters whose
+             workspace field is set to one of the values.
+         user_hashes_filter: If specified, only include clusters whose
+             user_hash field is set to one of the values.
+     """
+     # If a cluster has a null user_hash,
+     # we treat it as belonging to the current user.
+     current_user_hash = common_utils.get_user_hash()
+     assert _SQLALCHEMY_ENGINE is not None
+     with orm.Session(_SQLALCHEMY_ENGINE) as session:
+         query = session.query(cluster_table)
+         if exclude_managed_clusters:
+             query = query.filter(cluster_table.c.is_managed == int(False))
+         if workspaces_filter is not None:
+             query = query.filter(
+                 cluster_table.c.workspace.in_(workspaces_filter))
+         if user_hashes_filter is not None:
+             if current_user_hash in user_hashes_filter:
+                 # Backwards compatibility for old clusters:
+                 # if current_user_hash is in user_hashes_filter, we include
+                 # clusters that have a null user_hash.
+                 query = query.filter(
+                     cluster_table.c.user_hash.in_(user_hashes_filter) |
+                     (cluster_table.c.user_hash is None))
+             else:
+                 query = query.filter(
+                     cluster_table.c.user_hash.in_(user_hashes_filter))
+         query = query.order_by(sqlalchemy.desc(cluster_table.c.launched_at))
+         rows = query.all()
      records = []
+
+     # get user hash for each row
+     row_to_user_hash = {}
      for row in rows:
-         user_hash = _get_user_hash_or_current_user(row.user_hash)
-         user = get_user(user_hash)
+         user_hash = (row.user_hash
+                      if row.user_hash is not None else current_user_hash)
+         row_to_user_hash[row.cluster_hash] = user_hash
+
+     # get all users needed for the rows at once
+     user_hashes = set(row_to_user_hash.values())
+     user_hash_to_user = _get_users(user_hashes)
+
+     # get the last cluster event for each row
+     cluster_hashes = set(row_to_user_hash.keys())
+     last_cluster_event_dict = _get_last_cluster_event_multiple(
+         cluster_hashes, ClusterEventType.STATUS_CHANGE)
+
+     # get the user for each row
+     for row in rows:
+         user_hash = row_to_user_hash[row.cluster_hash]
+         user = user_hash_to_user.get(user_hash, None)
          user_name = user.name if user is not None else None
-         last_event = get_last_cluster_event(
-             row.cluster_hash, event_type=ClusterEventType.STATUS_CHANGE)
+         last_event = last_cluster_event_dict.get(row.cluster_hash, None)
          # TODO: use namedtuple instead of dict
          record = {
              'name': row.name,
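Reviewer note: the rewrite above replaces an N+1 access pattern (one `get_user` call and one `get_last_cluster_event` query per cluster row) with two batched queries over the collected hashes. One caveat for anyone adapting the null-tolerant filter: in SQLAlchemy, a NULL check must be spelled `.is_(None)`; a plain `is` comparison against a Column object is evaluated by Python itself and never generates `IS NULL` SQL. A hedged sketch of the intended filter:

```python
# Sketch only: cluster_table stands in for SkyPilot's real table object.
# column.is_(None) renders as `user_hash IS NULL` in the emitted SQL,
# whereas `column is None` is an ordinary Python identity test (False here)
# and would silently drop the backwards-compatibility branch.
query = query.filter(
    cluster_table.c.user_hash.in_(user_hashes_filter) |
    cluster_table.c.user_hash.is_(None))
```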
@@ -1999,7 +2103,7 @@ def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
      yaml_str = get_cluster_yaml_str(cluster_yaml_path)
      if yaml_str is None:
          raise ValueError(f'Cluster yaml {cluster_yaml_path} not found.')
-     return yaml.safe_load(yaml_str)
+     return yaml_utils.safe_load(yaml_str)


  @_init_db
sky/jobs/client/sdk.py CHANGED
@@ -243,7 +243,7 @@ def tail_logs(name: Optional[str] = None,
                controller: bool = False,
                refresh: bool = False,
                tail: Optional[int] = None,
-               output_stream: Optional['io.TextIOBase'] = None) -> int:
+               output_stream: Optional['io.TextIOBase'] = None) -> Optional[int]:
      """Tails logs of managed jobs.

      You can provide either a job name or a job ID to tail logs. If both are not
@@ -263,6 +263,8 @@ def tail_logs(name: Optional[str] = None,
          Exit code based on success or failure of the job. 0 if success,
          100 if the job failed. See exceptions.JobExitCode for possible exit
          codes.
+         Will return None if follow is False
+         (see note in sky/client/sdk.py::stream_response)

      Request Raises:
          ValueError: invalid arguments.
@@ -289,7 +291,8 @@ def tail_logs(name: Optional[str] = None,
      return sdk.stream_response(request_id=request_id,
                                 response=response,
                                 output_stream=output_stream,
-                                resumable=(tail == 0))
+                                resumable=(tail == 0),
+                                get_result=follow)


  @usage_lib.entrypoint
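Reviewer note: taken together, the three `tail_logs` hunks mean a non-following tail can now return `None` instead of a job exit code. A hypothetical caller sketch (argument names follow the signature above; the job name is made up):

```python
from sky.jobs.client import sdk as jobs_sdk

# follow=True: blocks until the job finishes and returns an exit code
# (0 on success, 100 on failure; see exceptions.JobExitCode).
exit_code = jobs_sdk.tail_logs(name='my-job', follow=True)

# follow=False: streams the current log tail and returns None, per the
# docstring note added above.
maybe_code = jobs_sdk.tail_logs(name='my-job', follow=False)
assert maybe_code is None or isinstance(maybe_code, int)
```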
sky/jobs/recovery_strategy.py CHANGED
@@ -327,10 +327,15 @@ class StrategyExecutor:
                  cluster_name=self.cluster_name,
                  # We expect to tear down the cluster as soon as
                  # the job is finished. However, in case the
-                 # controller dies, set autodown to try and avoid
-                 # a resource leak.
-                 idle_minutes_to_autostop=_AUTODOWN_MINUTES,
-                 down=True,
+                 # controller dies, we may end up with a
+                 # resource leak.
+                 # Ideally, we should autodown to be safe,
+                 # but it's fine to disable it for now, as
+                 # Nebius doesn't support autodown yet.
+                 # TODO(kevin): set down=True once Nebius
+                 # supports autodown.
+                 # idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                 # down=True,
                  _is_launched_by_jobs_controller=True)
          else:
              self.cluster_name = (
sky/logs/agent.py CHANGED
@@ -5,8 +5,8 @@ import shlex
  from typing import Any, Dict

  from sky.skylet import constants
- from sky.utils import common_utils
  from sky.utils import resources_utils
+ from sky.utils import yaml_utils


  class LoggingAgent(abc.ABC):
@@ -65,7 +65,7 @@ class FluentbitAgent(LoggingAgent):
                  'outputs': [self.fluentbit_output_config(cluster_name)],
              }
          }
-         return common_utils.dump_yaml_str(cfg_dict)
+         return yaml_utils.dump_yaml_str(cfg_dict)

      @abc.abstractmethod
      def fluentbit_output_config(
sky/logs/aws.py CHANGED
@@ -6,8 +6,8 @@ import pydantic

  from sky.logs.agent import FluentbitAgent
  from sky.skylet import constants
- from sky.utils import common_utils
  from sky.utils import resources_utils
+ from sky.utils import yaml_utils

  EC2_MD_URL = '"${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"'

@@ -130,7 +130,10 @@ class CloudwatchLoggingAgent(FluentbitAgent):

          # If region is specified, set it in the environment
          if self.config.region:
-             pre_cmd += f' export AWS_REGION={self.config.region};'
+             pre_cmd += (f' export AWS_REGION={self.config.region}'
+                         f' AWS_DEFAULT_REGION={self.config.region};'
+                         ' command -v aws &>/dev/null && '
+                         f'aws configure set region {self.config.region};')
          else:
              # If region is not specified, check if it's available in
              # the environment or credentials file
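Reviewer note: exporting both AWS_REGION and AWS_DEFAULT_REGION covers SDKs that only read one of the two variables, and `command -v aws` guards the `aws configure` call on hosts without the CLI installed. For a concrete picture, the string rendered with an assumed region of `us-east-1` (the region value is illustrative):

```python
region = 'us-east-1'  # assumed example value
pre_cmd = (f' export AWS_REGION={region}'
           f' AWS_DEFAULT_REGION={region};'
           ' command -v aws &>/dev/null && '
           f'aws configure set region {region};')
print(pre_cmd)
# export AWS_REGION=us-east-1 AWS_DEFAULT_REGION=us-east-1;
# command -v aws &>/dev/null && aws configure set region us-east-1;
```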
@@ -213,7 +216,7 @@ class CloudwatchLoggingAgent(FluentbitAgent):
              }
          }

-         return common_utils.dump_yaml_str(cfg_dict)
+         return yaml_utils.dump_yaml_str(cfg_dict)

      def fluentbit_output_config(
              self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
sky/provision/do/utils.py CHANGED
@@ -17,6 +17,7 @@ from sky.provision import constants as provision_constants
  from sky.provision.do import constants
  from sky.utils import annotations
  from sky.utils import common_utils
+ from sky.utils import yaml_utils

  logger = sky_logging.init_logger(__name__)

@@ -61,7 +62,7 @@ def _init_client():
      if get_credentials_path() is None:
          raise DigitalOceanError(
              'No credentials found, please run `doctl auth init`')
-     credentials = common_utils.read_yaml(get_credentials_path())
+     credentials = yaml_utils.read_yaml(get_credentials_path())
      default_token = credentials.get('access-token', None)
      if default_token is not None:
          try:
sky/provision/kubernetes/config.py CHANGED
@@ -3,20 +3,14 @@ import copy
  import logging
  import math
  import os
- import typing
  from typing import Any, Dict, Optional, Union

- from sky.adaptors import common as adaptors_common
  from sky.adaptors import kubernetes
  from sky.provision import common
  from sky.provision.kubernetes import network_utils
  from sky.provision.kubernetes import utils as kubernetes_utils
  from sky.utils import kubernetes_enums
-
- if typing.TYPE_CHECKING:
-     import yaml
- else:
-     yaml = adaptors_common.LazyImport('yaml')
+ from sky.utils import yaml_utils

  logger = logging.getLogger(__name__)

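Reviewer note: the recurring change across this release is routing YAML calls through the new `sky/utils/yaml_utils.py` (+102 lines per the file list) instead of importing `yaml` at each call site. That module's body isn't shown in this diff; a plausible minimal shape, assuming it keeps the lazy-import behavior the deleted block had, would be:

```python
"""Hypothetical sketch of sky/utils/yaml_utils.py (not shown in this diff)."""
import typing
from typing import Any, Dict

if typing.TYPE_CHECKING:
    import yaml
else:
    from sky.adaptors import common as adaptors_common
    yaml = adaptors_common.LazyImport('yaml')  # defer the pyyaml import


def safe_load(stream: Any) -> Any:
    return yaml.safe_load(stream)


def dump_yaml_str(config: Dict[str, Any]) -> str:
    return yaml.dump(config, default_flow_style=False)


def read_yaml(path: str) -> Any:
    with open(path, 'r', encoding='utf-8') as f:
        return safe_load(f)
```

Centralizing the wrapper keeps the lazy import (and any future loader hardening) in one place instead of repeating LazyImport boilerplate per module.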
 
@@ -592,7 +586,7 @@ def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
592
586
  daemonset_path = os.path.join(
593
587
  root_dir, 'kubernetes/manifests/fusermount-server-daemonset.yaml')
594
588
  with open(daemonset_path, 'r', encoding='utf-8') as file:
595
- daemonset = yaml.safe_load(file)
589
+ daemonset = yaml_utils.safe_load(file)
596
590
  kubernetes_utils.merge_custom_metadata(daemonset['metadata'])
597
591
  try:
598
592
  kubernetes.apps_api(context).create_namespaced_daemon_set(
sky/provision/kubernetes/instance.py CHANGED
@@ -1,5 +1,6 @@
  """Kubernetes instance provisioning."""
  import copy
+ import datetime
  import json
  import re
  import time
@@ -1254,9 +1255,11 @@ def get_cluster_info(
          provider_config=provider_config)


- def _get_pod_termination_reason(pod: Any) -> str:
+ def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
+     """Get pod termination reason and write to cluster events."""
      reasons = []
-     if pod.status.container_statuses:
+     latest_timestamp = pod.status.start_time or datetime.datetime.min
+     if pod.status and pod.status.container_statuses:
          for container_status in pod.status.container_statuses:
              terminated = container_status.state.terminated
              if terminated:
@@ -1264,20 +1267,38 @@ def _get_pod_termination_reason(pod: Any) -> str:
                  reason = terminated.reason
                  if exit_code == 0:
                      # skip exit 0 (non-failed) just for sanity
+                     logger.debug(f'{pod.metadata.name}/{container_status.name} '
+                                  'had exit code 0. Skipping.')
                      continue
                  if reason is None:
                      # just in-case reason is None, have default for debugging
                      reason = f'exit({exit_code})'
                  reasons.append(reason)
+                 if terminated.finished_at > latest_timestamp:
+                     latest_timestamp = terminated.finished_at
+
      # TODO (kyuds): later, if needed, query `last_state` too.

+     if not reasons:
+         return ''
+
      # Normally we will have a single container per pod for skypilot
      # but doing this just in-case there are multiple containers.
-     return ' | '.join(reasons)
+     pod_reason = ' | '.join(reasons)
+
+     global_user_state.add_cluster_event(
+         cluster_name,
+         None,
+         f'[kubernetes pod {pod.metadata.name} terminated] {pod_reason}',
+         global_user_state.ClusterEventType.DEBUG,
+         transitioned_at=int(latest_timestamp.timestamp()),
+     )
+     return pod_reason


  def _get_pod_missing_reason(context: Optional[str], namespace: str,
                              cluster_name: str, pod_name: str) -> Optional[str]:
+     """Get events for missing pod and write to cluster events."""
      logger.debug(f'Analyzing events for pod {pod_name}')
      pod_field_selector = (
          f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
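Reviewer note: the attributes walked in `_get_pod_termination_reason` come from the official Kubernetes Python client, where `container_status.state.terminated` is a `V1ContainerStateTerminated` carrying `exit_code`, `reason`, and `finished_at`. A condensed sketch of the traversal, mirroring the function above but without the cluster-event write:

```python
def summarize_terminations(pod) -> str:
    """Join non-zero container termination reasons for one pod object."""
    reasons = []
    statuses = (pod.status.container_statuses
                if pod.status and pod.status.container_statuses else [])
    for cs in statuses:
        term = cs.state.terminated  # V1ContainerStateTerminated or None
        if term is None or term.exit_code == 0:
            # Running/waiting containers and clean exits carry no failure info.
            continue
        reasons.append(term.reason or f'exit({term.exit_code})')
    return ' | '.join(reasons)
```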
@@ -1293,6 +1314,8 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
      last_scheduled_node = None
      insert_new_pod_event = True
      new_event_inserted = False
+     inserted_pod_events = 0
+
      for event in pod_events:
          if event.reason == 'Scheduled':
              pattern = r'Successfully assigned (\S+) to (\S+)'
@@ -1313,10 +1336,18 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
                      transitioned_at=int(
                          event.metadata.creation_timestamp.timestamp()),
                      expose_duplicate_error=True)
+                 logger.debug(f'[pod {pod_name}] encountered new pod event: '
+                              f'{event.metadata.creation_timestamp} '
+                              f'{event.reason} {event.message}')
              except db_utils.UniqueConstraintViolationError:
                  insert_new_pod_event = False
              else:
                  new_event_inserted = True
+                 inserted_pod_events += 1
+
+     logger.debug(f'[pod {pod_name}] processed {len(pod_events)} pod events and '
+                  f'inserted {inserted_pod_events} new pod events '
+                  'previously unseen')

      if last_scheduled_node is not None:
          node_field_selector = ('involvedObject.kind=Node,'
@@ -1331,6 +1362,7 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
              # latest event appears first
              reverse=True)
          insert_new_node_event = True
+         inserted_node_events = 0
          for event in node_events:
              if insert_new_node_event:
                  # Try inserting the latest events first. If the event is a
@@ -1345,10 +1377,23 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
                          transitioned_at=int(
                              event.metadata.creation_timestamp.timestamp()),
                          expose_duplicate_error=True)
+                     logger.debug(
+                         f'[pod {pod_name}] encountered new node event: '
+                         f'{event.metadata.creation_timestamp} '
+                         f'{event.reason} {event.message}')
                  except db_utils.UniqueConstraintViolationError:
                      insert_new_node_event = False
                  else:
                      new_event_inserted = True
+                     inserted_node_events += 1
+
+         logger.debug(f'[pod {pod_name}: node {last_scheduled_node}] '
+                      f'processed {len(node_events)} node events and '
+                      f'inserted {inserted_node_events} new node events '
+                      'previously unseen')
+     else:
+         logger.debug(f'[pod {pod_name}] could not determine the node '
+                      'the pod was scheduled to')

      if not new_event_inserted:
          # If new event is not inserted, there is no useful information to
@@ -1390,13 +1435,15 @@ def query_instances(
      provider_config: Optional[Dict[str, Any]] = None,
      non_terminated_only: bool = True
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+     # Mapping from pod phase to skypilot status. These are the only valid pod
+     # phases.
+     # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
      status_map = {
          'Pending': status_lib.ClusterStatus.INIT,
          'Running': status_lib.ClusterStatus.UP,
          'Failed': status_lib.ClusterStatus.INIT,
          'Unknown': None,
          'Succeeded': None,
-         'Terminating': None,
      }

      assert provider_config is not None
@@ -1440,12 +1487,15 @@ def query_instances(
      for pod in pods:
          phase = pod.status.phase
          pod_status = status_map[phase]
+         reason = None
+         if phase in ('Failed', 'Unknown'):
+             reason = _get_pod_termination_reason(pod, cluster_name)
+             logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
          if non_terminated_only and pod_status is None:
+             logger.debug(f'Pod {pod.metadata.name} is terminated, but '
+                          'query_instances is called with '
+                          f'non_terminated_only=True. Phase: {phase}')
              continue
-         reason = None
-         if phase == 'Failed':
-             reason = _get_pod_termination_reason(pod)
-             logger.debug(f'Pod Status Reason(s): {reason}')
          pod_name = pod.metadata.name
          reason = f'{pod_name}: {reason}' if reason is not None else None
          cluster_status[pod_name] = (pod_status, reason)
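Reviewer note: the deleted `'Terminating'` entry matches the lifecycle page the new comment links to. `pod.status.phase` can only be Pending, Running, Succeeded, Failed, or Unknown ('Terminating' is a kubectl display state, not a phase), so the direct `status_map[phase]` lookup cannot raise KeyError. A minimal sketch of the classification with stand-in status values:

```python
from typing import Optional

# Stand-in strings; the real code maps to status_lib.ClusterStatus members.
STATUS_MAP = {
    'Pending': 'INIT',
    'Running': 'UP',
    'Failed': 'INIT',
    'Unknown': None,    # no usable status
    'Succeeded': None,  # pod exited; skipped when non_terminated_only=True
}


def classify(phase: str) -> Optional[str]:
    # Safe direct indexing: phase is one of the five spec-defined values.
    return STATUS_MAP[phase]


assert classify('Running') == 'UP'
```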
sky/provision/kubernetes/network_utils.py CHANGED
@@ -13,13 +13,12 @@ from sky.provision.kubernetes import utils as kubernetes_utils
  from sky.utils import directory_utils
  from sky.utils import kubernetes_enums
  from sky.utils import ux_utils
+ from sky.utils import yaml_utils

  if typing.TYPE_CHECKING:
      import jinja2
-     import yaml
  else:
      jinja2 = adaptors_common.LazyImport('jinja2')
-     yaml = adaptors_common.LazyImport('yaml')

  logger = sky_logging.init_logger(__name__)

@@ -108,7 +107,7 @@ def fill_loadbalancer_template(namespace: str, context: Optional[str],
          annotations=annotations,
          labels=labels,
      )
-     content = yaml.safe_load(cont)
+     content = yaml_utils.safe_load(cont)
      return content


@@ -147,7 +146,7 @@ def fill_ingress_template(namespace: str, context: Optional[str],
          annotations=annotations,
          labels=labels,
      )
-     content = yaml.safe_load(cont)
+     content = yaml_utils.safe_load(cont)

      # Return a dictionary containing both specs
      return {