skypilot-nightly 1.0.0.dev20250820__py3-none-any.whl → 1.0.0.dev20250822__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (64) hide show
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +9 -1
  3. sky/client/cli/command.py +4 -1
  4. sky/client/cli/flags.py +3 -3
  5. sky/client/sdk.py +64 -19
  6. sky/client/sdk_async.py +1 -1
  7. sky/core.py +3 -5
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/{8ZscIHnvBWz3AXkxsJL6H → WD29VpW0S7wsYey0qFBHQ}/_buildManifest.js +1 -1
  10. sky/dashboard/out/_next/static/chunks/3015-6c9c09593b1e67b6.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/{3785.bc5d2853355c9c47.js → 3785.d5b86f6ebc88e6e6.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/{9277.71481d5b2e606e33.js → 4783.c485f48348349f47.js} +8 -3
  13. sky/dashboard/out/_next/static/chunks/{6633-efe924b9b8136699.js → 7205-88191679e7988c57.js} +9 -4
  14. sky/dashboard/out/_next/static/chunks/8969-4a6f1a928fb6d370.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/{8838.e7953f42af2b0544.js → 9946.3b7b43c217ff70ec.js} +9 -4
  16. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-ec747e4f2dc39b57.js → [cluster]-a0527109c2fab467.js} +7 -2
  17. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +11 -0
  18. sky/dashboard/out/_next/static/chunks/pages/{jobs-4b3ba1792dc6f21d.js → jobs-7421e63ac35f8fce.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-65f72dee417237ef.js → [name]-de06e613e20bc977.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/{workspaces-338de9df523d883a.js → workspaces-be35b22e2046564c.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/webpack-6e76f636a048e145.js +1 -0
  22. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  23. sky/dashboard/out/clusters/[cluster].html +1 -1
  24. sky/dashboard/out/clusters.html +1 -1
  25. sky/dashboard/out/config.html +1 -1
  26. sky/dashboard/out/index.html +1 -1
  27. sky/dashboard/out/infra/[context].html +1 -1
  28. sky/dashboard/out/infra.html +1 -1
  29. sky/dashboard/out/jobs/[job].html +1 -1
  30. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  31. sky/dashboard/out/jobs.html +1 -1
  32. sky/dashboard/out/users.html +1 -1
  33. sky/dashboard/out/volumes.html +1 -1
  34. sky/dashboard/out/workspace/new.html +1 -1
  35. sky/dashboard/out/workspaces/[name].html +1 -1
  36. sky/dashboard/out/workspaces.html +1 -1
  37. sky/exceptions.py +8 -0
  38. sky/jobs/utils.py +10 -3
  39. sky/optimizer.py +14 -4
  40. sky/provision/docker_utils.py +20 -1
  41. sky/provision/kubernetes/instance.py +4 -1
  42. sky/resources.py +17 -7
  43. sky/server/requests/executor.py +2 -2
  44. sky/server/requests/serializers/decoders.py +5 -0
  45. sky/server/requests/serializers/encoders.py +5 -0
  46. sky/server/rest.py +38 -8
  47. sky/server/server.py +8 -6
  48. sky/skypilot_config.py +4 -4
  49. sky/users/permission.py +6 -7
  50. {skypilot_nightly-1.0.0.dev20250820.dist-info → skypilot_nightly-1.0.0.dev20250822.dist-info}/METADATA +2 -2
  51. {skypilot_nightly-1.0.0.dev20250820.dist-info → skypilot_nightly-1.0.0.dev20250822.dist-info}/RECORD +60 -60
  52. sky/dashboard/out/_next/static/chunks/3015-bf218e4973bf5c8f.js +0 -1
  53. sky/dashboard/out/_next/static/chunks/8969-23c8fbdb8b397d59.js +0 -1
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +0 -6
  55. sky/dashboard/out/_next/static/chunks/webpack-008593a02784a2df.js +0 -1
  56. /sky/dashboard/out/_next/static/{8ZscIHnvBWz3AXkxsJL6H → WD29VpW0S7wsYey0qFBHQ}/_ssgManifest.js +0 -0
  57. /sky/dashboard/out/_next/static/chunks/{1121-2edb8ab2ba080a76.js → 1121-8afcf719ea87debc.js} +0 -0
  58. /sky/dashboard/out/_next/static/chunks/{1141-2f60a90b7d76838e.js → 1141-943efc7aff0f0c06.js} +0 -0
  59. /sky/dashboard/out/_next/static/chunks/{6856-e6f350f567182e87.js → 6856-049014c6d43d127b.js} +0 -0
  60. /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-7d4182df6625fe10.js → [pool]-07349868f7905d37.js} +0 -0
  61. {skypilot_nightly-1.0.0.dev20250820.dist-info → skypilot_nightly-1.0.0.dev20250822.dist-info}/WHEEL +0 -0
  62. {skypilot_nightly-1.0.0.dev20250820.dist-info → skypilot_nightly-1.0.0.dev20250822.dist-info}/entry_points.txt +0 -0
  63. {skypilot_nightly-1.0.0.dev20250820.dist-info → skypilot_nightly-1.0.0.dev20250822.dist-info}/licenses/LICENSE +0 -0
  64. {skypilot_nightly-1.0.0.dev20250820.dist-info → skypilot_nightly-1.0.0.dev20250822.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-008593a02784a2df.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/8ZscIHnvBWz3AXkxsJL6H/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ZscIHnvBWz3AXkxsJL6H/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"8ZscIHnvBWz3AXkxsJL6H","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/WD29VpW0S7wsYey0qFBHQ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/WD29VpW0S7wsYey0qFBHQ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"WD29VpW0S7wsYey0qFBHQ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-008593a02784a2df.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/6633-efe924b9b8136699.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-23c8fbdb8b397d59.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-2edb8ab2ba080a76.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-bf218e4973bf5c8f.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-2f60a90b7d76838e.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-65f72dee417237ef.js" defer=""></script><script src="/dashboard/_next/static/8ZscIHnvBWz3AXkxsJL6H/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ZscIHnvBWz3AXkxsJL6H/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"8ZscIHnvBWz3AXkxsJL6H","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7205-88191679e7988c57.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4a6f1a928fb6d370.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-8afcf719ea87debc.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-6c9c09593b1e67b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-de06e613e20bc977.js" defer=""></script><script src="/dashboard/_next/static/WD29VpW0S7wsYey0qFBHQ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/WD29VpW0S7wsYey0qFBHQ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"WD29VpW0S7wsYey0qFBHQ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-008593a02784a2df.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-338de9df523d883a.js" defer=""></script><script src="/dashboard/_next/static/8ZscIHnvBWz3AXkxsJL6H/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/8ZscIHnvBWz3AXkxsJL6H/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"8ZscIHnvBWz3AXkxsJL6H","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-be35b22e2046564c.js" defer=""></script><script src="/dashboard/_next/static/WD29VpW0S7wsYey0qFBHQ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/WD29VpW0S7wsYey0qFBHQ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"WD29VpW0S7wsYey0qFBHQ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/exceptions.py CHANGED
@@ -661,3 +661,11 @@ class RequestInterruptedError(Exception):
661
661
  class SkyletInternalError(Exception):
662
662
  """Raised when a Skylet internal error occurs."""
663
663
  pass
664
+
665
+
666
+ class ClientError(Exception):
667
+ """Raised when a client error occurs.
668
+
669
+ If a request encounters a ClientError, it will not be retried against the server.
670
+ """
671
+ pass
sky/jobs/utils.py CHANGED
@@ -1494,7 +1494,7 @@ def format_job_table(
1494
1494
  'JOB DURATION',
1495
1495
  '#RECOVERIES',
1496
1496
  'STATUS',
1497
- 'WORKER_POOL',
1497
+ 'POOL',
1498
1498
  ]
1499
1499
  if show_all:
1500
1500
  # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
@@ -1597,6 +1597,10 @@ def format_job_table(
1597
1597
 
1598
1598
  user_values = get_user_column_values(job_tasks[0])
1599
1599
 
1600
+ pool = job_tasks[0].get('pool')
1601
+ if pool is None:
1602
+ pool = '-'
1603
+
1600
1604
  job_id = job_hash[1] if tasks_have_k8s_user else job_hash
1601
1605
  job_values = [
1602
1606
  job_id,
@@ -1610,7 +1614,7 @@ def format_job_table(
1610
1614
  job_duration,
1611
1615
  recovery_cnt,
1612
1616
  status_str,
1613
- job_tasks[0].get('pool', '-'),
1617
+ pool,
1614
1618
  ]
1615
1619
  if show_all:
1616
1620
  details = job_tasks[current_task_id].get('details')
@@ -1637,6 +1641,9 @@ def format_job_table(
1637
1641
  submitted = log_utils.readable_time_duration(task['submitted_at'])
1638
1642
  user_values = get_user_column_values(task)
1639
1643
  task_workspace = '-' if len(job_tasks) > 1 else workspace
1644
+ pool = task.get('pool')
1645
+ if pool is None:
1646
+ pool = '-'
1640
1647
  values = [
1641
1648
  task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
1642
1649
  task['task_id'] if len(job_tasks) > 1 else '-',
@@ -1653,7 +1660,7 @@ def format_job_table(
1653
1660
  job_duration,
1654
1661
  task['recovery_count'],
1655
1662
  task['status'].colored_str(),
1656
- task.get('pool', '-'),
1663
+ pool,
1657
1664
  ]
1658
1665
  if show_all:
1659
1666
  # schedule_state is only set at the job level, so if we have
sky/optimizer.py CHANGED
@@ -1262,12 +1262,13 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
1262
1262
 
1263
1263
 
1264
1264
  def _check_specified_regions(task: task_lib.Task) -> None:
1265
- """Check if specified regions (Kubernetes contexts) are enabled.
1265
+ """Check if specified regions (Kubernetes/SSH contexts) are enabled.
1266
1266
 
1267
1267
  Args:
1268
1268
  task: The task to check.
1269
1269
  """
1270
- # Only check for Kubernetes now
1270
+ # Only check for Kubernetes/SSH for now
1271
+ # The check below works because SSH inherits from the Kubernetes cloud.
1271
1272
  if not all(
1272
1273
  isinstance(resources.cloud, clouds.Kubernetes)
1273
1274
  for resources in task.resources):
@@ -1276,12 +1277,21 @@ def _check_specified_regions(task: task_lib.Task) -> None:
1276
1277
  for resources in task.resources:
1277
1278
  if resources.region is None:
1278
1279
  continue
1279
- existing_contexts = clouds.Kubernetes.existing_allowed_contexts()
1280
+
1281
+ is_ssh = isinstance(resources.cloud, clouds.SSH)
1282
+ if is_ssh:
1283
+ existing_contexts = clouds.SSH.existing_allowed_contexts()
1284
+ else:
1285
+ existing_contexts = clouds.Kubernetes.existing_allowed_contexts()
1286
+
1280
1287
  region = resources.region
1281
1288
  task_name = f' {task.name!r}' if task.name is not None else ''
1282
1289
  msg = f'Task{task_name} requires '
1283
1290
  if region not in existing_contexts:
1284
- infra_str = f'Kubernetes/{region}'
1291
+ if is_ssh:
1292
+ infra_str = f'SSH/{region.lstrip("ssh-")}'
1293
+ else:
1294
+ infra_str = f'Kubernetes/{region}'
1285
1295
  logger.warning(f'{infra_str} is not enabled.')
1286
1296
  volume_mounts_str = ''
1287
1297
  if task.volume_mounts:
@@ -83,6 +83,21 @@ def check_docker_image(cname, docker_cmd):
83
83
  return _check_helper(cname, '.Config.Image', docker_cmd)
84
84
 
85
85
 
86
+ def maybe_remove_container_cmds(container_name, docker_cmd):
87
+ """Return a shell command that removes the container if it exists (a no-op otherwise).
88
+ """
89
+ docker_rm = [
90
+ docker_cmd,
91
+ 'rm',
92
+ '-f',
93
+ container_name,
94
+ '2>/dev/null',
95
+ '||',
96
+ 'true',
97
+ ]
98
+ return ' '.join(docker_rm)
99
+
100
+
86
101
  def docker_start_cmds(
87
102
  image,
88
103
  container_name,
@@ -285,6 +300,10 @@ class DockerInitializer:
285
300
  'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
286
301
  'sudo systemctl restart docker; } || true')
287
302
  user_docker_run_options = self.docker_config.get('run_options', [])
303
+ remove_container_cmd = maybe_remove_container_cmds(
304
+ self.container_name,
305
+ self.docker_cmd,
306
+ )
288
307
  start_command = docker_start_cmds(
289
308
  specific_image,
290
309
  self.container_name,
@@ -292,7 +311,7 @@ class DockerInitializer:
292
311
  self._auto_configure_shm(user_docker_run_options)),
293
312
  self.docker_cmd,
294
313
  )
295
- self._run(start_command)
314
+ self._run(f'{remove_container_cmd}; {start_command}')
296
315
 
297
316
  # SkyPilot: Setup Commands.
298
317
  # TODO(zhwu): the following setups should be aligned with the kubernetes
@@ -797,15 +797,18 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
797
797
  'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html') # pylint: disable=line-too-long
798
798
 
799
799
  needs_gpus = False
800
+ needs_gpus_nvidia = False
800
801
  limits = pod_spec['spec']['containers'][0].get('resources',
801
802
  {}).get('limits')
802
803
  if limits is not None:
803
804
  needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(context),
804
805
  0) > 0
806
+ needs_gpus_nvidia = limits.get(
807
+ kubernetes_utils.SUPPORTED_GPU_RESOURCE_KEYS['nvidia'], 0) > 0
805
808
 
806
809
  # TPU pods provisioned on GKE use the default containerd runtime.
807
810
  # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview # pylint: disable=line-too-long
808
- if nvidia_runtime_exists and needs_gpus:
811
+ if nvidia_runtime_exists and needs_gpus_nvidia:
809
812
  pod_spec['spec']['runtimeClassName'] = 'nvidia'
810
813
 
811
814
  logger.debug(f'run_instances: calling create_namespaced_pod '
sky/resources.py CHANGED
@@ -1260,10 +1260,14 @@ class Resources:
1260
1260
  def extract_docker_image(self) -> Optional[str]:
1261
1261
  if self.image_id is None:
1262
1262
  return None
1263
- if len(self.image_id) == 1 and self.region in self.image_id:
1264
- image_id = self.image_id[self.region]
1265
- if image_id.startswith('docker:'):
1266
- return image_id[len('docker:'):]
1263
+ # Handle dict image_id
1264
+ if len(self.image_id) == 1:
1265
+ # Check if the single key matches the region or is None (any region)
1266
+ image_key = list(self.image_id.keys())[0]
1267
+ if image_key == self.region or image_key is None:
1268
+ image_id = self.image_id[image_key]
1269
+ if image_id.startswith('docker:'):
1270
+ return image_id[len('docker:'):]
1267
1271
  return None
1268
1272
 
1269
1273
  def _try_validate_image_id(self) -> None:
@@ -1333,13 +1337,19 @@ class Resources:
1333
1337
  'Kubernetes, please explicitly specify the cloud.') from e
1334
1338
 
1335
1339
  if self._region is not None:
1336
- if self._region not in self._image_id:
1340
+ # If the image_id has None as key (region-agnostic),
1341
+ # use it for any region
1342
+ if None in self._image_id:
1343
+ # Replace None key with the actual region
1344
+ self._image_id = {self._region: self._image_id[None]}
1345
+ elif self._region not in self._image_id:
1337
1346
  with ux_utils.print_exception_no_traceback():
1338
1347
  raise ValueError(
1339
1348
  f'image_id {self._image_id} should contain the image '
1340
1349
  f'for the specified region {self._region}.')
1341
- # Narrow down the image_id to the specified region.
1342
- self._image_id = {self._region: self._image_id[self._region]}
1350
+ else:
1351
+ # Narrow down the image_id to the specified region.
1352
+ self._image_id = {self._region: self._image_id[self._region]}
1343
1353
 
1344
1354
  # Check the image_id's are valid.
1345
1355
  for region, image_id in self._image_id.items():
@@ -427,9 +427,9 @@ async def execute_request_coroutine(request: api_requests.Request):
427
427
  event loop. This is designed for executing tasks that are not CPU
428
428
  intensive, e.g. sky logs.
429
429
  """
430
+ context.initialize()
430
431
  ctx = context.get()
431
- if ctx is None:
432
- raise ValueError('Context is not initialized')
432
+ assert ctx is not None, 'Context is not initialized'
433
433
  logger.info(f'Executing request {request.request_id} in coroutine')
434
434
  func = request.entrypoint
435
435
  request_body = request.request_body
@@ -203,3 +203,8 @@ def decode_job_status(
203
203
  def decode_kubernetes_node_info(
204
204
  return_value: Dict[str, Any]) -> models.KubernetesNodesInfo:
205
205
  return models.KubernetesNodesInfo.from_dict(return_value)
206
+
207
+
208
+ @register_decoders('endpoints')
209
+ def decode_endpoints(return_value: Dict[int, str]) -> Dict[int, str]:
210
+ return {int(k): v for k, v in return_value.items()}
@@ -209,3 +209,8 @@ def encode_job_status(return_value: Dict[int, Any]) -> Dict[int, str]:
209
209
  def encode_kubernetes_node_info(
210
210
  return_value: 'models.KubernetesNodesInfo') -> Dict[str, Any]:
211
211
  return return_value.to_dict()
212
+
213
+
214
+ @register_encoder('endpoints')
215
+ def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
216
+ return {str(k): v for k, v in return_value.items()}
sky/server/rest.py CHANGED
@@ -47,9 +47,10 @@ class RetryContext:
47
47
 
48
48
  @contextlib.contextmanager
49
49
  def _retry_in_context():
50
- token = _RETRY_CONTEXT.set(RetryContext())
50
+ context = RetryContext()
51
+ token = _RETRY_CONTEXT.set(context)
51
52
  try:
52
- yield
53
+ yield context
53
54
  finally:
54
55
  _RETRY_CONTEXT.reset(token)
55
56
 
@@ -76,6 +77,8 @@ def retry_transient_errors(max_retries: int = 3,
76
77
  if isinstance(e, requests.exceptions.HTTPError):
77
78
  # Only server error is considered as transient.
78
79
  return e.response.status_code >= 500
80
+ if isinstance(e, exceptions.ClientError):
81
+ return False
79
82
  # It is hard to enumerate all other errors that are transient, e.g.
80
83
  # broken pipe, connection refused, etc. Instead, it is safer to assume
81
84
  # all other errors might be transient since we only retry for 3 times
@@ -88,26 +91,53 @@ def retry_transient_errors(max_retries: int = 3,
88
91
  @functools.wraps(func)
89
92
  def wrapper(*args, **kwargs):
90
93
  backoff = common_utils.Backoff(initial_backoff, max_backoff_factor)
91
- with _retry_in_context():
92
- for retry_cnt in range(max_retries):
94
+ consecutive_failed_count = 0
95
+
96
+ with _retry_in_context() as context:
97
+ previous_line_processed = context.line_processed # should be 0
98
+
99
+ def _handle_exception():
100
+ # If the function made progress on a retry,
101
+ # clears the backoff and resets the failed retry count.
102
+ # Otherwise, increments the failed retry count.
103
+ nonlocal backoff
104
+ nonlocal consecutive_failed_count
105
+ nonlocal previous_line_processed
106
+ if context.line_processed > previous_line_processed:
107
+ backoff = common_utils.Backoff(initial_backoff,
108
+ max_backoff_factor)
109
+ previous_line_processed = context.line_processed
110
+ consecutive_failed_count = 0
111
+ else:
112
+ consecutive_failed_count += 1
113
+
114
+ while consecutive_failed_count < max_retries:
93
115
  try:
94
116
  return func(*args, **kwargs)
95
117
  # Occurs when the server proactively interrupts the request
96
118
  # during rolling update, we can retry immediately on the
97
119
  # new replica.
98
120
  except exceptions.RequestInterruptedError:
121
+ _handle_exception()
99
122
  logger.debug('Request interrupted. Retry immediately.')
100
123
  continue
101
124
  except Exception as e: # pylint: disable=broad-except
102
- if retry_cnt >= max_retries - 1:
125
+ _handle_exception()
126
+ if consecutive_failed_count >= max_retries:
103
127
  # Retries exhausted.
104
128
  raise
105
129
  if not is_transient_error(e):
106
130
  # Permanent error, no need to retry.
107
131
  raise
108
- logger.debug(f'Retry {func.__name__} due to {e}, '
109
- f'attempt {retry_cnt + 1}/{max_retries}')
110
- time.sleep(backoff.current_backoff())
132
+ logger.debug(
133
+ f'Retry {func.__name__} due to {e}, '
134
+ f'attempt {consecutive_failed_count}/{max_retries}')
135
+ # Only sleep if the last attempt made no progress.
136
+ # The idea is that if the function made progress on a
137
+ # retry, we should try again immediately to reduce the
138
+ # waiting time.
139
+ if consecutive_failed_count > 0:
140
+ time.sleep(backoff.current_backoff())
111
141
 
112
142
  return cast(F, wrapper)
113
143
 
sky/server/server.py CHANGED
@@ -1185,10 +1185,6 @@ async def logs(
1185
1185
  # TODO(zhwu): This should wait for the request on the cluster, e.g., async
1186
1186
  # launch, to finish, so that a user does not need to manually pull the
1187
1187
  # request status.
1188
- # Only initialize the context in logs handler to limit the scope of this
1189
- # experimental change.
1190
- # TODO(aylei): init in lifespan() to enable SkyPilot context in all APIs.
1191
- context.initialize()
1192
1188
  request_task = executor.prepare_request(
1193
1189
  request_id=request.state.request_id,
1194
1190
  request_name='logs',
@@ -1198,8 +1194,14 @@ async def logs(
1198
1194
  )
1199
1195
  task = asyncio.create_task(executor.execute_request_coroutine(request_task))
1200
1196
 
1201
- def cancel_task():
1202
- task.cancel()
1197
+ async def cancel_task():
1198
+ try:
1199
+ logger.info('Client disconnected for request: '
1200
+ f'{request.state.request_id}')
1201
+ task.cancel()
1202
+ await task
1203
+ except asyncio.CancelledError:
1204
+ pass
1203
1205
 
1204
1206
  # Cancel the task after the request is done or client disconnects
1205
1207
  background_tasks.add_task(cancel_task)
sky/skypilot_config.py CHANGED
@@ -514,10 +514,10 @@ def parse_and_validate_config_file(config_path: str) -> config_utils.Config:
514
514
 
515
515
 
516
516
  def _parse_dotlist(dotlist: List[str]) -> config_utils.Config:
517
- """Parse a comma-separated list of key-value pairs into a dictionary.
517
+ """Parse a single key-value pair into a dictionary.
518
518
 
519
519
  Args:
520
- dotlist: A comma-separated list of key-value pairs.
520
+ dotlist: A single key-value pair.
521
521
 
522
522
  Returns:
523
523
  A config_utils.Config object with the parsed key-value pairs.
@@ -788,7 +788,7 @@ def _compose_cli_config(cli_config: Optional[List[str]]) -> config_utils.Config:
788
788
  """Composes the skypilot CLI config.
789
789
  CLI config can either be:
790
790
  - A path to a config file
791
- - A comma-separated list of key-value pairs
791
+ - A single key-value pair
792
792
  """
793
793
 
794
794
  if not cli_config:
@@ -804,7 +804,7 @@ def _compose_cli_config(cli_config: Optional[List[str]]) -> config_utils.Config:
804
804
  config_source = maybe_config_path
805
805
  # cli_config is a path to a config file
806
806
  parsed_config = parse_and_validate_config_file(maybe_config_path)
807
- else: # cli_config is a comma-separated list of key-value pairs
807
+ else: # cli_config is a single key-value pair
808
808
  parsed_config = _parse_dotlist(cli_config)
809
809
  _validate_config(parsed_config, config_source)
810
810
  except ValueError as e:
sky/users/permission.py CHANGED
@@ -46,7 +46,8 @@ class PermissionService:
46
46
  engine = global_user_state.initialize_and_get_db()
47
47
  db_utils.add_all_tables_to_db_sqlalchemy(
48
48
  sqlalchemy_adapter.Base.metadata, engine)
49
- adapter = sqlalchemy_adapter.Adapter(engine)
49
+ adapter = sqlalchemy_adapter.Adapter(
50
+ engine, db_class=sqlalchemy_adapter.CasbinRule)
50
51
  model_path = os.path.join(os.path.dirname(__file__),
51
52
  'model.conf')
52
53
  enforcer = casbin.Enforcer(model_path, adapter)
@@ -67,7 +68,7 @@ class PermissionService:
67
68
  username.encode()).hexdigest()[:common_utils.USER_HASH_LENGTH]
68
69
  user_info = global_user_state.get_user(user_hash)
69
70
  if user_info:
70
- logger.info(f'Basic auth user {username} already exists')
71
+ logger.debug(f'Basic auth user {username} already exists')
71
72
  return
72
73
  global_user_state.add_or_update_user(
73
74
  models.User(id=user_hash, name=username, password=password))
@@ -168,8 +169,6 @@ class PermissionService:
168
169
  """
169
170
  user_roles = self.enforcer.get_roles_for_user(user_id)
170
171
  if not user_roles:
171
- logger.info(f'User {user_id} has no roles, adding'
172
- f' default role {rbac.get_default_role()}')
173
172
  self.enforcer.add_grouping_policy(user_id, rbac.get_default_role())
174
173
  return True
175
174
  return False
@@ -183,7 +182,7 @@ class PermissionService:
183
182
  # Avoid calling get_user_roles, as it will require the lock.
184
183
  current_roles = self.enforcer.get_roles_for_user(user_id)
185
184
  if not current_roles:
186
- logger.warning(f'User {user_id} has no roles')
185
+ logger.debug(f'User {user_id} has no roles')
187
186
  return
188
187
  self.enforcer.remove_grouping_policy(user_id, current_roles[0])
189
188
  self.enforcer.save_policy()
@@ -197,12 +196,12 @@ class PermissionService:
197
196
  # Avoid calling get_user_roles, as it will require the lock.
198
197
  current_roles = self.enforcer.get_roles_for_user(user_id)
199
198
  if not current_roles:
200
- logger.warning(f'User {user_id} has no roles')
199
+ logger.debug(f'User {user_id} has no roles')
201
200
  else:
202
201
  # TODO(hailong): how to handle multiple roles?
203
202
  current_role = current_roles[0]
204
203
  if current_role == new_role:
205
- logger.info(f'User {user_id} already has role {new_role}')
204
+ logger.debug(f'User {user_id} already has role {new_role}')
206
205
  return
207
206
  self.enforcer.remove_grouping_policy(user_id, current_role)
208
207
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250820
3
+ Version: 1.0.0.dev20250822
4
4
  Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -253,7 +253,7 @@ Dynamic: summary
253
253
 
254
254
  ----
255
255
 
256
- SkyPilot is a system for running, managing, and scaling AI workloads on any AI infrastructure.
256
+ SkyPilot is a system to run, manage, and scale AI workloads on any AI infrastructure.
257
257
 
258
258
  SkyPilot gives **AI teams** a simple interface to run jobs on any infra.
259
259
  **Infra teams** get a unified control plane to manage any AI compute — with advanced scheduling, scaling, and orchestration.