skypilot-nightly 1.0.0.dev20250915__py3-none-any.whl → 1.0.0.dev20250918__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly has been flagged in the registry; review the registry listing for details before installing.

Files changed (78)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/primeintellect.py +1 -0
  3. sky/adaptors/seeweb.py +68 -4
  4. sky/authentication.py +25 -0
  5. sky/backends/__init__.py +3 -2
  6. sky/backends/backend_utils.py +16 -12
  7. sky/backends/cloud_vm_ray_backend.py +61 -4
  8. sky/catalog/primeintellect_catalog.py +95 -0
  9. sky/client/sdk.py +6 -0
  10. sky/clouds/__init__.py +2 -0
  11. sky/clouds/primeintellect.py +314 -0
  12. sky/core.py +10 -3
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/chunks/3015-ba5be550eb80fd8c.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/5339.4a881570243431a5.js +51 -0
  16. sky/dashboard/out/_next/static/chunks/{6856-e0754534b3015377.js → 6856-9a2538f38c004652.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/{6990-11c8e9b982e8ffec.js → 6990-f6818c84ed8f1c86.js} +1 -1
  18. sky/dashboard/out/_next/static/chunks/8969-a3e3f0683e19d340.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +6 -0
  20. sky/dashboard/out/_next/static/chunks/{webpack-d1e29b3aa66bf4cf.js → webpack-487697b47d8c5e50.js} +1 -1
  21. sky/dashboard/out/_next/static/{dG6B0i0HO4jIoKb4ZFYJ_ → k1mo5xWZrV9djgjd0moOT}/_buildManifest.js +1 -1
  22. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  23. sky/dashboard/out/clusters/[cluster].html +1 -1
  24. sky/dashboard/out/clusters.html +1 -1
  25. sky/dashboard/out/config.html +1 -1
  26. sky/dashboard/out/index.html +1 -1
  27. sky/dashboard/out/infra/[context].html +1 -1
  28. sky/dashboard/out/infra.html +1 -1
  29. sky/dashboard/out/jobs/[job].html +1 -1
  30. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  31. sky/dashboard/out/jobs.html +1 -1
  32. sky/dashboard/out/users.html +1 -1
  33. sky/dashboard/out/volumes.html +1 -1
  34. sky/dashboard/out/workspace/new.html +1 -1
  35. sky/dashboard/out/workspaces/[name].html +1 -1
  36. sky/dashboard/out/workspaces.html +1 -1
  37. sky/global_user_state.py +42 -34
  38. sky/jobs/server/server.py +14 -1
  39. sky/jobs/state.py +26 -1
  40. sky/provision/__init__.py +1 -0
  41. sky/provision/docker_utils.py +50 -3
  42. sky/provision/instance_setup.py +15 -1
  43. sky/provision/lambda_cloud/instance.py +12 -11
  44. sky/provision/primeintellect/__init__.py +10 -0
  45. sky/provision/primeintellect/config.py +11 -0
  46. sky/provision/primeintellect/instance.py +454 -0
  47. sky/provision/primeintellect/utils.py +398 -0
  48. sky/resources.py +9 -1
  49. sky/schemas/generated/servev1_pb2.py +58 -0
  50. sky/schemas/generated/servev1_pb2.pyi +115 -0
  51. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  52. sky/serve/serve_rpc_utils.py +179 -0
  53. sky/serve/serve_utils.py +29 -12
  54. sky/serve/server/core.py +37 -19
  55. sky/serve/server/impl.py +221 -129
  56. sky/server/common.py +13 -0
  57. sky/server/constants.py +3 -0
  58. sky/server/requests/executor.py +23 -6
  59. sky/server/server.py +10 -5
  60. sky/setup_files/dependencies.py +1 -0
  61. sky/skylet/constants.py +5 -3
  62. sky/skylet/services.py +98 -0
  63. sky/skylet/skylet.py +3 -1
  64. sky/skypilot_config.py +10 -3
  65. sky/templates/kubernetes-ray.yml.j2 +22 -12
  66. sky/templates/primeintellect-ray.yml.j2 +71 -0
  67. {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/METADATA +39 -38
  68. {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/RECORD +74 -62
  69. sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +0 -1
  70. sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +0 -51
  71. sky/dashboard/out/_next/static/chunks/8969-0487dfbf149d9e53.js +0 -1
  72. sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +0 -6
  73. /sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-7528cc0ef8c522c5.js} +0 -0
  74. /sky/dashboard/out/_next/static/{dG6B0i0HO4jIoKb4ZFYJ_ → k1mo5xWZrV9djgjd0moOT}/_ssgManifest.js +0 -0
  75. {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/WHEEL +0 -0
  76. {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/entry_points.txt +0 -0
  77. {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/licenses/LICENSE +0 -0
  78. {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/top_level.txt +0 -0
sky/dashboard/out/workspaces/[name].html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-d1e29b3aa66bf4cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1836-37fede578e2da5f8.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-0487dfbf149d9e53.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-11c8e9b982e8ffec.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-408ed10b2f9fce17.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/3015-2ea98b57e318bd6e.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-159df2d4c441a9d1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-af76bb06dbb3954f.js" defer=""></script><script src="/dashboard/_next/static/dG6B0i0HO4jIoKb4ZFYJ_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/dG6B0i0HO4jIoKb4ZFYJ_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"dG6B0i0HO4jIoKb4ZFYJ_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-487697b47d8c5e50.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1836-37fede578e2da5f8.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-a3e3f0683e19d340.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-f6818c84ed8f1c86.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-408ed10b2f9fce17.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/3015-ba5be550eb80fd8c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-159df2d4c441a9d1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-af76bb06dbb3954f.js" defer=""></script><script src="/dashboard/_next/static/k1mo5xWZrV9djgjd0moOT/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/k1mo5xWZrV9djgjd0moOT/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"k1mo5xWZrV9djgjd0moOT","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspaces.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-d1e29b3aa66bf4cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7598c33a746cdc91.js" defer=""></script><script src="/dashboard/_next/static/dG6B0i0HO4jIoKb4ZFYJ_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/dG6B0i0HO4jIoKb4ZFYJ_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"dG6B0i0HO4jIoKb4ZFYJ_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-487697b47d8c5e50.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7528cc0ef8c522c5.js" defer=""></script><script src="/dashboard/_next/static/k1mo5xWZrV9djgjd0moOT/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/k1mo5xWZrV9djgjd0moOT/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"k1mo5xWZrV9djgjd0moOT","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/global_user_state.py CHANGED
@@ -516,7 +516,7 @@ def add_or_update_cluster(cluster_name: str,
                           task_config: Optional[Dict[str, Any]] = None,
                           is_managed: bool = False,
                           provision_log_path: Optional[str] = None,
-                          update_only: bool = False):
+                          existing_cluster_hash: Optional[str] = None):
     """Adds or updates cluster_name -> cluster_handle mapping.

     Args:
@@ -532,10 +532,12 @@ def add_or_update_cluster(cluster_name: str,
         is_managed: Whether the cluster is launched by the
             controller.
         provision_log_path: Absolute path to provision.log, if available.
-        update_only: Whether to update the cluster only. If True,
-            the cluster record will not be inserted if one does not exist.
+        existing_cluster_hash: If specified, the cluster will be updated
+            only if the cluster_hash matches. If a cluster does not exist,
+            it will not be inserted and an error will be raised.
     """
     assert _SQLALCHEMY_ENGINE is not None
+
     # FIXME: launched_at will be changed when `sky launch -c` is called.
     handle = pickle.dumps(cluster_handle)
     cluster_launched_at = int(time.time()) if is_launch else None
@@ -631,17 +633,17 @@ def add_or_update_cluster(cluster_name: str,
                 session.rollback()
                 raise ValueError('Unsupported database dialect')

-        if update_only:
+        if existing_cluster_hash is not None:
             count = session.query(cluster_table).filter_by(
-                name=cluster_name).update({
+                name=cluster_name, cluster_hash=existing_cluster_hash).update({
                     **conditional_values, cluster_table.c.handle: handle,
                     cluster_table.c.status: status.value,
-                    cluster_table.c.cluster_hash: cluster_hash,
                     cluster_table.c.status_updated_at: status_updated_at
                 })
             assert count <= 1
             if count == 0:
-                raise ValueError(f'Cluster {cluster_name} not found.')
+                raise ValueError(f'Cluster {cluster_name} with hash '
+                                 f'{existing_cluster_hash} not found.')
         else:
             insert_stmnt = insert_func(cluster_table).values(
                 name=cluster_name,
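In effect, existing_cluster_hash turns the update into a compare-and-swap: the UPDATE matches only while the stored cluster_hash is still the one the caller observed, so a stale writer cannot clobber a cluster that was re-provisioned in the meantime. A minimal, self-contained sketch of the same pattern using stdlib sqlite3 (illustrative schema, not SkyPilot's actual tables):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute(
        'CREATE TABLE clusters (name TEXT, cluster_hash TEXT, status TEXT)')
    conn.execute("INSERT INTO clusters VALUES ('my-cluster', 'hash-v1', 'INIT')")

    def update_if_hash_matches(name, expected_hash, new_status):
        cur = conn.execute(
            'UPDATE clusters SET status = ? WHERE name = ? AND cluster_hash = ?',
            (new_status, name, expected_hash))
        if cur.rowcount == 0:
            # The row is gone, or it was re-provisioned under a new hash; the
            # stale writer must not silently insert or overwrite it.
            raise ValueError(f'Cluster {name} with hash {expected_hash} not found.')

    update_if_hash_matches('my-cluster', 'hash-v1', 'UP')  # hash matches, succeeds
    try:
        update_if_hash_matches('my-cluster', 'hash-v2', 'STOPPED')
    except ValueError as e:
        print(e)  # Cluster my-cluster with hash hash-v2 not found.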
@@ -1235,16 +1237,16 @@ def _get_cluster_usage_intervals(
     return pickle.loads(row.usage_intervals)


-def _get_cluster_launch_time(cluster_hash: str) -> Optional[int]:
-    usage_intervals = _get_cluster_usage_intervals(cluster_hash)
+def _get_cluster_launch_time(
+        usage_intervals: List[Tuple[int, Optional[int]]]) -> Optional[int]:
     if usage_intervals is None:
         return None
     return usage_intervals[0][0]


-def _get_cluster_duration(cluster_hash: str) -> int:
+def _get_cluster_duration(
+        usage_intervals: List[Tuple[int, Optional[int]]]) -> int:
     total_duration = 0
-    usage_intervals = _get_cluster_usage_intervals(cluster_hash)

     if usage_intervals is None:
         return total_duration
@@ -1537,11 +1539,36 @@ def get_clusters_from_history(
     if days is not None:
         cutoff_time = int(time.time()) - (days * 24 * 60 * 60)

+    current_user_hash = common_utils.get_user_hash()
+
+    row_to_user_hash = {}
+    usage_intervals_dict = {}
+    for row in rows:
+        user_hash = (row.user_hash
+                     if row.user_hash is not None else current_user_hash)
+        row_to_user_hash[row.cluster_hash] = user_hash
+        if row.usage_intervals:
+            try:
+                usage_intervals_dict[row.cluster_hash] = pickle.loads(
+                    row.usage_intervals)
+            except (pickle.PickleError, AttributeError):
+                usage_intervals_dict[row.cluster_hash] = []
+    user_hashes = set(row_to_user_hash.values())
+    user_hash_to_user = _get_users(user_hashes)
+
+    cluster_hashes = set(row_to_user_hash.keys())
+    last_cluster_event_dict = _get_last_cluster_event_multiple(
+        cluster_hashes, ClusterEventType.STATUS_CHANGE)
+
     records = []
     for row in rows:
-        user_hash = _get_user_hash_or_current_user(row.user_hash)
-        launched_at = _get_cluster_launch_time(row.cluster_hash)
-        duration = _get_cluster_duration(row.cluster_hash)
+        user_hash = row_to_user_hash[row.cluster_hash]
+        user = user_hash_to_user.get(user_hash, None)
+        user_name = user.name if user is not None else None
+        last_event = last_cluster_event_dict.get(row.cluster_hash, None)
+        usage_intervals = usage_intervals_dict.get(row.cluster_hash, None)
+        launched_at = _get_cluster_launch_time(usage_intervals)
+        duration = _get_cluster_duration(usage_intervals)

         # Parse status
         status = None
@@ -1554,13 +1581,6 @@
         # For historical clusters, check if they were used recently
         # Use the most recent activity from usage_intervals to determine
         # last use
-        usage_intervals = []
-        if row.usage_intervals:
-            try:
-                usage_intervals = pickle.loads(row.usage_intervals)
-            except (pickle.PickleError, AttributeError):
-                usage_intervals = []
-
         # Find the most recent activity time from usage_intervals
         last_activity_time = None
         if usage_intervals:
@@ -1582,17 +1602,6 @@
         except (pickle.PickleError, AttributeError):
             launched_resources = None

-        # Parse usage intervals safely
-        usage_intervals = []
-        if row.usage_intervals:
-            try:
-                usage_intervals = pickle.loads(row.usage_intervals)
-            except (pickle.PickleError, AttributeError):
-                usage_intervals = []
-
-        # Get user name from user hash
-        user = get_user(user_hash)
-        user_name = user.name if user is not None else None
         workspace = (row.history_workspace
                      if row.history_workspace else row.workspace)

@@ -1610,8 +1619,7 @@
             'workspace': workspace,
             'last_creation_yaml': row.last_creation_yaml,
             'last_creation_command': row.last_creation_command,
-            'last_event': get_last_cluster_event(
-                row.cluster_hash, event_type=ClusterEventType.STATUS_CHANGE),
+            'last_event': last_event,
         }

         records.append(record)
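Taken together, these get_clusters_from_history changes remove an N+1 query pattern: instead of calling get_user() and get_last_cluster_event() once per row (and unpickling usage_intervals twice per row), everything is prefetched in bulk and the loop does dict lookups. A small self-contained sketch of the same refactor shape, where fetch_users and fetch_last_events are hypothetical stand-ins for the batched helpers in the diff:

    import collections

    Row = collections.namedtuple('Row', ['cluster_hash', 'user_hash'])
    rows = [Row('c1', 'u1'), Row('c2', 'u1'), Row('c3', 'u2')]

    def fetch_users(hashes):        # stands in for _get_users(): one query
        return {h: f'user-{h}' for h in hashes}

    def fetch_last_events(hashes):  # stands in for _get_last_cluster_event_multiple()
        return {h: f'event-{h}' for h in hashes}

    users = fetch_users({r.user_hash for r in rows})            # 1 query, not N
    events = fetch_last_events({r.cluster_hash for r in rows})  # 1 query, not N
    for r in rows:
        user_name = users.get(r.user_hash)        # O(1) dict lookup, no query
        last_event = events.get(r.cluster_hash)   # O(1) dict lookup, no query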
sky/jobs/server/server.py CHANGED
@@ -5,6 +5,7 @@ import pathlib
 import fastapi

 from sky import sky_logging
+from sky.jobs import utils as managed_jobs_utils
 from sky.jobs.server import core
 from sky.server import common as server_common
 from sky.server import stream_utils
@@ -22,12 +23,24 @@ router = fastapi.APIRouter()
 @router.post('/launch')
 async def launch(request: fastapi.Request,
                  jobs_launch_body: payloads.JobsLaunchBody) -> None:
+    # In consolidation mode, the jobs controller will use sky.launch on the
+    # same API server to launch the underlying job cluster. If you run many
+    # jobs.launch requests, some may be blocked for a long time by sky.launch
+    # requests triggered by earlier jobs, which leads to confusing behavior
+    # as the jobs.launch requests trickle through. Also, since we don't have
+    # to actually launch a jobs controller sky cluster, the jobs.launch
+    # request is much quicker in consolidation mode. So we avoid the issue by
+    # just using the short executor instead - then jobs.launch will not be
+    # blocked by sky.launch.
+    consolidation_mode = managed_jobs_utils.is_consolidation_mode()
+    schedule_type = (api_requests.ScheduleType.SHORT
+                     if consolidation_mode else api_requests.ScheduleType.LONG)
     executor.schedule_request(
         request_id=request.state.request_id,
         request_name='jobs.launch',
         request_body=jobs_launch_body,
         func=core.launch,
-        schedule_type=api_requests.ScheduleType.LONG,
+        schedule_type=schedule_type,
         request_cluster_name=common.JOB_CONTROLLER_NAME,
     )
sky/jobs/state.py CHANGED
@@ -613,7 +613,7 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
     """
     assert _SQLALCHEMY_ENGINE_ASYNC is not None
     async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
-        count = await session.execute(
+        result = await session.execute(
             sqlalchemy.update(spot_table).where(
                 sqlalchemy.and_(
                     spot_table.c.spot_job_id == job_id,
@@ -625,6 +625,7 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
                     spot_table.c.end_at.is_(None),
                 )).values({spot_table.c.status: ManagedJobStatus.PENDING.value})
         )
+        count = result.rowcount
         await session.commit()
         if count != 1:
             raise exceptions.ManagedJobStatusError(
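The rename from count to result fixes a real bug: in SQLAlchemy 1.4+, session.execute() returns a Result object rather than an integer, so the old `count != 1` compared a Result to 1 (never equal) instead of checking the matched-row count, which lives on result.rowcount (the async session returns the same interface). A small synchronous sketch of the corrected pattern, using an in-memory SQLite engine and an illustrative table:

    import sqlalchemy

    engine = sqlalchemy.create_engine('sqlite://')
    meta = sqlalchemy.MetaData()
    jobs = sqlalchemy.Table('jobs', meta,
                            sqlalchemy.Column('id', sqlalchemy.Integer),
                            sqlalchemy.Column('status', sqlalchemy.String))
    meta.create_all(engine)

    with engine.begin() as conn:
        conn.execute(jobs.insert().values(id=1, status='RUNNING'))
        result = conn.execute(sqlalchemy.update(jobs)
                              .where(jobs.c.id == 1)
                              .values(status='PENDING'))
        assert result.rowcount == 1  # rows matched by the UPDATE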
@@ -712,7 +713,19 @@
     where_conditions = [spot_table.c.spot_job_id == job_id]
     if task_id is not None:
         where_conditions.append(spot_table.c.task_id == task_id)
+
+    # Handle failure_reason prepending when override_terminal is True
     if override_terminal:
+        # Get existing failure_reason with row lock to prevent race
+        # conditions
+        existing_reason_result = session.execute(
+            sqlalchemy.select(spot_table.c.failure_reason).where(
+                sqlalchemy.and_(*where_conditions)).with_for_update())
+        existing_reason_row = existing_reason_result.fetchone()
+        if existing_reason_row and existing_reason_row[0]:
+            # Prepend new failure reason to existing one
+            fields_to_set[spot_table.c.failure_reason] = (
+                failure_reason + '. Previously: ' + existing_reason_row[0])
         # Use COALESCE for end_at to avoid overriding the existing end_at if
         # it's already set.
         fields_to_set[spot_table.c.end_at] = sqlalchemy.func.coalesce(
@@ -1651,7 +1664,19 @@
     where_conditions = [spot_table.c.spot_job_id == job_id]
     if task_id is not None:
         where_conditions.append(spot_table.c.task_id == task_id)
+
+    # Handle failure_reason prepending when override_terminal is True
     if override_terminal:
+        # Get existing failure_reason with row lock to prevent race
+        # conditions
+        existing_reason_result = await session.execute(
+            sqlalchemy.select(spot_table.c.failure_reason).where(
+                sqlalchemy.and_(*where_conditions)).with_for_update())
+        existing_reason_row = existing_reason_result.fetchone()
+        if existing_reason_row and existing_reason_row[0]:
+            # Prepend new failure reason to existing one
+            fields_to_set[spot_table.c.failure_reason] = (
+                failure_reason + '. Previously: ' + existing_reason_row[0])
         fields_to_set[spot_table.c.end_at] = sqlalchemy.func.coalesce(
             spot_table.c.end_at, end_time)
     else:
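Both the sync and async variants of this change close the same race: the prepend is a read-modify-write, so without a lock two concurrent set_failed calls could each read the same old failure_reason and one of the two new reasons would be silently lost. The SELECT ... FOR UPDATE row lock serializes the two transactions. Note that this only takes effect on dialects with row-level locking such as PostgreSQL; on SQLite, SQLAlchemy omits the FOR UPDATE clause, and concurrent writers are instead serialized by the database-level write lock.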
sky/provision/__init__.py CHANGED
@@ -24,6 +24,7 @@ from sky.provision import kubernetes
 from sky.provision import lambda_cloud
 from sky.provision import nebius
 from sky.provision import oci
+from sky.provision import primeintellect
 from sky.provision import runpod
 from sky.provision import scp
 from sky.provision import seeweb
sky/provision/docker_utils.py CHANGED
@@ -15,10 +15,14 @@ logger = sky_logging.init_logger(__name__)
 # Configure environment variables. A docker image can have environment variables
 # set in the Dockerfile with `ENV`. We need to export these variables to the
 # shell environment, so that our ssh session can access them.
+# Filter out RAY_RUNTIME_ENV_HOOK to prevent Ray version conflicts.
+# Docker images with Ray 2.48.0+ set this for UV package manager support,
+# but it causes FAILED_DRIVER errors with SkyPilot's Ray 2.9.3.
+# See: https://github.com/skypilot-org/skypilot/pull/7181
 SETUP_ENV_VARS_CMD = (
     'prefix_cmd() '
     '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
-    'export -p > ~/container_env_var.sh && '
+    'export -p | grep -v RAY_RUNTIME_ENV_HOOK > ~/container_env_var.sh && '
     '$(prefix_cmd) '
     'mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;')

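The effect of the added grep: `export -p` emits one `declare -x NAME="value"` line per exported variable, and `grep -v` drops any line mentioning RAY_RUNTIME_ENV_HOOK before the snapshot is persisted to /etc/profile.d. A pure-Python sketch of the same filter (the hook value below is made up for illustration):

    exported = [
        'declare -x HOME="/root"',
        'declare -x RAY_RUNTIME_ENV_HOOK="some.module.hook"',  # illustrative value
        'declare -x PATH="/usr/local/bin:/usr/bin"',
    ]
    kept = [line for line in exported if 'RAY_RUNTIME_ENV_HOOK' not in line]
    assert kept == [exported[0], exported[2]]

Since grep -v matches anywhere in the line, an export whose value merely mentions the hook name would be dropped too, a trade-off of keeping the command a one-liner.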
@@ -32,6 +36,30 @@ DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')

 _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30

+# Install AWS CLI v2 (not v1 from pip) as it's required for ECR authentication.
+# AWS CLI v2 is installed as a standalone binary, not a Python package. See:
+# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
+INSTALL_AWS_CLI_CMD = (
+    'which aws || ((command -v unzip >/dev/null 2>&1 || '
+    '(sudo apt-get update && sudo apt-get install -y unzip)) && '
+    'curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" '
+    '-o "/tmp/awscliv2.zip" && '
+    'unzip -q /tmp/awscliv2.zip -d /tmp && sudo /tmp/aws/install '
+    '&& rm -rf /tmp/awscliv2.zip /tmp/aws)')
+
+
+def _extract_region_from_ecr_server(server: str) -> str:
+    """Extract AWS region from ECR server URL.
+
+    ECR server format: <account-id>.dkr.ecr.<region>.amazonaws.com
+    Returns the region part from the URL.
+    """
+    # Split: ['<account-id>', 'dkr', 'ecr', '<region>', 'amazonaws', 'com']
+    parts = server.split('.')
+    if len(parts) >= 6 and parts[1] == 'dkr' and parts[2] == 'ecr':
+        return parts[3]
+    raise ValueError(f'Invalid ECR server format: {server}')
+

 @dataclasses.dataclass
 class DockerLoginConfig:
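A quick sanity check of the parsing helper against the hostname shape documented in its docstring (the account id is a made-up example):

    assert _extract_region_from_ecr_server(
        '123456789012.dkr.ecr.us-east-1.amazonaws.com') == 'us-east-1'
    # Anything not matching <account-id>.dkr.ecr.<region>.amazonaws.com
    # raises ValueError rather than guessing a region, e.g.:
    # _extract_region_from_ecr_server('ghcr.io')  # -> ValueError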
@@ -236,9 +264,9 @@ class DockerInitializer:

         # SkyPilot: Docker login if user specified a private docker registry.
         if 'docker_login_config' in self.docker_config:
-            # TODO(tian): Maybe support a command to get the login password?
             docker_login_config = DockerLoginConfig(
                 **self.docker_config['docker_login_config'])
+
             if docker_login_config.password:
                 # Password is allowed to be empty, in that case, we will not run
                 # the login command, and assume that the image pulling is
@@ -249,6 +277,25 @@ class DockerInitializer:
                     f'--password {shlex.quote(docker_login_config.password)} '
                     f'{shlex.quote(docker_login_config.server)}',
                     wait_for_docker_daemon=True)
+            elif (docker_login_config.server.endswith('.amazonaws.com') and
+                  '.dkr.ecr.' in docker_login_config.server):
+                # AWS ECR: Use aws ecr get-login-password for authentication
+                # ECR format: <account-id>.dkr.ecr.<region>.amazonaws.com
+                # This command uses the IAM credentials from the EC2 instance
+                # Ref: https://docs.aws.amazon.com/AmazonECR/latest/userguide/registry_auth.html # pylint: disable=line-too-long
+                region = _extract_region_from_ecr_server(
+                    docker_login_config.server)
+
+                # AWS CLI is not pre-installed on AWS instances, unlike gcloud
+                # on GCP instances, so we need to install it first
+                self._run(INSTALL_AWS_CLI_CMD, wait_for_docker_daemon=False)
+
+                self._run(
+                    f'aws ecr get-login-password --region {region} | '
+                    f'{self.docker_cmd} login --username AWS '
+                    f'--password-stdin '
+                    f'{shlex.quote(docker_login_config.server)}',
+                    wait_for_docker_daemon=True)
             elif docker_login_config.server.endswith('-docker.pkg.dev'):
                 # Docker image server is on GCR, we need to do additional setup
                 # to pull the image.
@@ -367,7 +414,7 @@
             # pylint: disable=anomalous-backslash-in-string
             self._run(
                 'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
-                f'sudo echo "Port {port}" >> /etc/ssh/sshd_config;'
+                f'echo "Port {port}" | sudo tee -a /etc/ssh/sshd_config > /dev/null;'
                 'mkdir -p ~/.ssh;'
                 'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
                 'sudo service ssh start;'
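The tee change fixes a classic sudo pitfall: in `sudo echo "Port ..." >> /etc/ssh/sshd_config`, only echo runs as root, while the `>>` redirection is performed by the calling, unprivileged shell, so appending to the root-owned file fails with permission denied. Piping into `sudo tee -a` performs the append inside the privileged process; the trailing `> /dev/null` merely silences tee's copy of the line to stdout.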
sky/provision/instance_setup.py CHANGED
@@ -136,6 +136,20 @@ def _hint_worker_log_path(cluster_name: str, cluster_info: common.ClusterInfo,
     logger.info(f'Logs of worker nodes can be found at: {worker_log_path}')


+class SSHThreadPoolExecutor(futures.ThreadPoolExecutor):
+    """ThreadPoolExecutor that kills children processes on exit."""
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # The ssh command runner eventually calls log_lib.run_with_log,
+        # which will spawn subprocesses. If we are exiting the context,
+        # we need to kill the children processes to avoid leakage.
+        subprocess_utils.kill_children_processes()
+        self.shutdown()
+        return False
+
+
 def _parallel_ssh_with_cache(func,
                              cluster_name: str,
                              stage_name: str,
@@ -148,7 +162,7 @@ def _parallel_ssh_with_cache(func,
     # as 32 is too large for some machines.
     max_workers = subprocess_utils.get_parallel_threads(
         cluster_info.provider_name)
-    with futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
+    with SSHThreadPoolExecutor(max_workers=max_workers) as pool:
         results = []
         runners = provision.get_command_runners(cluster_info.provider_name,
                                                 cluster_info, **ssh_credentials)
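SSHThreadPoolExecutor works because ThreadPoolExecutor already implements the context-manager protocol: overriding __exit__ lets cleanup run whether the `with` block completes normally or raises, and returning False re-raises any in-flight exception. A toy illustration of the same hook using only the stdlib; the print stands in for the subprocess_utils.kill_children_processes() call in the real code:

    from concurrent import futures

    class CleanupExecutor(futures.ThreadPoolExecutor):

        def __exit__(self, exc_type, exc_val, exc_tb):
            print('reaping child processes')  # kill_children_processes() in the real code
            self.shutdown()
            return False  # do not swallow exceptions raised in the with-block

    with CleanupExecutor(max_workers=2) as pool:
        pool.submit(print, 'task ran')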
sky/provision/lambda_cloud/instance.py CHANGED
@@ -106,34 +106,35 @@ def run_instances(region: str, cluster_name_on_cloud: str,
     created_instance_ids = []
     remote_ssh_key_name = config.authentication_config['remote_key_name']

-    def launch_nodes(node_type: str, quantity: int) -> List[str]:
+    def launch_node(node_type: str) -> str:
         try:
             instance_ids = lambda_client.create_instances(
                 instance_type=config.node_config['InstanceType'],
                 region=region,
                 name=f'{cluster_name_on_cloud}-{node_type}',
-                quantity=quantity,
+                # Quantity cannot actually be greater than 1; see:
+                # https://github.com/skypilot-org/skypilot/issues/7084
+                quantity=1,
                 ssh_key_name=remote_ssh_key_name,
             )
-            logger.info(f'Launched {len(instance_ids)} {node_type} node(s), '
-                        f'instance_ids: {instance_ids}')
-            return instance_ids
+            logger.info(f'Launched {node_type} node, '
+                        f'instance_id: {instance_ids[0]}')
+            return instance_ids[0]
         except Exception as e:
             logger.warning(f'run_instances error: {e}')
             raise

     if head_instance_id is None:
-        instance_ids = launch_nodes('head', 1)
-        assert len(instance_ids) == 1
-        created_instance_ids.append(instance_ids[0])
-        head_instance_id = instance_ids[0]
+        head_instance_id = launch_node('head')
+        created_instance_ids.append(head_instance_id)

     assert head_instance_id is not None, 'head_instance_id should not be None'

     worker_node_count = to_start_count - 1
     if worker_node_count > 0:
-        instance_ids = launch_nodes('worker', worker_node_count)
-        created_instance_ids.extend(instance_ids)
+        for _ in range(worker_node_count):
+            worker_instance_id = launch_node('worker')
+            created_instance_ids.append(worker_instance_id)

     while True:
         instances = _filter_instances(cluster_name_on_cloud, ['active'])
sky/provision/primeintellect/__init__.py ADDED
@@ -0,0 +1,10 @@
+"""Prime Intellect provisioner for SkyPilot."""
+
+from sky.provision.primeintellect.config import bootstrap_instances
+from sky.provision.primeintellect.instance import cleanup_ports
+from sky.provision.primeintellect.instance import get_cluster_info
+from sky.provision.primeintellect.instance import query_instances
+from sky.provision.primeintellect.instance import run_instances
+from sky.provision.primeintellect.instance import stop_instances
+from sky.provision.primeintellect.instance import terminate_instances
+from sky.provision.primeintellect.instance import wait_instances
sky/provision/primeintellect/config.py ADDED
@@ -0,0 +1,11 @@
+"""Prime Intellect configuration bootstrapping."""
+
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del region, cluster_name  # unused
+    return config