skypilot-nightly 1.0.0.dev20250915__py3-none-any.whl → 1.0.0.dev20250918__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +68 -4
- sky/authentication.py +25 -0
- sky/backends/__init__.py +3 -2
- sky/backends/backend_utils.py +16 -12
- sky/backends/cloud_vm_ray_backend.py +61 -4
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/client/sdk.py +6 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/primeintellect.py +314 -0
- sky/core.py +10 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/3015-ba5be550eb80fd8c.js +1 -0
- sky/dashboard/out/_next/static/chunks/5339.4a881570243431a5.js +51 -0
- sky/dashboard/out/_next/static/chunks/{6856-e0754534b3015377.js → 6856-9a2538f38c004652.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{6990-11c8e9b982e8ffec.js → 6990-f6818c84ed8f1c86.js} +1 -1
- sky/dashboard/out/_next/static/chunks/8969-a3e3f0683e19d340.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-d1e29b3aa66bf4cf.js → webpack-487697b47d8c5e50.js} +1 -1
- sky/dashboard/out/_next/static/{dG6B0i0HO4jIoKb4ZFYJ_ → k1mo5xWZrV9djgjd0moOT}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +42 -34
- sky/jobs/server/server.py +14 -1
- sky/jobs/state.py +26 -1
- sky/provision/__init__.py +1 -0
- sky/provision/docker_utils.py +50 -3
- sky/provision/instance_setup.py +15 -1
- sky/provision/lambda_cloud/instance.py +12 -11
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/resources.py +9 -1
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_utils.py +29 -12
- sky/serve/server/core.py +37 -19
- sky/serve/server/impl.py +221 -129
- sky/server/common.py +13 -0
- sky/server/constants.py +3 -0
- sky/server/requests/executor.py +23 -6
- sky/server/server.py +10 -5
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/constants.py +5 -3
- sky/skylet/services.py +98 -0
- sky/skylet/skylet.py +3 -1
- sky/skypilot_config.py +10 -3
- sky/templates/kubernetes-ray.yml.j2 +22 -12
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/METADATA +39 -38
- {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/RECORD +74 -62
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +0 -51
- sky/dashboard/out/_next/static/chunks/8969-0487dfbf149d9e53.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +0 -6
- /sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-7528cc0ef8c522c5.js} +0 -0
- /sky/dashboard/out/_next/static/{dG6B0i0HO4jIoKb4ZFYJ_ → k1mo5xWZrV9djgjd0moOT}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250915.dist-info → skypilot_nightly-1.0.0.dev20250918.dist-info}/top_level.txt +0 -0
sky/dashboard/out/workspaces/[name].html
CHANGED
@@ -1 +1 @@
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-487697b47d8c5e50.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1836-37fede578e2da5f8.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-a3e3f0683e19d340.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-f6818c84ed8f1c86.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-408ed10b2f9fce17.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/3015-ba5be550eb80fd8c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-159df2d4c441a9d1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-af76bb06dbb3954f.js" defer=""></script><script src="/dashboard/_next/static/k1mo5xWZrV9djgjd0moOT/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/k1mo5xWZrV9djgjd0moOT/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"k1mo5xWZrV9djgjd0moOT","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspaces.html
CHANGED
@@ -1 +1 @@
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-487697b47d8c5e50.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7528cc0ef8c522c5.js" defer=""></script><script src="/dashboard/_next/static/k1mo5xWZrV9djgjd0moOT/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/k1mo5xWZrV9djgjd0moOT/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"k1mo5xWZrV9djgjd0moOT","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/global_user_state.py
CHANGED
@@ -516,7 +516,7 @@ def add_or_update_cluster(cluster_name: str,
 task_config: Optional[Dict[str, Any]] = None,
 is_managed: bool = False,
 provision_log_path: Optional[str] = None,
-
+existing_cluster_hash: Optional[str] = None):
 """Adds or updates cluster_name -> cluster_handle mapping.

 Args:
@@ -532,10 +532,12 @@ def add_or_update_cluster(cluster_name: str,
 is_managed: Whether the cluster is launched by the
 controller.
 provision_log_path: Absolute path to provision.log, if available.
-
-
+existing_cluster_hash: If specified, the cluster will be updated
+only if the cluster_hash matches. If a cluster does not exist,
+it will not be inserted and an error will be raised.
 """
 assert _SQLALCHEMY_ENGINE is not None
+
 # FIXME: launched_at will be changed when `sky launch -c` is called.
 handle = pickle.dumps(cluster_handle)
 cluster_launched_at = int(time.time()) if is_launch else None
@@ -631,17 +633,17 @@ def add_or_update_cluster(cluster_name: str,
 session.rollback()
 raise ValueError('Unsupported database dialect')

-if
+if existing_cluster_hash is not None:
 count = session.query(cluster_table).filter_by(
-name=cluster_name).update({
+name=cluster_name, cluster_hash=existing_cluster_hash).update({
 **conditional_values, cluster_table.c.handle: handle,
 cluster_table.c.status: status.value,
-cluster_table.c.cluster_hash: cluster_hash,
 cluster_table.c.status_updated_at: status_updated_at
 })
 assert count <= 1
 if count == 0:
-raise ValueError(f'Cluster {cluster_name}
+raise ValueError(f'Cluster {cluster_name} with hash '
+f'{existing_cluster_hash} not found.')
 else:
 insert_stmnt = insert_func(cluster_table).values(
 name=cluster_name,
@@ -1235,16 +1237,16 @@ def _get_cluster_usage_intervals(
 return pickle.loads(row.usage_intervals)


-def _get_cluster_launch_time(
-
+def _get_cluster_launch_time(
+usage_intervals: List[Tuple[int, Optional[int]]]) -> Optional[int]:
 if usage_intervals is None:
 return None
 return usage_intervals[0][0]


-def _get_cluster_duration(
+def _get_cluster_duration(
+usage_intervals: List[Tuple[int, Optional[int]]]) -> int:
 total_duration = 0
-usage_intervals = _get_cluster_usage_intervals(cluster_hash)

 if usage_intervals is None:
 return total_duration
@@ -1537,11 +1539,36 @@ def get_clusters_from_history(
 if days is not None:
 cutoff_time = int(time.time()) - (days * 24 * 60 * 60)

+current_user_hash = common_utils.get_user_hash()
+
+row_to_user_hash = {}
+usage_intervals_dict = {}
+for row in rows:
+user_hash = (row.user_hash
+if row.user_hash is not None else current_user_hash)
+row_to_user_hash[row.cluster_hash] = user_hash
+if row.usage_intervals:
+try:
+usage_intervals_dict[row.cluster_hash] = pickle.loads(
+row.usage_intervals)
+except (pickle.PickleError, AttributeError):
+usage_intervals_dict[row.cluster_hash] = []
+user_hashes = set(row_to_user_hash.values())
+user_hash_to_user = _get_users(user_hashes)
+
+cluster_hashes = set(row_to_user_hash.keys())
+last_cluster_event_dict = _get_last_cluster_event_multiple(
+cluster_hashes, ClusterEventType.STATUS_CHANGE)
+
 records = []
 for row in rows:
-user_hash =
-
-
+user_hash = row_to_user_hash[row.cluster_hash]
+user = user_hash_to_user.get(user_hash, None)
+user_name = user.name if user is not None else None
+last_event = last_cluster_event_dict.get(row.cluster_hash, None)
+usage_intervals = usage_intervals_dict.get(row.cluster_hash, None)
+launched_at = _get_cluster_launch_time(usage_intervals)
+duration = _get_cluster_duration(usage_intervals)

 # Parse status
 status = None
@@ -1554,13 +1581,6 @@ def get_clusters_from_history(
 # For historical clusters, check if they were used recently
 # Use the most recent activity from usage_intervals to determine
 # last use
-usage_intervals = []
-if row.usage_intervals:
-try:
-usage_intervals = pickle.loads(row.usage_intervals)
-except (pickle.PickleError, AttributeError):
-usage_intervals = []
-
 # Find the most recent activity time from usage_intervals
 last_activity_time = None
 if usage_intervals:
@@ -1582,17 +1602,6 @@ def get_clusters_from_history(
 except (pickle.PickleError, AttributeError):
 launched_resources = None

-# Parse usage intervals safely
-usage_intervals = []
-if row.usage_intervals:
-try:
-usage_intervals = pickle.loads(row.usage_intervals)
-except (pickle.PickleError, AttributeError):
-usage_intervals = []
-
-# Get user name from user hash
-user = get_user(user_hash)
-user_name = user.name if user is not None else None
 workspace = (row.history_workspace
 if row.history_workspace else row.workspace)

@@ -1610,8 +1619,7 @@ def get_clusters_from_history(
 'workspace': workspace,
 'last_creation_yaml': row.last_creation_yaml,
 'last_creation_command': row.last_creation_command,
-'last_event':
-row.cluster_hash, event_type=ClusterEventType.STATUS_CHANGE),
+'last_event': last_event,
 }

 records.append(record)
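Aside (illustration, not part of the diff): the new existing_cluster_hash argument turns the update into a compare-and-set. The row is modified only when the stored cluster_hash still matches, and zero matched rows raises instead of falling through to an insert, so a stale caller cannot resurrect or overwrite a re-provisioned cluster. A minimal sketch of the same pattern in SQLAlchemy 1.4+ Core, with a made-up clusters table rather than SkyPilot's schema:

import sqlalchemy as sa

metadata = sa.MetaData()
clusters = sa.Table(
    'clusters', metadata,
    sa.Column('name', sa.Text, primary_key=True),
    sa.Column('cluster_hash', sa.Text),
    sa.Column('status', sa.Text),
)

def update_if_hash_matches(engine, name, expected_hash, new_status):
    # Compare-and-set: only update when the stored hash still matches.
    with engine.begin() as conn:
        result = conn.execute(
            sa.update(clusters)
            .where(clusters.c.name == name,
                   clusters.c.cluster_hash == expected_hash)
            .values(status=new_status))
        if result.rowcount == 0:
            raise ValueError(
                f'Cluster {name} with hash {expected_hash} not found.')

engine = sa.create_engine('sqlite://')
metadata.create_all(engine)
with engine.begin() as conn:
    conn.execute(clusters.insert().values(
        name='demo', cluster_hash='abc', status='INIT'))
update_if_hash_matches(engine, 'demo', 'abc', 'UP')      # succeeds
# update_if_hash_matches(engine, 'demo', 'stale', 'UP')  # would raise

The other half of this change batches the per-row user and last-event lookups before the loop (row_to_user_hash, _get_users, _get_last_cluster_event_multiple) instead of calling get_user and unpickling usage_intervals once per row.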
|
sky/jobs/server/server.py
CHANGED
@@ -5,6 +5,7 @@ import pathlib
 import fastapi

 from sky import sky_logging
+from sky.jobs import utils as managed_jobs_utils
 from sky.jobs.server import core
 from sky.server import common as server_common
 from sky.server import stream_utils
@@ -22,12 +23,24 @@ router = fastapi.APIRouter()
 @router.post('/launch')
 async def launch(request: fastapi.Request,
 jobs_launch_body: payloads.JobsLaunchBody) -> None:
+# In consolidation mode, the jobs controller will use sky.launch on the same
+# API server to launch the underlying job cluster. If you start run many
+# jobs.launch requests, some may be blocked for a long time by sky.launch
+# requests triggered by earlier jobs, which leads to confusing behavior as
+# the jobs.launch requests trickle though. Also, since we don't have to
+# actually launch a jobs controller sky cluster, the jobs.launch request is
+# much quicker in consolidation mode. So we avoid the issue by just using
+# the short executor instead - then jobs.launch will not be blocked by
+# sky.launch.
+consolidation_mode = managed_jobs_utils.is_consolidation_mode()
+schedule_type = (api_requests.ScheduleType.SHORT
+if consolidation_mode else api_requests.ScheduleType.LONG)
 executor.schedule_request(
 request_id=request.state.request_id,
 request_name='jobs.launch',
 request_body=jobs_launch_body,
 func=core.launch,
-schedule_type=
+schedule_type=schedule_type,
 request_cluster_name=common.JOB_CONTROLLER_NAME,
 )
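Aside (illustration, not part of the diff): routing jobs.launch to the short-request executor in consolidation mode is a head-of-line-blocking fix; quick requests get their own pool so they never queue behind long-running sky.launch work. A toy sketch of that scheduling split, with invented names rather than SkyPilot's executor API:

import time
from concurrent.futures import ThreadPoolExecutor

long_pool = ThreadPoolExecutor(max_workers=1)   # long-running sky.launch work
short_pool = ThreadPoolExecutor(max_workers=1)  # quick jobs.launch bookkeeping

def schedule(func, *, long_running: bool):
    # Pick the pool by expected duration so short work is never stuck
    # behind long work.
    pool = long_pool if long_running else short_pool
    return pool.submit(func)

slow = schedule(lambda: time.sleep(5) or 'launch done', long_running=True)
fast = schedule(lambda: 'jobs.launch queued', long_running=False)
print(fast.result(timeout=1))  # returns immediately, not blocked by `slow`
long_pool.shutdown()
short_pool.shutdown()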
sky/jobs/state.py
CHANGED
@@ -613,7 +613,7 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
 """
 assert _SQLALCHEMY_ENGINE_ASYNC is not None
 async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
-
+result = await session.execute(
 sqlalchemy.update(spot_table).where(
 sqlalchemy.and_(
 spot_table.c.spot_job_id == job_id,
@@ -625,6 +625,7 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
 spot_table.c.end_at.is_(None),
 )).values({spot_table.c.status: ManagedJobStatus.PENDING.value})
 )
+count = result.rowcount
 await session.commit()
 if count != 1:
 raise exceptions.ManagedJobStatusError(
@@ -712,7 +713,19 @@ def set_failed(
 where_conditions = [spot_table.c.spot_job_id == job_id]
 if task_id is not None:
 where_conditions.append(spot_table.c.task_id == task_id)
+
+# Handle failure_reason prepending when override_terminal is True
 if override_terminal:
+# Get existing failure_reason with row lock to prevent race
+# conditions
+existing_reason_result = session.execute(
+sqlalchemy.select(spot_table.c.failure_reason).where(
+sqlalchemy.and_(*where_conditions)).with_for_update())
+existing_reason_row = existing_reason_result.fetchone()
+if existing_reason_row and existing_reason_row[0]:
+# Prepend new failure reason to existing one
+fields_to_set[spot_table.c.failure_reason] = (
+failure_reason + '. Previously: ' + existing_reason_row[0])
 # Use COALESCE for end_at to avoid overriding the existing end_at if
 # it's already set.
 fields_to_set[spot_table.c.end_at] = sqlalchemy.func.coalesce(
@@ -1651,7 +1664,19 @@ async def set_failed_async(
 where_conditions = [spot_table.c.spot_job_id == job_id]
 if task_id is not None:
 where_conditions.append(spot_table.c.task_id == task_id)
+
+# Handle failure_reason prepending when override_terminal is True
 if override_terminal:
+# Get existing failure_reason with row lock to prevent race
+# conditions
+existing_reason_result = await session.execute(
+sqlalchemy.select(spot_table.c.failure_reason).where(
+sqlalchemy.and_(*where_conditions)).with_for_update())
+existing_reason_row = existing_reason_result.fetchone()
+if existing_reason_row and existing_reason_row[0]:
+# Prepend new failure reason to existing one
+fields_to_set[spot_table.c.failure_reason] = (
+failure_reason + '. Previously: ' + existing_reason_row[0])
 fields_to_set[spot_table.c.end_at] = sqlalchemy.func.coalesce(
 spot_table.c.end_at, end_time)
 else:
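Aside (illustration, not part of the diff): when a terminal status is overridden, the new failure reason is prepended and the previous one kept after a '. Previously: ' separator, so repeated failures build a newest-first chain. A small standalone sketch of just that string behavior, separate from the database code above:

from typing import Optional

def chain_failure_reason(new_reason: str, existing: Optional[str]) -> str:
    # Newer reasons go first; older ones are preserved after '. Previously: '.
    if existing:
        return new_reason + '. Previously: ' + existing
    return new_reason

first = chain_failure_reason('Driver exited unexpectedly', None)
second = chain_failure_reason('Cluster preempted', first)
print(second)
# Cluster preempted. Previously: Driver exited unexpectedly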
sky/provision/__init__.py
CHANGED
@@ -24,6 +24,7 @@ from sky.provision import kubernetes
 from sky.provision import lambda_cloud
 from sky.provision import nebius
 from sky.provision import oci
+from sky.provision import primeintellect
 from sky.provision import runpod
 from sky.provision import scp
 from sky.provision import seeweb
sky/provision/docker_utils.py
CHANGED
@@ -15,10 +15,14 @@ logger = sky_logging.init_logger(__name__)
 # Configure environment variables. A docker image can have environment variables
 # set in the Dockerfile with `ENV``. We need to export these variables to the
 # shell environment, so that our ssh session can access them.
+# Filter out RAY_RUNTIME_ENV_HOOK to prevent Ray version conflicts.
+# Docker images with Ray 2.48.0+ set this for UV package manager support,
+# but it causes FAILED_DRIVER errors with SkyPilot's Ray 2.9.3.
+# See: https://github.com/skypilot-org/skypilot/pull/7181
 SETUP_ENV_VARS_CMD = (
 'prefix_cmd() '
 '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
-'export -p > ~/container_env_var.sh && '
+'export -p | grep -v RAY_RUNTIME_ENV_HOOK > ~/container_env_var.sh && '
 '$(prefix_cmd) '
 'mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;')

@@ -32,6 +36,30 @@ DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')

 _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30

+# Install AWS CLI v2 (not v1 from pip) as it's required for ECR authentication
+# AWS CLI v2 is installed as a standalone binary, not a Python package. See:
+# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
+INSTALL_AWS_CLI_CMD = (
+'which aws || ((command -v unzip >/dev/null 2>&1 || '
+'(sudo apt-get update && sudo apt-get install -y unzip)) && '
+'curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" '
+'-o "/tmp/awscliv2.zip" && '
+'unzip -q /tmp/awscliv2.zip -d /tmp && sudo /tmp/aws/install '
+'&& rm -rf /tmp/awscliv2.zip /tmp/aws)')
+
+
+def _extract_region_from_ecr_server(server: str) -> str:
+"""Extract AWS region from ECR server URL.
+
+ECR server format: <account-id>.dkr.ecr.<region>.amazonaws.com
+Returns the region part from the URL.
+"""
+# Split: ['<account-id>', 'dkr', 'ecr', '<region>', 'amazonaws', 'com']
+parts = server.split('.')
+if len(parts) >= 6 and parts[1] == 'dkr' and parts[2] == 'ecr':
+return parts[3]
+raise ValueError(f'Invalid ECR server format: {server}')
+

 @dataclasses.dataclass
 class DockerLoginConfig:
@@ -236,9 +264,9 @@ class DockerInitializer:

 # SkyPilot: Docker login if user specified a private docker registry.
 if 'docker_login_config' in self.docker_config:
-# TODO(tian): Maybe support a command to get the login password?
 docker_login_config = DockerLoginConfig(
 **self.docker_config['docker_login_config'])
+
 if docker_login_config.password:
 # Password is allowed to be empty, in that case, we will not run
 # the login command, and assume that the image pulling is
@@ -249,6 +277,25 @@ class DockerInitializer:
 f'--password {shlex.quote(docker_login_config.password)} '
 f'{shlex.quote(docker_login_config.server)}',
 wait_for_docker_daemon=True)
+elif (docker_login_config.server.endswith('.amazonaws.com') and
+'.dkr.ecr.' in docker_login_config.server):
+# AWS ECR: Use aws ecr get-login-password for authentication
+# ECR format: <account-id>.dkr.ecr.<region>.amazonaws.com
+# This command uses the IAM credentials from the EC2 instance
+# Ref: https://docs.aws.amazon.com/AmazonECR/latest/userguide/registry_auth.html # pylint: disable=line-too-long
+region = _extract_region_from_ecr_server(
+docker_login_config.server)
+
+# AWS CLI is not pre-installed on AWS instances, unlike gcloud
+# on GCP instances, so we need to install it first
+self._run(INSTALL_AWS_CLI_CMD, wait_for_docker_daemon=False)
+
+self._run(
+f'aws ecr get-login-password --region {region} | '
+f'{self.docker_cmd} login --username AWS '
+f'--password-stdin '
+f'{shlex.quote(docker_login_config.server)}',
+wait_for_docker_daemon=True)
 elif docker_login_config.server.endswith('-docker.pkg.dev'):
 # Docker image server is on GCR, we need to do additional setup
 # to pull the image.
@@ -367,7 +414,7 @@ class DockerInitializer:
 # pylint: disable=anomalous-backslash-in-string
 self._run(
 'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
-f'
+f'echo "Port {port}" | sudo tee -a /etc/ssh/sshd_config > /dev/null;'
 'mkdir -p ~/.ssh;'
 'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
 'sudo service ssh start;'
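Aside (illustration, not part of the diff): the new ECR branch derives the region from the registry hostname and pipes aws ecr get-login-password into docker login. A standalone sketch of that flow; the account id and region in the example hostname are made up:

import shlex

def extract_region_from_ecr_server(server: str) -> str:
    # <account-id>.dkr.ecr.<region>.amazonaws.com
    parts = server.split('.')
    if len(parts) >= 6 and parts[1] == 'dkr' and parts[2] == 'ecr':
        return parts[3]
    raise ValueError(f'Invalid ECR server format: {server}')

server = '123456789012.dkr.ecr.us-east-1.amazonaws.com'
region = extract_region_from_ecr_server(server)
login_cmd = (f'aws ecr get-login-password --region {region} | '
             f'docker login --username AWS --password-stdin '
             f'{shlex.quote(server)}')
print(login_cmd)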
sky/provision/instance_setup.py
CHANGED
@@ -136,6 +136,20 @@ def _hint_worker_log_path(cluster_name: str, cluster_info: common.ClusterInfo,
 logger.info(f'Logs of worker nodes can be found at: {worker_log_path}')


+class SSHThreadPoolExecutor(futures.ThreadPoolExecutor):
+"""ThreadPoolExecutor that kills children processes on exit."""
+
+def __exit__(self, exc_type, exc_val, exc_tb):
+# ssh command runner eventually calls
+# log_lib.run_with_log, which will spawn
+# subprocesses. If we are exiting the context
+# we need to kill the children processes
+# to avoid leakage.
+subprocess_utils.kill_children_processes()
+self.shutdown()
+return False
+
+
 def _parallel_ssh_with_cache(func,
 cluster_name: str,
 stage_name: str,
@@ -148,7 +162,7 @@ def _parallel_ssh_with_cache(func,
 # as 32 is too large for some machines.
 max_workers = subprocess_utils.get_parallel_threads(
 cluster_info.provider_name)
-with
+with SSHThreadPoolExecutor(max_workers=max_workers) as pool:
 results = []
 runners = provision.get_command_runners(cluster_info.provider_name,
 cluster_info, **ssh_credentials)
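Aside (illustration, not part of the diff): SSHThreadPoolExecutor's __exit__ kills the subprocesses spawned by the SSH runners before shutting the pool down, so leaving the context (including on an exception) does not leak children. A self-contained sketch of the same idea using only the standard library, since subprocess_utils.kill_children_processes is SkyPilot-internal; the usage example relies on the Unix sleep command:

import subprocess
import time
from concurrent import futures

class CleanupThreadPoolExecutor(futures.ThreadPoolExecutor):
    """ThreadPoolExecutor that terminates tracked child processes on exit."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._children = []

    def run_cmd(self, cmd):
        # Worker-side helper: start a child process, remember it, wait for it.
        proc = subprocess.Popen(cmd)
        self._children.append(proc)
        return proc.wait()

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Kill anything still running before shutting the pool down, so an
        # early exit does not leak child processes.
        for proc in self._children:
            if proc.poll() is None:
                proc.kill()
        self.shutdown()
        return False

with CleanupThreadPoolExecutor(max_workers=4) as pool:
    pool.submit(pool.run_cmd, ['sleep', '30'])
    time.sleep(0.5)  # give the child a moment to start
# Leaving the block kills the sleeping child instead of waiting 30 seconds.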
sky/provision/lambda_cloud/instance.py
CHANGED
@@ -106,34 +106,35 @@ def run_instances(region: str, cluster_name_on_cloud: str,
 created_instance_ids = []
 remote_ssh_key_name = config.authentication_config['remote_key_name']

-def
+def launch_node(node_type: str) -> str:
 try:
 instance_ids = lambda_client.create_instances(
 instance_type=config.node_config['InstanceType'],
 region=region,
 name=f'{cluster_name_on_cloud}-{node_type}',
-
+# Quantity cannot actually be greater than 1; see:
+# https://github.com/skypilot-org/skypilot/issues/7084
+quantity=1,
 ssh_key_name=remote_ssh_key_name,
 )
-logger.info(f'Launched {
-f'
-return instance_ids
+logger.info(f'Launched {node_type} node, '
+f'instance_id: {instance_ids[0]}')
+return instance_ids[0]
 except Exception as e:
 logger.warning(f'run_instances error: {e}')
 raise

 if head_instance_id is None:
-
-
-created_instance_ids.append(instance_ids[0])
-head_instance_id = instance_ids[0]
+head_instance_id = launch_node('head')
+created_instance_ids.append(head_instance_id)

 assert head_instance_id is not None, 'head_instance_id should not be None'

 worker_node_count = to_start_count - 1
 if worker_node_count > 0:
-
-
+for _ in range(worker_node_count):
+worker_instance_id = launch_node('worker')
+created_instance_ids.append(worker_instance_id)

 while True:
 instances = _filter_instances(cluster_name_on_cloud, ['active'])
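Aside (illustration, not part of the diff): because create_instances effectively launches a single node per call (quantity must be 1), the refactor launches the head node and then each worker one at a time through a small launch_node helper. A toy sketch of that loop with a stubbed client, not the real Lambda Cloud SDK:

from typing import List

class FakeLambdaClient:
    def __init__(self):
        self._next_id = 0

    def create_instances(self, *, instance_type: str, region: str, name: str,
                         quantity: int, ssh_key_name: str) -> List[str]:
        assert quantity == 1, 'API effectively supports one node per call'
        self._next_id += 1
        return [f'inst-{self._next_id}']

def launch_cluster(client, cluster_name: str, num_nodes: int) -> List[str]:
    created: List[str] = []

    def launch_node(node_type: str) -> str:
        ids = client.create_instances(
            instance_type='gpu_1x_a10', region='us-east-1',
            name=f'{cluster_name}-{node_type}', quantity=1,
            ssh_key_name='sky-key')
        return ids[0]

    created.append(launch_node('head'))
    for _ in range(num_nodes - 1):
        created.append(launch_node('worker'))
    return created

print(launch_cluster(FakeLambdaClient(), 'my-cluster', 3))
# ['inst-1', 'inst-2', 'inst-3']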
sky/provision/primeintellect/__init__.py
ADDED
@@ -0,0 +1,10 @@
+"""Prime Intellect provisioner for SkyPilot."""
+
+from sky.provision.primeintellect.config import bootstrap_instances
+from sky.provision.primeintellect.instance import cleanup_ports
+from sky.provision.primeintellect.instance import get_cluster_info
+from sky.provision.primeintellect.instance import query_instances
+from sky.provision.primeintellect.instance import run_instances
+from sky.provision.primeintellect.instance import stop_instances
+from sky.provision.primeintellect.instance import terminate_instances
+from sky.provision.primeintellect.instance import wait_instances
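Aside (illustration, not part of the diff): the new package re-exports the per-cloud provisioner entry points (bootstrap_instances, run_instances, wait_instances, stop_instances, terminate_instances, get_cluster_info, query_instances, cleanup_ports) so they can be looked up by name on the module. A hedged sketch of how such a name-based dispatch could look; SkyPilot's actual router in sky/provision/__init__.py may differ:

import importlib

def get_provisioner_func(cloud_module: str, func_name: str):
    # e.g. get_provisioner_func('sky.provision.primeintellect', 'run_instances')
    module = importlib.import_module(cloud_module)
    return getattr(module, func_name)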
sky/provision/primeintellect/config.py
ADDED
@@ -0,0 +1,11 @@
+"""Prime Intellect configuration bootstrapping."""
+
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del region, cluster_name  # unused
+    return config