skypilot-nightly 1.0.0.dev20250925__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +13 -10
- sky/client/cli/command.py +16 -8
- sky/client/common.py +4 -2
- sky/client/sdk.py +4 -2
- sky/core.py +3 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
- sky/dashboard/out/_next/static/chunks/{webpack-16ba1d7187d2e3b1.js → webpack-8e64d11e58eab5cb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +54 -31
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +4 -0
- sky/jobs/server/core.py +2 -0
- sky/provision/runpod/__init__.py +2 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/server/requests/payloads.py +2 -1
- sky/setup_files/dependencies.py +1 -1
- sky/skylet/constants.py +4 -1
- sky/skylet/events.py +42 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +8 -3
- sky/utils/db/db_utils.py +5 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/kubernetes_deploy_utils.py +35 -12
- sky/volumes/server/core.py +1 -0
- sky/volumes/volume.py +16 -17
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +37 -37
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +51 -50
- sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +0 -16
- /sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0
sky/dashboard/out/users.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-8e64d11e58eab5cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-018bf31cda52e11b.js" defer=""></script><script src="/dashboard/_next/static/VXU6_xE28M55BOdwmUUJS/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/VXU6_xE28M55BOdwmUUJS/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/users","query":{},"buildId":"VXU6_xE28M55BOdwmUUJS","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/volumes.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-8e64d11e58eab5cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-739726d6b823f532.js" defer=""></script><script src="/dashboard/_next/static/VXU6_xE28M55BOdwmUUJS/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/VXU6_xE28M55BOdwmUUJS/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"VXU6_xE28M55BOdwmUUJS","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/workspace/new.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-8e64d11e58eab5cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/VXU6_xE28M55BOdwmUUJS/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/VXU6_xE28M55BOdwmUUJS/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"VXU6_xE28M55BOdwmUUJS","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/workspaces/[name].html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-8e64d11e58eab5cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1836-37fede578e2da5f8.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-d8bc3a2b9cf839a9.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-f6818c84ed8f1c86.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-d0782b9251f0fcd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-88c7c8d69b0b6dba.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-159df2d4c441a9d1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-af76bb06dbb3954f.js" defer=""></script><script src="/dashboard/_next/static/VXU6_xE28M55BOdwmUUJS/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/VXU6_xE28M55BOdwmUUJS/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"VXU6_xE28M55BOdwmUUJS","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/workspaces.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-8e64d11e58eab5cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7528cc0ef8c522c5.js" defer=""></script><script src="/dashboard/_next/static/VXU6_xE28M55BOdwmUUJS/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/VXU6_xE28M55BOdwmUUJS/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"VXU6_xE28M55BOdwmUUJS","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/global_user_state.py
CHANGED
|
@@ -185,6 +185,14 @@ cluster_history_table = sqlalchemy.Table(
|
|
|
185
185
|
sqlalchemy.Column('provision_log_path',
|
|
186
186
|
sqlalchemy.Text,
|
|
187
187
|
server_default=None),
|
|
188
|
+
sqlalchemy.Column('last_activity_time',
|
|
189
|
+
sqlalchemy.Integer,
|
|
190
|
+
server_default=None,
|
|
191
|
+
index=True),
|
|
192
|
+
sqlalchemy.Column('launched_at',
|
|
193
|
+
sqlalchemy.Integer,
|
|
194
|
+
server_default=None,
|
|
195
|
+
index=True),
|
|
188
196
|
)
|
|
189
197
|
|
|
190
198
|
|
|
@@ -720,6 +728,10 @@ def add_or_update_cluster(cluster_name: str,
|
|
|
720
728
|
conditional_values.get('last_creation_command'),
|
|
721
729
|
}
|
|
722
730
|
|
|
731
|
+
# Calculate last_activity_time and launched_at from usage_intervals
|
|
732
|
+
last_activity_time = _get_cluster_last_activity_time(usage_intervals)
|
|
733
|
+
launched_at = _get_cluster_launch_time(usage_intervals)
|
|
734
|
+
|
|
723
735
|
insert_stmnt = insert_func(cluster_history_table).values(
|
|
724
736
|
cluster_hash=cluster_hash,
|
|
725
737
|
name=cluster_name,
|
|
@@ -730,6 +742,8 @@ def add_or_update_cluster(cluster_name: str,
|
|
|
730
742
|
user_hash=user_hash,
|
|
731
743
|
workspace=history_workspace,
|
|
732
744
|
provision_log_path=provision_log_path,
|
|
745
|
+
last_activity_time=last_activity_time,
|
|
746
|
+
launched_at=launched_at,
|
|
733
747
|
**creation_info,
|
|
734
748
|
)
|
|
735
749
|
do_update_stmt = insert_stmnt.on_conflict_do_update(
|
|
@@ -746,6 +760,8 @@ def add_or_update_cluster(cluster_name: str,
|
|
|
746
760
|
cluster_history_table.c.user_hash: history_hash,
|
|
747
761
|
cluster_history_table.c.workspace: history_workspace,
|
|
748
762
|
cluster_history_table.c.provision_log_path: provision_log_path,
|
|
763
|
+
cluster_history_table.c.last_activity_time: last_activity_time,
|
|
764
|
+
cluster_history_table.c.launched_at: launched_at,
|
|
749
765
|
**creation_info,
|
|
750
766
|
})
|
|
751
767
|
session.execute(do_update_stmt)
|
|
@@ -1340,17 +1356,33 @@ def _get_cluster_duration(
|
|
|
1340
1356
|
return total_duration
|
|
1341
1357
|
|
|
1342
1358
|
|
|
1359
|
+
def _get_cluster_last_activity_time(
|
|
1360
|
+
usage_intervals: Optional[List[Tuple[int,
|
|
1361
|
+
Optional[int]]]]) -> Optional[int]:
|
|
1362
|
+
last_activity_time = None
|
|
1363
|
+
if usage_intervals:
|
|
1364
|
+
last_interval = usage_intervals[-1]
|
|
1365
|
+
last_activity_time = (last_interval[1] if last_interval[1] is not None
|
|
1366
|
+
else last_interval[0])
|
|
1367
|
+
return last_activity_time
|
|
1368
|
+
|
|
1369
|
+
|
|
1343
1370
|
@_init_db
|
|
1344
1371
|
@metrics_lib.time_me
|
|
1345
1372
|
def _set_cluster_usage_intervals(
|
|
1346
1373
|
cluster_hash: str, usage_intervals: List[Tuple[int,
|
|
1347
1374
|
Optional[int]]]) -> None:
|
|
1348
1375
|
assert _SQLALCHEMY_ENGINE is not None
|
|
1376
|
+
|
|
1377
|
+
# Calculate last_activity_time from usage_intervals
|
|
1378
|
+
last_activity_time = _get_cluster_last_activity_time(usage_intervals)
|
|
1379
|
+
|
|
1349
1380
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1350
1381
|
count = session.query(cluster_history_table).filter_by(
|
|
1351
1382
|
cluster_hash=cluster_hash).update({
|
|
1352
1383
|
cluster_history_table.c.usage_intervals:
|
|
1353
|
-
pickle.dumps(usage_intervals)
|
|
1384
|
+
pickle.dumps(usage_intervals),
|
|
1385
|
+
cluster_history_table.c.last_activity_time: last_activity_time,
|
|
1354
1386
|
})
|
|
1355
1387
|
session.commit()
|
|
1356
1388
|
assert count <= 1, count
|
|
@@ -1706,7 +1738,7 @@ def get_clusters_from_history(
|
|
|
1706
1738
|
current_user_hash = common_utils.get_user_hash()
|
|
1707
1739
|
|
|
1708
1740
|
# Prepare filtering parameters
|
|
1709
|
-
cutoff_time =
|
|
1741
|
+
cutoff_time = 0
|
|
1710
1742
|
if days is not None:
|
|
1711
1743
|
cutoff_time = int(time.time()) - (days * 24 * 60 * 60)
|
|
1712
1744
|
|
|
@@ -1720,7 +1752,9 @@ def get_clusters_from_history(
|
|
|
1720
1752
|
cluster_history_table.c.usage_intervals,
|
|
1721
1753
|
cluster_history_table.c.user_hash,
|
|
1722
1754
|
cluster_history_table.c.workspace.label('history_workspace'),
|
|
1723
|
-
|
|
1755
|
+
cluster_history_table.c.last_activity_time,
|
|
1756
|
+
cluster_history_table.c.launched_at, cluster_table.c.status,
|
|
1757
|
+
cluster_table.c.workspace)
|
|
1724
1758
|
else:
|
|
1725
1759
|
query = session.query(
|
|
1726
1760
|
cluster_history_table.c.cluster_hash,
|
|
@@ -1731,19 +1765,33 @@ def get_clusters_from_history(
|
|
|
1731
1765
|
cluster_history_table.c.last_creation_yaml,
|
|
1732
1766
|
cluster_history_table.c.last_creation_command,
|
|
1733
1767
|
cluster_history_table.c.workspace.label('history_workspace'),
|
|
1734
|
-
|
|
1768
|
+
cluster_history_table.c.last_activity_time,
|
|
1769
|
+
cluster_history_table.c.launched_at, cluster_table.c.status,
|
|
1770
|
+
cluster_table.c.workspace)
|
|
1735
1771
|
|
|
1736
1772
|
query = query.select_from(
|
|
1737
1773
|
cluster_history_table.join(cluster_table,
|
|
1738
1774
|
cluster_history_table.c.cluster_hash ==
|
|
1739
1775
|
cluster_table.c.cluster_hash,
|
|
1740
1776
|
isouter=True))
|
|
1777
|
+
|
|
1778
|
+
# Only include clusters that are either active (status is not None)
|
|
1779
|
+
# or are within the cutoff time (cutoff_time <= last_activity_time).
|
|
1780
|
+
# If days is not specified, we include all clusters by setting
|
|
1781
|
+
# cutoff_time to 0.
|
|
1782
|
+
query = query.filter(
|
|
1783
|
+
(cluster_table.c.status.isnot(None) |
|
|
1784
|
+
(cluster_history_table.c.last_activity_time >= cutoff_time)))
|
|
1785
|
+
|
|
1786
|
+
# Order by launched_at descending (most recent first)
|
|
1787
|
+
query = query.order_by(
|
|
1788
|
+
sqlalchemy.desc(cluster_history_table.c.launched_at))
|
|
1789
|
+
|
|
1741
1790
|
if cluster_hashes is not None:
|
|
1742
1791
|
query = query.filter(
|
|
1743
1792
|
cluster_history_table.c.cluster_hash.in_(cluster_hashes))
|
|
1744
1793
|
rows = query.all()
|
|
1745
1794
|
|
|
1746
|
-
filtered_rows = []
|
|
1747
1795
|
usage_intervals_dict = {}
|
|
1748
1796
|
row_to_user_hash = {}
|
|
1749
1797
|
for row in rows:
|
|
@@ -1753,36 +1801,11 @@ def get_clusters_from_history(
|
|
|
1753
1801
|
row_usage_intervals = pickle.loads(row.usage_intervals)
|
|
1754
1802
|
except (pickle.PickleError, AttributeError):
|
|
1755
1803
|
pass
|
|
1756
|
-
# Parse status
|
|
1757
|
-
status = None
|
|
1758
|
-
if row.status:
|
|
1759
|
-
status = status_lib.ClusterStatus[row.status]
|
|
1760
|
-
# Apply filtering: always include active clusters, filter historical
|
|
1761
|
-
# ones by time
|
|
1762
|
-
if cutoff_time is not None and status is None: # Historical cluster
|
|
1763
|
-
# For historical clusters, check if they were used recently
|
|
1764
|
-
# Use the most recent activity from usage_intervals to determine
|
|
1765
|
-
# last use
|
|
1766
|
-
# Find the most recent activity time from usage_intervals
|
|
1767
|
-
last_activity_time = None
|
|
1768
|
-
if row_usage_intervals:
|
|
1769
|
-
# Get the end time of the last interval (or start time if
|
|
1770
|
-
# still running)
|
|
1771
|
-
last_interval = row_usage_intervals[-1]
|
|
1772
|
-
last_activity_time = (last_interval[1] if last_interval[1]
|
|
1773
|
-
is not None else last_interval[0])
|
|
1774
|
-
|
|
1775
|
-
# Skip historical clusters that haven't been used recently
|
|
1776
|
-
if last_activity_time is None or last_activity_time < cutoff_time:
|
|
1777
|
-
continue
|
|
1778
|
-
|
|
1779
|
-
filtered_rows.append(row)
|
|
1780
1804
|
usage_intervals_dict[row.cluster_hash] = row_usage_intervals
|
|
1781
1805
|
user_hash = (row.user_hash
|
|
1782
1806
|
if row.user_hash is not None else current_user_hash)
|
|
1783
1807
|
row_to_user_hash[row.cluster_hash] = user_hash
|
|
1784
1808
|
|
|
1785
|
-
rows = filtered_rows
|
|
1786
1809
|
user_hashes = set(row_to_user_hash.values())
|
|
1787
1810
|
user_hash_to_user = _get_users(user_hashes)
|
|
1788
1811
|
cluster_hashes = set(row_to_user_hash.keys())
|
|
@@ -1797,10 +1820,10 @@ def get_clusters_from_history(
|
|
|
1797
1820
|
user_name = user.name if user is not None else None
|
|
1798
1821
|
if not abbreviate_response:
|
|
1799
1822
|
last_event = last_cluster_event_dict.get(row.cluster_hash, None)
|
|
1823
|
+
launched_at = row.launched_at
|
|
1800
1824
|
usage_intervals: Optional[List[Tuple[
|
|
1801
1825
|
int,
|
|
1802
1826
|
Optional[int]]]] = usage_intervals_dict.get(row.cluster_hash, None)
|
|
1803
|
-
launched_at = _get_cluster_launch_time(usage_intervals)
|
|
1804
1827
|
duration = _get_cluster_duration(usage_intervals)
|
|
1805
1828
|
|
|
1806
1829
|
# Parse status
|
sky/jobs/constants.py
CHANGED
|
@@ -10,6 +10,8 @@ JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
|
|
|
10
10
|
|
|
11
11
|
JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
|
|
12
12
|
|
|
13
|
+
JOB_CONTROLLER_INDICATOR_FILE = '~/.sky/is_jobs_controller'
|
|
14
|
+
|
|
13
15
|
CONSOLIDATED_SIGNAL_PATH = os.path.expanduser('~/.sky/signals/')
|
|
14
16
|
SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
|
|
15
17
|
# Resources as a dict for the jobs controller.
|
sky/jobs/controller.py
CHANGED
|
@@ -18,6 +18,7 @@ import sky
|
|
|
18
18
|
from sky import core
|
|
19
19
|
from sky import exceptions
|
|
20
20
|
from sky import sky_logging
|
|
21
|
+
from sky import skypilot_config
|
|
21
22
|
from sky.backends import backend_utils
|
|
22
23
|
from sky.backends import cloud_vm_ray_backend
|
|
23
24
|
from sky.data import data_utils
|
|
@@ -928,6 +929,9 @@ class Controller:
|
|
|
928
929
|
ctx.override_envs({key: value})
|
|
929
930
|
job_logger.debug(
|
|
930
931
|
f'Set environment variable: {key}={value}')
|
|
932
|
+
# Reload the skypilot config for this context to make sure
|
|
933
|
+
# the latest config is used.
|
|
934
|
+
skypilot_config.reload_config()
|
|
931
935
|
else:
|
|
932
936
|
job_logger.error(
|
|
933
937
|
'Context is None, cannot set environment variables')
|
sky/jobs/server/core.py
CHANGED
|
@@ -368,6 +368,8 @@ def launch(
|
|
|
368
368
|
'priority': priority,
|
|
369
369
|
'consolidation_mode_job_id': consolidation_mode_job_id,
|
|
370
370
|
'pool': pool,
|
|
371
|
+
'job_controller_indicator_file':
|
|
372
|
+
managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE,
|
|
371
373
|
**controller_utils.shared_controller_vars_to_fill(
|
|
372
374
|
controller,
|
|
373
375
|
remote_user_config_path=remote_user_config_path,
|
sky/provision/runpod/__init__.py
CHANGED
|
@@ -11,4 +11,6 @@ from sky.provision.runpod.instance import terminate_instances
|
|
|
11
11
|
from sky.provision.runpod.instance import wait_instances
|
|
12
12
|
from sky.provision.runpod.volume import apply_volume
|
|
13
13
|
from sky.provision.runpod.volume import delete_volume
|
|
14
|
+
from sky.provision.runpod.volume import get_all_volumes_usedby
|
|
14
15
|
from sky.provision.runpod.volume import get_volume_usedby
|
|
16
|
+
from sky.provision.runpod.volume import map_all_volumes_usedby
|
sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""Add last_activity_time and launched_at to cluster history.
|
|
2
|
+
|
|
3
|
+
Revision ID: 009
|
|
4
|
+
Revises: 008
|
|
5
|
+
Create Date: 2025-09-24
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
# pylint: disable=invalid-name
|
|
9
|
+
import pickle
|
|
10
|
+
from typing import Sequence, Union
|
|
11
|
+
|
|
12
|
+
from alembic import op
|
|
13
|
+
import sqlalchemy as sa
|
|
14
|
+
|
|
15
|
+
from sky.utils.db import db_utils
|
|
16
|
+
|
|
17
|
+
# revision identifiers, used by Alembic.
|
|
18
|
+
revision: str = '009'
|
|
19
|
+
down_revision: Union[str, Sequence[str], None] = '008'
|
|
20
|
+
branch_labels: Union[str, Sequence[str], None] = None
|
|
21
|
+
depends_on: Union[str, Sequence[str], None] = None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def upgrade():
|
|
25
|
+
"""Add last_activity_time and launched_at columns to cluster history."""
|
|
26
|
+
with op.get_context().autocommit_block():
|
|
27
|
+
# Add the columns with indices
|
|
28
|
+
db_utils.add_column_to_table_alembic('cluster_history',
|
|
29
|
+
'last_activity_time',
|
|
30
|
+
sa.Integer(),
|
|
31
|
+
server_default=None,
|
|
32
|
+
index=True)
|
|
33
|
+
|
|
34
|
+
db_utils.add_column_to_table_alembic('cluster_history',
|
|
35
|
+
'launched_at',
|
|
36
|
+
sa.Integer(),
|
|
37
|
+
server_default=None,
|
|
38
|
+
index=True)
|
|
39
|
+
|
|
40
|
+
# Populate the columns for existing rows
|
|
41
|
+
_populate_cluster_history_columns()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _populate_cluster_history_columns():
|
|
45
|
+
"""Populate last_activity_time and launched_at for existing rows using
|
|
46
|
+
usage_intervals logic."""
|
|
47
|
+
connection = op.get_bind()
|
|
48
|
+
|
|
49
|
+
# Get all existing rows with usage_intervals
|
|
50
|
+
result = connection.execute(
|
|
51
|
+
sa.text('SELECT cluster_hash, usage_intervals FROM cluster_history '
|
|
52
|
+
'WHERE usage_intervals IS NOT NULL'))
|
|
53
|
+
|
|
54
|
+
for row in result:
|
|
55
|
+
cluster_hash = row[0]
|
|
56
|
+
usage_intervals_blob = row[1]
|
|
57
|
+
|
|
58
|
+
try:
|
|
59
|
+
# Deserialize the usage_intervals
|
|
60
|
+
usage_intervals = pickle.loads(usage_intervals_blob)
|
|
61
|
+
|
|
62
|
+
if usage_intervals:
|
|
63
|
+
# Calculate last_activity_time: end time of last interval
|
|
64
|
+
# or start time if still running
|
|
65
|
+
last_interval = usage_intervals[-1]
|
|
66
|
+
last_activity_time = (last_interval[1] if last_interval[1]
|
|
67
|
+
is not None else last_interval[0])
|
|
68
|
+
|
|
69
|
+
# Calculate launched_at: start time of first interval
|
|
70
|
+
launched_at = usage_intervals[0][0]
|
|
71
|
+
|
|
72
|
+
# Update the row with both calculated values
|
|
73
|
+
connection.execute(
|
|
74
|
+
sa.text('UPDATE cluster_history '
|
|
75
|
+
'SET last_activity_time = :last_activity_time, '
|
|
76
|
+
'launched_at = :launched_at '
|
|
77
|
+
'WHERE cluster_hash = :cluster_hash'), {
|
|
78
|
+
'last_activity_time': last_activity_time,
|
|
79
|
+
'launched_at': launched_at,
|
|
80
|
+
'cluster_hash': cluster_hash
|
|
81
|
+
})
|
|
82
|
+
except (pickle.PickleError, AttributeError, IndexError):
|
|
83
|
+
# Skip rows with corrupted or invalid usage_intervals
|
|
84
|
+
continue
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def downgrade():
|
|
88
|
+
"""No-op for backward compatibility."""
|
|
89
|
+
pass
|
sky/server/requests/payloads.py
CHANGED
|
@@ -683,8 +683,9 @@ class LocalUpBody(RequestBody):
|
|
|
683
683
|
ssh_key: Optional[str] = None
|
|
684
684
|
cleanup: bool = False
|
|
685
685
|
context_name: Optional[str] = None
|
|
686
|
-
name: Optional[str] = None
|
|
687
686
|
password: Optional[str] = None
|
|
687
|
+
name: Optional[str] = None
|
|
688
|
+
port_start: Optional[int] = None
|
|
688
689
|
|
|
689
690
|
|
|
690
691
|
class LocalDownBody(RequestBody):
|
sky/setup_files/dependencies.py
CHANGED
sky/skylet/constants.py
CHANGED
|
@@ -57,6 +57,9 @@ SKY_REMOTE_PYTHON_ENV: str = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
|
|
|
57
57
|
ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
|
|
58
58
|
# uv is used for venv and pip, much faster than python implementations.
|
|
59
59
|
SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
|
|
60
|
+
# set UV_SYSTEM_PYTHON to false in case the
|
|
61
|
+
# user provided docker image set it to true.
|
|
62
|
+
# unset PYTHONPATH in case the user provided docker image set it.
|
|
60
63
|
SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
|
|
61
64
|
f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
|
|
62
65
|
# This won't reinstall uv if it's already installed, so it's safe to re-run.
|
|
@@ -97,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
|
97
100
|
# cluster yaml is updated.
|
|
98
101
|
#
|
|
99
102
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
|
100
|
-
SKYLET_VERSION = '
|
|
103
|
+
SKYLET_VERSION = '19'
|
|
101
104
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
|
102
105
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
|
103
106
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
sky/skylet/events.py
CHANGED
|
@@ -11,6 +11,7 @@ import psutil
|
|
|
11
11
|
from sky import clouds
|
|
12
12
|
from sky import sky_logging
|
|
13
13
|
from sky.backends import cloud_vm_ray_backend
|
|
14
|
+
from sky.jobs import constants as managed_job_constants
|
|
14
15
|
from sky.jobs import scheduler
|
|
15
16
|
from sky.jobs import state as managed_job_state
|
|
16
17
|
from sky.jobs import utils as managed_job_utils
|
|
@@ -21,6 +22,7 @@ from sky.skylet import job_lib
|
|
|
21
22
|
from sky.usage import usage_lib
|
|
22
23
|
from sky.utils import cluster_utils
|
|
23
24
|
from sky.utils import registry
|
|
25
|
+
from sky.utils import subprocess_utils
|
|
24
26
|
from sky.utils import ux_utils
|
|
25
27
|
from sky.utils import yaml_utils
|
|
26
28
|
|
|
@@ -74,6 +76,46 @@ class ManagedJobEvent(SkyletEvent):
|
|
|
74
76
|
EVENT_INTERVAL_SECONDS = 300
|
|
75
77
|
|
|
76
78
|
def _run(self):
|
|
79
|
+
if not os.path.exists(
|
|
80
|
+
os.path.expanduser(
|
|
81
|
+
managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE)):
|
|
82
|
+
# Note: since the skylet is started before the user setup (in
|
|
83
|
+
# jobs-controller.yaml.j2) runs, it's possible that we hit this
|
|
84
|
+
# before the indicator file is written. However, since we will wait
|
|
85
|
+
# EVENT_INTERVAL_SECONDS before the first run, this should be very
|
|
86
|
+
# unlikely.
|
|
87
|
+
logger.info('No jobs controller indicator file found.')
|
|
88
|
+
all_job_ids = managed_job_state.get_all_job_ids_by_name(None)
|
|
89
|
+
if not all_job_ids:
|
|
90
|
+
logger.info('No jobs running. Stopping controllers.')
|
|
91
|
+
# TODO(cooperc): Move this to a shared function also called by
|
|
92
|
+
# sdk.api_stop(). (#7229)
|
|
93
|
+
try:
|
|
94
|
+
with open(os.path.expanduser(
|
|
95
|
+
scheduler.JOB_CONTROLLER_PID_PATH),
|
|
96
|
+
'r',
|
|
97
|
+
encoding='utf-8') as f:
|
|
98
|
+
pids = f.read().split('\n')[:-1]
|
|
99
|
+
for pid in pids:
|
|
100
|
+
if subprocess_utils.is_process_alive(
|
|
101
|
+
int(pid.strip())):
|
|
102
|
+
subprocess_utils.kill_children_processes(
|
|
103
|
+
parent_pids=[int(pid.strip())], force=True)
|
|
104
|
+
os.remove(
|
|
105
|
+
os.path.expanduser(scheduler.JOB_CONTROLLER_PID_PATH))
|
|
106
|
+
except FileNotFoundError:
|
|
107
|
+
# its fine we will create it
|
|
108
|
+
pass
|
|
109
|
+
except Exception as e: # pylint: disable=broad-except
|
|
110
|
+
# in case we get perm issues or something is messed up, just
|
|
111
|
+
# ignore it and assume the process is dead
|
|
112
|
+
logger.error(
|
|
113
|
+
f'Error looking at job controller pid file: {e}')
|
|
114
|
+
pass
|
|
115
|
+
logger.info(f'{len(all_job_ids)} jobs running. Assuming the '
|
|
116
|
+
'indicator file hasn\'t been written yet.')
|
|
117
|
+
return
|
|
118
|
+
|
|
77
119
|
logger.info('=== Updating managed job status ===')
|
|
78
120
|
managed_job_utils.update_managed_jobs_statuses()
|
|
79
121
|
scheduler.maybe_start_controllers()
|
|
@@ -36,6 +36,9 @@ setup: |
|
|
|
36
36
|
grep -q 'alias sky-env=' ~/.bashrc || echo 'alias sky-env="{{ sky_activate_python_env }}"' >> ~/.bashrc
|
|
37
37
|
{% endif %}
|
|
38
38
|
|
|
39
|
+
# This is used by the skylet events to check if we are a jobs controller.
|
|
40
|
+
touch {{job_controller_indicator_file}}
|
|
41
|
+
|
|
39
42
|
run: |
|
|
40
43
|
{%- if consolidation_mode_job_id is none %}
|
|
41
44
|
{{ sky_activate_python_env }}
|
|
@@ -901,15 +901,20 @@ available_node_types:
|
|
|
901
901
|
{{ conda_installation_commands }}
|
|
902
902
|
{{ ray_installation_commands }}
|
|
903
903
|
|
|
904
|
-
|
|
904
|
+
# set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
|
|
905
|
+
# unset PYTHONPATH in case the user provided docker image set it.
|
|
906
|
+
VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip install skypilot[kubernetes,remote]
|
|
905
907
|
# Wait for `patch` package to be installed before applying ray patches
|
|
906
908
|
until dpkg -l | grep -q "^ii patch "; do
|
|
907
909
|
sleep 0.1
|
|
908
910
|
echo "Waiting for patch package to be installed..."
|
|
909
911
|
done
|
|
910
912
|
# Apply Ray patches for progress bar fix
|
|
911
|
-
|
|
912
|
-
|
|
913
|
+
# set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
|
|
914
|
+
# unset PYTHONPATH in case the user provided docker image set it.
|
|
915
|
+
# ~/.sky/python_path is seeded by conda_installation_commands
|
|
916
|
+
VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
|
|
917
|
+
$(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
|
|
913
918
|
}
|
|
914
919
|
touch /tmp/ray_skypilot_installation_complete
|
|
915
920
|
echo "=== Ray and skypilot installation completed ==="
|
sky/utils/db/db_utils.py
CHANGED
|
@@ -201,6 +201,7 @@ def add_column_to_table_alembic(
|
|
|
201
201
|
server_default: Optional[str] = None,
|
|
202
202
|
copy_from: Optional[str] = None,
|
|
203
203
|
value_to_replace_existing_entries: Optional[Any] = None,
|
|
204
|
+
index: Optional[bool] = None,
|
|
204
205
|
):
|
|
205
206
|
"""Add a column to a table using Alembic operations.
|
|
206
207
|
|
|
@@ -215,6 +216,8 @@ def add_column_to_table_alembic(
|
|
|
215
216
|
copy_from: Column name to copy values from (for existing rows)
|
|
216
217
|
value_to_replace_existing_entries: Default value for existing NULL
|
|
217
218
|
entries
|
|
219
|
+
index: If True, create an index on this column. If None, no index
|
|
220
|
+
is created.
|
|
218
221
|
"""
|
|
219
222
|
from alembic import op # pylint: disable=import-outside-toplevel
|
|
220
223
|
|
|
@@ -222,7 +225,8 @@ def add_column_to_table_alembic(
|
|
|
222
225
|
# Create the column with server_default if provided
|
|
223
226
|
column = sqlalchemy.Column(column_name,
|
|
224
227
|
column_type,
|
|
225
|
-
server_default=server_default)
|
|
228
|
+
server_default=server_default,
|
|
229
|
+
index=index)
|
|
226
230
|
op.add_column(table_name, column)
|
|
227
231
|
|
|
228
232
|
# Handle data migration
|
sky/utils/db/migration_utils.py
CHANGED
|
@@ -17,7 +17,7 @@ logger = sky_logging.init_logger(__name__)
|
|
|
17
17
|
DB_INIT_LOCK_TIMEOUT_SECONDS = 10
|
|
18
18
|
|
|
19
19
|
GLOBAL_USER_STATE_DB_NAME = 'state_db'
|
|
20
|
-
GLOBAL_USER_STATE_VERSION = '008'
|
|
20
|
+
GLOBAL_USER_STATE_VERSION = '009'
|
|
21
21
|
GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
|
|
22
22
|
|
|
23
23
|
SPOT_JOBS_DB_NAME = 'spot_jobs_db'
|