skypilot-nightly 1.0.0.dev20250718__py3-none-any.whl → 1.0.0.dev20250723__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/admin_policy.py +11 -4
- sky/backends/backend_utils.py +50 -24
- sky/backends/cloud_vm_ray_backend.py +41 -38
- sky/catalog/__init__.py +3 -1
- sky/catalog/aws_catalog.py +8 -5
- sky/catalog/azure_catalog.py +8 -5
- sky/catalog/common.py +8 -2
- sky/catalog/cudo_catalog.py +5 -2
- sky/catalog/do_catalog.py +4 -1
- sky/catalog/fluidstack_catalog.py +5 -2
- sky/catalog/gcp_catalog.py +8 -5
- sky/catalog/hyperbolic_catalog.py +5 -2
- sky/catalog/ibm_catalog.py +8 -5
- sky/catalog/lambda_catalog.py +8 -5
- sky/catalog/nebius_catalog.py +8 -5
- sky/catalog/oci_catalog.py +8 -5
- sky/catalog/paperspace_catalog.py +4 -1
- sky/catalog/runpod_catalog.py +5 -2
- sky/catalog/scp_catalog.py +8 -5
- sky/catalog/vast_catalog.py +5 -2
- sky/catalog/vsphere_catalog.py +4 -1
- sky/client/cli/command.py +63 -25
- sky/client/sdk.py +61 -11
- sky/clouds/aws.py +12 -7
- sky/clouds/azure.py +12 -7
- sky/clouds/cloud.py +9 -8
- sky/clouds/cudo.py +13 -7
- sky/clouds/do.py +12 -7
- sky/clouds/fluidstack.py +11 -6
- sky/clouds/gcp.py +12 -7
- sky/clouds/hyperbolic.py +11 -6
- sky/clouds/ibm.py +11 -6
- sky/clouds/kubernetes.py +7 -3
- sky/clouds/lambda_cloud.py +11 -6
- sky/clouds/nebius.py +14 -12
- sky/clouds/oci.py +12 -7
- sky/clouds/paperspace.py +12 -7
- sky/clouds/runpod.py +12 -7
- sky/clouds/scp.py +11 -6
- sky/clouds/vast.py +14 -8
- sky/clouds/vsphere.py +11 -6
- sky/core.py +6 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{1043-734e57d2b27dfe5d.js → 1043-869d9c78bf5dd3df.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{1141-d8c6404a7c6fffe6.js → 1141-e49a159c30a6c4a7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +30 -0
- sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +6 -0
- sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +15 -0
- sky/dashboard/out/_next/static/chunks/{2641.35edc9ccaeaad9e3.js → 2641.74c19c4d45a2c034.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{4725.4c849b1e05c8e9ad.js → 4725.66125dcd9832aa5d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +16 -0
- sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +15 -0
- sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +1 -0
- sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +55 -0
- sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +1 -0
- sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +41 -0
- sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +6 -0
- sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +1 -0
- sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +30 -0
- sky/dashboard/out/_next/static/chunks/{9984.2b5e3fa69171bff9.js → 9984.0460de9d3adf5582.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa406155b4223d0d.js → [job]-2186770cc2de1623.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0c37ee1ac5f3474d.js → [cluster]-95afb019ab85801c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-c5b357bfd9502fbe.js → [job]-dc0299ffefebcdbe.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-19e98664bdd61643.js → users-6790fcefd5487b13.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +1 -0
- sky/dashboard/out/_next/static/css/b3227360726f12eb.css +3 -0
- sky/dashboard/out/_next/static/mym3Ciwp-zqU7ZpOLGnrW/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +93 -32
- sky/global_user_state.py +12 -143
- sky/jobs/state.py +9 -88
- sky/jobs/utils.py +28 -13
- sky/provision/nebius/utils.py +3 -6
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/serve/client/sdk.py +6 -2
- sky/serve/controller.py +7 -3
- sky/serve/serve_state.py +1 -1
- sky/serve/serve_utils.py +171 -75
- sky/serve/server/core.py +17 -6
- sky/server/common.py +4 -3
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/requests.py +1 -1
- sky/setup_files/MANIFEST.in +2 -0
- sky/setup_files/alembic.ini +148 -0
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/configs.py +1 -1
- sky/skylet/constants.py +4 -0
- sky/skylet/job_lib.py +1 -1
- sky/skypilot_config.py +1 -1
- sky/users/permission.py +1 -1
- sky/utils/common_utils.py +85 -3
- sky/utils/config_utils.py +15 -0
- sky/utils/db/__init__.py +0 -0
- sky/utils/{db_utils.py → db/db_utils.py} +59 -0
- sky/utils/db/migration_utils.py +93 -0
- sky/utils/locks.py +319 -0
- sky/utils/schemas.py +38 -34
- sky/utils/timeline.py +41 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/RECORD +134 -125
- sky/dashboard/out/_next/static/FUjweqdImyeYhMYFON-Se/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1746.27d40aedc22bd2d6.js +0 -60
- sky/dashboard/out/_next/static/chunks/1871-76491ac174a95278.js +0 -6
- sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +0 -1
- sky/dashboard/out/_next/static/chunks/2875.c24c6d57dc82e436.js +0 -25
- sky/dashboard/out/_next/static/chunks/3785.95b94f18aaec7233.js +0 -1
- sky/dashboard/out/_next/static/chunks/3947-b059261d6fa88a1f.js +0 -35
- sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +0 -16
- sky/dashboard/out/_next/static/chunks/5491.918ffed0ba7a5294.js +0 -20
- sky/dashboard/out/_next/static/chunks/6990-dcb411b566e64cde.js +0 -1
- sky/dashboard/out/_next/static/chunks/804-9f5e98ce84d46bdd.js +0 -21
- sky/dashboard/out/_next/static/chunks/9025.133e9ba5c780afeb.js +0 -6
- sky/dashboard/out/_next/static/chunks/938-6a9ffdaa21eee969.js +0 -1
- sky/dashboard/out/_next/static/chunks/9470-b6f6a35283863a6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.46e613d000c55859.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-771a40cde532309b.js +0 -20
- sky/dashboard/out/_next/static/chunks/pages/clusters-102d169e87913ba1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-a1e43d9ef51a9cea.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-6b0575ea521af4f3.js +0 -1
- sky/dashboard/out/_next/static/css/219887b94512388c.css +0 -3
- /sky/dashboard/out/_next/static/{FUjweqdImyeYhMYFON-Se → mym3Ciwp-zqU7ZpOLGnrW}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/top_level.txt +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-a305898dc479711e.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-da491665d4289aae.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js" defer=""></script><script src="/dashboard/_next/static/mym3Ciwp-zqU7ZpOLGnrW/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/mym3Ciwp-zqU7ZpOLGnrW/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"mym3Ciwp-zqU7ZpOLGnrW","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/data/mounting_utils.py
CHANGED
|
@@ -39,19 +39,32 @@ _GOOFYS_WRAPPER = ('$(if [ -S /dev/log ] ; then '
|
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
def get_s3_mount_install_cmd() -> str:
|
|
42
|
-
"""Returns
|
|
42
|
+
"""Returns command for basic S3 mounting (goofys by default, rclone for
|
|
43
|
+
ARM64)."""
|
|
43
44
|
# TODO(aylei): maintain our goofys fork under skypilot-org
|
|
44
|
-
install_cmd = (
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
45
|
+
install_cmd = (
|
|
46
|
+
'ARCH=$(uname -m) && '
|
|
47
|
+
'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
|
|
48
|
+
# Use rclone for ARM64 since goofys doesn't support it
|
|
49
|
+
# Extract core rclone installation logic without redundant ARCH check
|
|
50
|
+
' ARCH_SUFFIX="arm" && '
|
|
51
|
+
f' (which dpkg > /dev/null 2>&1 && (which rclone > /dev/null || '
|
|
52
|
+
f'(cd ~ > /dev/null && curl -O https://downloads.rclone.org/'
|
|
53
|
+
f'{RCLONE_VERSION}/rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb '
|
|
54
|
+
f'&& sudo dpkg -i rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb '
|
|
55
|
+
f'&& rm -f rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.deb))) || '
|
|
56
|
+
f'(which rclone > /dev/null || (cd ~ > /dev/null && curl -O '
|
|
57
|
+
f'https://downloads.rclone.org/{RCLONE_VERSION}/'
|
|
58
|
+
f'rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm && '
|
|
59
|
+
f'sudo yum --nogpgcheck install '
|
|
60
|
+
f'rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm -y && '
|
|
61
|
+
f'rm -f rclone-{RCLONE_VERSION}-linux-${{ARCH_SUFFIX}}.rpm)); '
|
|
62
|
+
'else '
|
|
63
|
+
' sudo wget -nc https://github.com/aylei/goofys/'
|
|
64
|
+
'releases/download/0.24.0-aylei-upstream/goofys '
|
|
65
|
+
'-O /usr/local/bin/goofys && '
|
|
66
|
+
'sudo chmod 755 /usr/local/bin/goofys; '
|
|
67
|
+
'fi')
|
|
55
68
|
return install_cmd
|
|
56
69
|
|
|
57
70
|
|
|
@@ -59,15 +72,30 @@ def get_s3_mount_install_cmd() -> str:
|
|
|
59
72
|
def get_s3_mount_cmd(bucket_name: str,
|
|
60
73
|
mount_path: str,
|
|
61
74
|
_bucket_sub_path: Optional[str] = None) -> str:
|
|
62
|
-
"""Returns a command to mount an S3 bucket
|
|
75
|
+
"""Returns a command to mount an S3 bucket (goofys by default, rclone for
|
|
76
|
+
ARM64)"""
|
|
63
77
|
if _bucket_sub_path is None:
|
|
64
78
|
_bucket_sub_path = ''
|
|
65
79
|
else:
|
|
66
80
|
_bucket_sub_path = f':{_bucket_sub_path}'
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
81
|
+
|
|
82
|
+
# Use rclone for ARM64 architectures since goofys doesn't support them
|
|
83
|
+
arch_check = 'ARCH=$(uname -m) && '
|
|
84
|
+
rclone_mount = (
|
|
85
|
+
f'{FUSERMOUNT3_SOFT_LINK_CMD} && '
|
|
86
|
+
f'rclone mount :s3:{bucket_name}{_bucket_sub_path} {mount_path} '
|
|
87
|
+
'--daemon --allow-other')
|
|
88
|
+
goofys_mount = (f'{_GOOFYS_WRAPPER} -o allow_other '
|
|
89
|
+
f'--stat-cache-ttl {_STAT_CACHE_TTL} '
|
|
90
|
+
f'--type-cache-ttl {_TYPE_CACHE_TTL} '
|
|
91
|
+
f'{bucket_name}{_bucket_sub_path} {mount_path}')
|
|
92
|
+
|
|
93
|
+
mount_cmd = (f'{arch_check}'
|
|
94
|
+
f'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
|
|
95
|
+
f' {rclone_mount}; '
|
|
96
|
+
f'else '
|
|
97
|
+
f' {goofys_mount}; '
|
|
98
|
+
f'fi')
|
|
71
99
|
return mount_cmd
|
|
72
100
|
|
|
73
101
|
|
|
@@ -76,17 +104,33 @@ def get_nebius_mount_cmd(nebius_profile_name: str,
|
|
|
76
104
|
endpoint_url: str,
|
|
77
105
|
mount_path: str,
|
|
78
106
|
_bucket_sub_path: Optional[str] = None) -> str:
|
|
79
|
-
"""Returns a command to
|
|
107
|
+
"""Returns a command to mount Nebius bucket (goofys by default, rclone for
|
|
108
|
+
ARM64)."""
|
|
80
109
|
if _bucket_sub_path is None:
|
|
81
110
|
_bucket_sub_path = ''
|
|
82
111
|
else:
|
|
83
112
|
_bucket_sub_path = f':{_bucket_sub_path}'
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
113
|
+
|
|
114
|
+
# Use rclone for ARM64 architectures since goofys doesn't support them
|
|
115
|
+
arch_check = 'ARCH=$(uname -m) && '
|
|
116
|
+
rclone_mount = (
|
|
117
|
+
f'{FUSERMOUNT3_SOFT_LINK_CMD} && '
|
|
118
|
+
f'AWS_PROFILE={nebius_profile_name} '
|
|
119
|
+
f'rclone mount :s3:{bucket_name}{_bucket_sub_path} {mount_path} '
|
|
120
|
+
f'--s3-endpoint {endpoint_url} --daemon --allow-other')
|
|
121
|
+
goofys_mount = (f'AWS_PROFILE={nebius_profile_name} {_GOOFYS_WRAPPER} '
|
|
122
|
+
'-o allow_other '
|
|
123
|
+
f'--stat-cache-ttl {_STAT_CACHE_TTL} '
|
|
124
|
+
f'--type-cache-ttl {_TYPE_CACHE_TTL} '
|
|
125
|
+
f'--endpoint {endpoint_url} '
|
|
126
|
+
f'{bucket_name}{_bucket_sub_path} {mount_path}')
|
|
127
|
+
|
|
128
|
+
mount_cmd = (f'{arch_check}'
|
|
129
|
+
f'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
|
|
130
|
+
f' {rclone_mount}; '
|
|
131
|
+
f'else '
|
|
132
|
+
f' {goofys_mount}; '
|
|
133
|
+
f'fi')
|
|
90
134
|
return mount_cmd
|
|
91
135
|
|
|
92
136
|
|
|
@@ -236,18 +280,35 @@ def get_r2_mount_cmd(r2_credentials_path: str,
|
|
|
236
280
|
bucket_name: str,
|
|
237
281
|
mount_path: str,
|
|
238
282
|
_bucket_sub_path: Optional[str] = None) -> str:
|
|
239
|
-
"""Returns a command to
|
|
283
|
+
"""Returns a command to mount R2 bucket (goofys by default, rclone for
|
|
284
|
+
ARM64)."""
|
|
240
285
|
if _bucket_sub_path is None:
|
|
241
286
|
_bucket_sub_path = ''
|
|
242
287
|
else:
|
|
243
288
|
_bucket_sub_path = f':{_bucket_sub_path}'
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
289
|
+
|
|
290
|
+
# Use rclone for ARM64 architectures since goofys doesn't support them
|
|
291
|
+
arch_check = 'ARCH=$(uname -m) && '
|
|
292
|
+
rclone_mount = (
|
|
293
|
+
f'{FUSERMOUNT3_SOFT_LINK_CMD} && '
|
|
294
|
+
f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} '
|
|
295
|
+
f'AWS_PROFILE={r2_profile_name} '
|
|
296
|
+
f'rclone mount :s3:{bucket_name}{_bucket_sub_path} {mount_path} '
|
|
297
|
+
f'--s3-endpoint {endpoint_url} --daemon --allow-other')
|
|
298
|
+
goofys_mount = (f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} '
|
|
299
|
+
f'AWS_PROFILE={r2_profile_name} {_GOOFYS_WRAPPER} '
|
|
300
|
+
'-o allow_other '
|
|
301
|
+
f'--stat-cache-ttl {_STAT_CACHE_TTL} '
|
|
302
|
+
f'--type-cache-ttl {_TYPE_CACHE_TTL} '
|
|
303
|
+
f'--endpoint {endpoint_url} '
|
|
304
|
+
f'{bucket_name}{_bucket_sub_path} {mount_path}')
|
|
305
|
+
|
|
306
|
+
mount_cmd = (f'{arch_check}'
|
|
307
|
+
f'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
|
|
308
|
+
f' {rclone_mount}; '
|
|
309
|
+
f'else '
|
|
310
|
+
f' {goofys_mount}; '
|
|
311
|
+
f'fi')
|
|
251
312
|
return mount_cmd
|
|
252
313
|
|
|
253
314
|
|
sky/global_user_state.py
CHANGED
|
@@ -12,7 +12,6 @@ import os
|
|
|
12
12
|
import pathlib
|
|
13
13
|
import pickle
|
|
14
14
|
import re
|
|
15
|
-
import threading
|
|
16
15
|
import time
|
|
17
16
|
import typing
|
|
18
17
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
@@ -32,9 +31,10 @@ from sky import skypilot_config
|
|
|
32
31
|
from sky.skylet import constants
|
|
33
32
|
from sky.utils import common_utils
|
|
34
33
|
from sky.utils import context_utils
|
|
35
|
-
from sky.utils import db_utils
|
|
36
34
|
from sky.utils import registry
|
|
37
35
|
from sky.utils import status_lib
|
|
36
|
+
from sky.utils.db import db_utils
|
|
37
|
+
from sky.utils.db import migration_utils
|
|
38
38
|
|
|
39
39
|
if typing.TYPE_CHECKING:
|
|
40
40
|
from sky import backends
|
|
@@ -48,7 +48,6 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
|
|
|
48
48
|
_ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
|
|
49
49
|
|
|
50
50
|
_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
|
|
51
|
-
_DB_INIT_LOCK = threading.Lock()
|
|
52
51
|
|
|
53
52
|
Base = declarative.declarative_base()
|
|
54
53
|
|
|
@@ -238,152 +237,20 @@ def create_table(engine: sqlalchemy.engine.Engine):
|
|
|
238
237
|
# If the database is locked, it is OK to continue, as the WAL mode
|
|
239
238
|
# is not critical and is likely to be enabled by other processes.
|
|
240
239
|
|
|
241
|
-
#
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
#
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
# Add autostop column to clusters table
|
|
249
|
-
db_utils.add_column_to_table_sqlalchemy(session,
|
|
250
|
-
'clusters',
|
|
251
|
-
'autostop',
|
|
252
|
-
sqlalchemy.Integer(),
|
|
253
|
-
default_statement='DEFAULT -1')
|
|
254
|
-
|
|
255
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
256
|
-
session,
|
|
257
|
-
'clusters',
|
|
258
|
-
'metadata',
|
|
259
|
-
sqlalchemy.Text(),
|
|
260
|
-
default_statement='DEFAULT \'{}\'')
|
|
261
|
-
|
|
262
|
-
db_utils.add_column_to_table_sqlalchemy(session,
|
|
263
|
-
'clusters',
|
|
264
|
-
'to_down',
|
|
265
|
-
sqlalchemy.Integer(),
|
|
266
|
-
default_statement='DEFAULT 0')
|
|
267
|
-
|
|
268
|
-
# The cloud identity that created the cluster.
|
|
269
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
270
|
-
session,
|
|
271
|
-
'clusters',
|
|
272
|
-
'owner',
|
|
273
|
-
sqlalchemy.Text(),
|
|
274
|
-
default_statement='DEFAULT NULL')
|
|
275
|
-
|
|
276
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
277
|
-
session,
|
|
278
|
-
'clusters',
|
|
279
|
-
'cluster_hash',
|
|
280
|
-
sqlalchemy.Text(),
|
|
281
|
-
default_statement='DEFAULT NULL')
|
|
282
|
-
|
|
283
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
284
|
-
session,
|
|
285
|
-
'clusters',
|
|
286
|
-
'storage_mounts_metadata',
|
|
287
|
-
sqlalchemy.LargeBinary(),
|
|
288
|
-
default_statement='DEFAULT NULL')
|
|
289
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
290
|
-
session,
|
|
291
|
-
'clusters',
|
|
292
|
-
'cluster_ever_up',
|
|
293
|
-
sqlalchemy.Integer(),
|
|
294
|
-
default_statement='DEFAULT 0',
|
|
295
|
-
# Set the value to 1 so that all the existing clusters before #2977
|
|
296
|
-
# are considered as ever up, i.e:
|
|
297
|
-
# existing cluster's default (null) -> 1;
|
|
298
|
-
# new cluster's default -> 0;
|
|
299
|
-
# This is conservative for the existing clusters: even if some INIT
|
|
300
|
-
# clusters were never really UP, setting it to 1 means they won't be
|
|
301
|
-
# auto-deleted during any failover.
|
|
302
|
-
value_to_replace_existing_entries=1)
|
|
303
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
304
|
-
session,
|
|
305
|
-
'clusters',
|
|
306
|
-
'status_updated_at',
|
|
307
|
-
sqlalchemy.Integer(),
|
|
308
|
-
default_statement='DEFAULT NULL')
|
|
309
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
310
|
-
session,
|
|
311
|
-
'clusters',
|
|
312
|
-
'user_hash',
|
|
313
|
-
sqlalchemy.Text(),
|
|
314
|
-
default_statement='DEFAULT NULL',
|
|
315
|
-
value_to_replace_existing_entries=common_utils.get_current_user(
|
|
316
|
-
).id)
|
|
317
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
318
|
-
session,
|
|
319
|
-
'clusters',
|
|
320
|
-
'config_hash',
|
|
321
|
-
sqlalchemy.Text(),
|
|
322
|
-
default_statement='DEFAULT NULL')
|
|
323
|
-
|
|
324
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
325
|
-
session,
|
|
326
|
-
'cluster_history',
|
|
327
|
-
'user_hash',
|
|
328
|
-
sqlalchemy.Text(),
|
|
329
|
-
default_statement='DEFAULT NULL')
|
|
330
|
-
|
|
331
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
332
|
-
session,
|
|
333
|
-
'clusters',
|
|
334
|
-
'workspace',
|
|
335
|
-
sqlalchemy.Text(),
|
|
336
|
-
default_statement='DEFAULT \'default\'',
|
|
337
|
-
value_to_replace_existing_entries=constants.
|
|
338
|
-
SKYPILOT_DEFAULT_WORKSPACE)
|
|
339
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
340
|
-
session,
|
|
341
|
-
'clusters',
|
|
342
|
-
'last_creation_yaml',
|
|
343
|
-
sqlalchemy.Text(),
|
|
344
|
-
default_statement='DEFAULT NULL',
|
|
345
|
-
)
|
|
346
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
347
|
-
session,
|
|
348
|
-
'clusters',
|
|
349
|
-
'last_creation_command',
|
|
350
|
-
sqlalchemy.Text(),
|
|
351
|
-
default_statement='DEFAULT NULL')
|
|
352
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
353
|
-
session,
|
|
354
|
-
'users',
|
|
355
|
-
'password',
|
|
356
|
-
sqlalchemy.Text(),
|
|
357
|
-
default_statement='DEFAULT NULL')
|
|
358
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
359
|
-
session,
|
|
360
|
-
'users',
|
|
361
|
-
'created_at',
|
|
362
|
-
sqlalchemy.Integer(),
|
|
363
|
-
default_statement='DEFAULT NULL')
|
|
364
|
-
|
|
365
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
366
|
-
session,
|
|
367
|
-
'cluster_history',
|
|
368
|
-
'last_creation_yaml',
|
|
369
|
-
sqlalchemy.Text(),
|
|
370
|
-
default_statement='DEFAULT NULL')
|
|
371
|
-
|
|
372
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
373
|
-
session,
|
|
374
|
-
'cluster_history',
|
|
375
|
-
'last_creation_command',
|
|
376
|
-
sqlalchemy.Text(),
|
|
377
|
-
default_statement='DEFAULT NULL')
|
|
378
|
-
|
|
379
|
-
session.commit()
|
|
240
|
+
# Get alembic config for state db and run migrations
|
|
241
|
+
alembic_config = migration_utils.get_alembic_config(
|
|
242
|
+
engine, migration_utils.GLOBAL_USER_STATE_DB_NAME)
|
|
243
|
+
# pylint: disable=line-too-long
|
|
244
|
+
alembic_config.config_ini_section = migration_utils.GLOBAL_USER_STATE_DB_NAME
|
|
245
|
+
migration_utils.safe_alembic_upgrade(
|
|
246
|
+
engine, alembic_config, migration_utils.GLOBAL_USER_STATE_VERSION)
|
|
380
247
|
|
|
381
248
|
|
|
382
249
|
def initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
383
250
|
global _SQLALCHEMY_ENGINE
|
|
384
251
|
if _SQLALCHEMY_ENGINE is not None:
|
|
385
252
|
return _SQLALCHEMY_ENGINE
|
|
386
|
-
with
|
|
253
|
+
with migration_utils.db_lock(migration_utils.GLOBAL_USER_STATE_DB_NAME):
|
|
387
254
|
if _SQLALCHEMY_ENGINE is None:
|
|
388
255
|
conn_string = None
|
|
389
256
|
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
|
@@ -520,6 +387,7 @@ def get_user(user_id: str) -> Optional[models.User]:
|
|
|
520
387
|
created_at=row.created_at)
|
|
521
388
|
|
|
522
389
|
|
|
390
|
+
@_init_db
|
|
523
391
|
def get_user_by_name(username: str) -> List[models.User]:
|
|
524
392
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
525
393
|
rows = session.query(user_table).filter_by(name=username).all()
|
|
@@ -533,6 +401,7 @@ def get_user_by_name(username: str) -> List[models.User]:
|
|
|
533
401
|
]
|
|
534
402
|
|
|
535
403
|
|
|
404
|
+
@_init_db
|
|
536
405
|
def delete_user(user_id: str) -> None:
|
|
537
406
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
538
407
|
session.query(user_table).filter_by(id=user_id).delete()
|
sky/jobs/state.py
CHANGED
|
@@ -6,7 +6,6 @@ import functools
|
|
|
6
6
|
import json
|
|
7
7
|
import os
|
|
8
8
|
import pathlib
|
|
9
|
-
import threading
|
|
10
9
|
import time
|
|
11
10
|
import typing
|
|
12
11
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
@@ -24,7 +23,8 @@ from sky import sky_logging
|
|
|
24
23
|
from sky import skypilot_config
|
|
25
24
|
from sky.skylet import constants
|
|
26
25
|
from sky.utils import common_utils
|
|
27
|
-
from sky.utils import db_utils
|
|
26
|
+
from sky.utils.db import db_utils
|
|
27
|
+
from sky.utils.db import migration_utils
|
|
28
28
|
|
|
29
29
|
if typing.TYPE_CHECKING:
|
|
30
30
|
from sqlalchemy.engine import row
|
|
@@ -36,7 +36,6 @@ CallbackType = Callable[[str], None]
|
|
|
36
36
|
logger = sky_logging.init_logger(__name__)
|
|
37
37
|
|
|
38
38
|
_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
|
|
39
|
-
_DB_INIT_LOCK = threading.Lock()
|
|
40
39
|
|
|
41
40
|
Base = declarative.declarative_base()
|
|
42
41
|
|
|
@@ -130,97 +129,19 @@ def create_table(engine: sqlalchemy.engine.Engine):
|
|
|
130
129
|
# If the database is locked, it is OK to continue, as the WAL mode
|
|
131
130
|
# is not critical and is likely to be enabled by other processes.
|
|
132
131
|
|
|
133
|
-
#
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
'failure_reason',
|
|
140
|
-
sqlalchemy.Text())
|
|
141
|
-
db_utils.add_column_to_table_sqlalchemy(session,
|
|
142
|
-
'spot',
|
|
143
|
-
'spot_job_id',
|
|
144
|
-
sqlalchemy.Integer(),
|
|
145
|
-
copy_from='job_id')
|
|
146
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
147
|
-
session,
|
|
148
|
-
'spot',
|
|
149
|
-
'task_id',
|
|
150
|
-
sqlalchemy.Integer(),
|
|
151
|
-
default_statement='DEFAULT 0',
|
|
152
|
-
value_to_replace_existing_entries=0)
|
|
153
|
-
db_utils.add_column_to_table_sqlalchemy(session,
|
|
154
|
-
'spot',
|
|
155
|
-
'task_name',
|
|
156
|
-
sqlalchemy.Text(),
|
|
157
|
-
copy_from='job_name')
|
|
158
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
159
|
-
session,
|
|
160
|
-
'spot',
|
|
161
|
-
'specs',
|
|
162
|
-
sqlalchemy.Text(),
|
|
163
|
-
value_to_replace_existing_entries=json.dumps({
|
|
164
|
-
'max_restarts_on_errors': 0,
|
|
165
|
-
}))
|
|
166
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
167
|
-
session,
|
|
168
|
-
'spot',
|
|
169
|
-
'local_log_file',
|
|
170
|
-
sqlalchemy.Text(),
|
|
171
|
-
default_statement='DEFAULT NULL')
|
|
172
|
-
|
|
173
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
174
|
-
session,
|
|
175
|
-
'spot',
|
|
176
|
-
'metadata',
|
|
177
|
-
sqlalchemy.Text(),
|
|
178
|
-
default_statement='DEFAULT \'{}\'',
|
|
179
|
-
value_to_replace_existing_entries='{}')
|
|
180
|
-
|
|
181
|
-
db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
|
|
182
|
-
'schedule_state',
|
|
183
|
-
sqlalchemy.Text())
|
|
184
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
185
|
-
session,
|
|
186
|
-
'job_info',
|
|
187
|
-
'controller_pid',
|
|
188
|
-
sqlalchemy.Integer(),
|
|
189
|
-
default_statement='DEFAULT NULL')
|
|
190
|
-
db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
|
|
191
|
-
'dag_yaml_path',
|
|
192
|
-
sqlalchemy.Text())
|
|
193
|
-
db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
|
|
194
|
-
'env_file_path',
|
|
195
|
-
sqlalchemy.Text())
|
|
196
|
-
db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
|
|
197
|
-
'user_hash', sqlalchemy.Text())
|
|
198
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
199
|
-
session,
|
|
200
|
-
'job_info',
|
|
201
|
-
'workspace',
|
|
202
|
-
sqlalchemy.Text(),
|
|
203
|
-
default_statement='DEFAULT NULL',
|
|
204
|
-
value_to_replace_existing_entries='default')
|
|
205
|
-
db_utils.add_column_to_table_sqlalchemy(
|
|
206
|
-
session,
|
|
207
|
-
'job_info',
|
|
208
|
-
'priority',
|
|
209
|
-
sqlalchemy.Integer(),
|
|
210
|
-
value_to_replace_existing_entries=constants.DEFAULT_PRIORITY)
|
|
211
|
-
db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
|
|
212
|
-
'entrypoint', sqlalchemy.Text())
|
|
213
|
-
db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
|
|
214
|
-
'original_user_yaml_path',
|
|
215
|
-
sqlalchemy.Text())
|
|
216
|
-
session.commit()
|
|
132
|
+
# Get alembic config for spot jobs db and run migrations
|
|
133
|
+
alembic_config = migration_utils.get_alembic_config(
|
|
134
|
+
engine, migration_utils.SPOT_JOBS_DB_NAME)
|
|
135
|
+
alembic_config.config_ini_section = migration_utils.SPOT_JOBS_DB_NAME
|
|
136
|
+
migration_utils.safe_alembic_upgrade(engine, alembic_config,
|
|
137
|
+
migration_utils.SPOT_JOBS_VERSION)
|
|
217
138
|
|
|
218
139
|
|
|
219
140
|
def initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
220
141
|
global _SQLALCHEMY_ENGINE
|
|
221
142
|
if _SQLALCHEMY_ENGINE is not None:
|
|
222
143
|
return _SQLALCHEMY_ENGINE
|
|
223
|
-
with
|
|
144
|
+
with migration_utils.db_lock(migration_utils.SPOT_JOBS_DB_NAME):
|
|
224
145
|
if _SQLALCHEMY_ENGINE is None:
|
|
225
146
|
conn_string = None
|
|
226
147
|
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
sky/jobs/utils.py
CHANGED
|
@@ -67,6 +67,9 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
|
|
|
67
67
|
|
|
68
68
|
_LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
|
|
69
69
|
|
|
70
|
+
_JOB_STATUS_FETCH_MAX_RETRIES = 3
|
|
71
|
+
_JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
|
|
72
|
+
|
|
70
73
|
_JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
|
|
71
74
|
'Waiting for task to start[/]'
|
|
72
75
|
'{status_str}. It may take a few minutes.\n'
|
|
@@ -250,19 +253,31 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
|
|
|
250
253
|
logger.info(f'Cluster {cluster_name} not found.')
|
|
251
254
|
return None
|
|
252
255
|
assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
256
|
+
for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
|
|
257
|
+
try:
|
|
258
|
+
logger.info('=== Checking the job status... ===')
|
|
259
|
+
statuses = backend.get_job_status(handle, stream_logs=False)
|
|
260
|
+
status = list(statuses.values())[0]
|
|
261
|
+
if status is None:
|
|
262
|
+
logger.info('No job found.')
|
|
263
|
+
else:
|
|
264
|
+
logger.info(f'Job status: {status}')
|
|
265
|
+
logger.info('=' * 34)
|
|
266
|
+
return status
|
|
267
|
+
except exceptions.CommandError as e:
|
|
268
|
+
# Retry on k8s transient network errors. This is useful when using
|
|
269
|
+
# coreweave which may have transient network issue sometimes.
|
|
270
|
+
if (e.detailed_reason is not None and
|
|
271
|
+
_JOB_K8S_TRANSIENT_NW_MSG in e.detailed_reason):
|
|
272
|
+
logger.info('Failed to connect to the cluster. Retrying '
|
|
273
|
+
f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
|
|
274
|
+
logger.info('=' * 34)
|
|
275
|
+
time.sleep(1)
|
|
276
|
+
else:
|
|
277
|
+
logger.info(f'Failed to get job status: {e.detailed_reason}')
|
|
278
|
+
logger.info('=' * 34)
|
|
279
|
+
return None
|
|
280
|
+
return None
|
|
266
281
|
|
|
267
282
|
|
|
268
283
|
def _controller_process_alive(pid: int, job_id: int) -> bool:
|
sky/provision/nebius/utils.py
CHANGED
|
@@ -41,10 +41,7 @@ def get_project_by_region(region: str) -> str:
|
|
|
41
41
|
|
|
42
42
|
# Check is there project if in config
|
|
43
43
|
project_id = skypilot_config.get_effective_region_config(
|
|
44
|
-
cloud='nebius',
|
|
45
|
-
region=None,
|
|
46
|
-
keys=(region, 'project_id'),
|
|
47
|
-
default_value=None)
|
|
44
|
+
cloud='nebius', region=region, keys=('project_id',), default_value=None)
|
|
48
45
|
if project_id is not None:
|
|
49
46
|
return project_id
|
|
50
47
|
for project in projects.items:
|
|
@@ -189,8 +186,8 @@ def launch(cluster_name_on_cloud: str,
|
|
|
189
186
|
if preset == '8gpu-128vcpu-1600gb':
|
|
190
187
|
fabric = skypilot_config.get_effective_region_config(
|
|
191
188
|
cloud='nebius',
|
|
192
|
-
region=
|
|
193
|
-
keys=(
|
|
189
|
+
region=region,
|
|
190
|
+
keys=('fabric',),
|
|
194
191
|
default_value=None)
|
|
195
192
|
|
|
196
193
|
# Auto-select fabric if network_tier=best and no fabric configured
|
sky/schemas/db/README
ADDED