skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +26 -11
- sky/backends/cloud_vm_ray_backend.py +16 -5
- sky/client/cli/command.py +222 -4
- sky/client/sdk.py +110 -82
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +26 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +1 -0
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
- sky/dashboard/out/_next/static/chunks/{37-4650f214e2119168.js → 37-1f1e94f5a561202a.js} +2 -2
- sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/{856-bfddc18e16f3873c.js → 856-cdf66268ec878d0c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-0ef7418d1a3822f3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-ecc5a7003776cfa7.js → [name]-0b4c662a25e4747a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
- sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +15 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +129 -0
- sky/jobs/client/sdk.py +13 -11
- sky/jobs/server/core.py +4 -0
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +52 -2
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +16 -0
- sky/server/requests/requests.py +35 -1
- sky/server/rest.py +152 -0
- sky/server/server.py +66 -16
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +14 -3
- sky/task.py +141 -18
- sky/templates/kubernetes-ray.yml.j2 +30 -1
- sky/users/permission.py +2 -0
- sky/utils/context.py +3 -1
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +146 -3
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +123 -108
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-7e9736af1c6345a6.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
- sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
- sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-bde186946d353355.js → 843-07d25a7e64462fd8.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-56412c7976b4655b.js → 973-5b5019ba333e8d62.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b23cb0257bf96c51.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b23cb0257bf96c51.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6133dc1e928bd0b5.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-0ef7418d1a3822f3.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js" defer=""></script><script src="/dashboard/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"ZWdSYkqVe3WjnFR8ocqoG","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b23cb0257bf96c51.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b23cb0257bf96c51.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6133dc1e928bd0b5.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-0ef7418d1a3822f3.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/230-d6e363362017ff3a.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/947-6620842ef80ae879.js" defer=""></script><script src="/dashboard/_next/static/chunks/989-db34c16ad7ea6155.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-92dd1614396389be.js" defer=""></script><script src="/dashboard/_next/static/chunks/66-66ae330df2d3c1c7.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-d3a0b53f728d280a.js" defer=""></script><script src="/dashboard/_next/static/chunks/856-cdf66268ec878d0c.js" defer=""></script><script src="/dashboard/_next/static/chunks/973-5b5019ba333e8d62.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/938-068520cc11738deb.js" defer=""></script><script src="/dashboard/_next/static/chunks/843-07d25a7e64462fd8.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-0b4c662a25e4747a.js" defer=""></script><script src="/dashboard/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"ZWdSYkqVe3WjnFR8ocqoG","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b23cb0257bf96c51.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b23cb0257bf96c51.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6133dc1e928bd0b5.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-0ef7418d1a3822f3.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-862b120406461b10.js" defer=""></script><script src="/dashboard/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"ZWdSYkqVe3WjnFR8ocqoG","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/data/storage_utils.py
CHANGED
@@ -22,8 +22,6 @@ logger = sky_logging.init_logger(__name__)
|
|
22
22
|
_USE_SKYIGNORE_HINT = (
|
23
23
|
'To avoid using .gitignore, you can create a .skyignore file instead.')
|
24
24
|
|
25
|
-
_LAST_USE_TRUNC_LENGTH = 25
|
26
|
-
|
27
25
|
|
28
26
|
def format_storage_table(storages: List[Dict[str, Any]],
|
29
27
|
show_all: bool = False) -> str:
|
@@ -48,8 +46,8 @@ def format_storage_table(storages: List[Dict[str, Any]],
|
|
48
46
|
if show_all:
|
49
47
|
command = row['last_use']
|
50
48
|
else:
|
51
|
-
command = common_utils.truncate_long_string(
|
52
|
-
|
49
|
+
command = common_utils.truncate_long_string(
|
50
|
+
row['last_use'], constants.LAST_USE_TRUNC_LENGTH)
|
53
51
|
storage_table.add_row([
|
54
52
|
# NAME
|
55
53
|
row['name'],
|
sky/exceptions.py
CHANGED
@@ -614,6 +614,21 @@ class PermissionDeniedError(Exception):
|
|
614
614
|
pass
|
615
615
|
|
616
616
|
|
617
|
+
class VolumeNotFoundError(Exception):
|
618
|
+
"""Raised when a volume is not found."""
|
619
|
+
pass
|
620
|
+
|
621
|
+
|
622
|
+
class VolumeTopologyConflictError(Exception):
|
623
|
+
"""Raised when there is a conflict between the volumes and compute topology."""
|
624
|
+
pass
|
625
|
+
|
626
|
+
|
627
|
+
class ServerTemporarilyUnavailableError(Exception):
|
628
|
+
"""Raised when the server is temporarily unavailable."""
|
629
|
+
pass
|
630
|
+
|
631
|
+
|
617
632
|
class RestfulPolicyError(Exception):
|
618
633
|
"""Raised when failed to call a RESTful policy."""
|
619
634
|
pass
|
sky/execution.py
CHANGED
@@ -171,6 +171,11 @@ def _execute(
|
|
171
171
|
if dryrun.
|
172
172
|
"""
|
173
173
|
dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
|
174
|
+
dag.resolve_and_validate_volumes()
|
175
|
+
if (not _is_launched_by_jobs_controller and
|
176
|
+
not _is_launched_by_sky_serve_controller):
|
177
|
+
# Only process pre-mount operations on API server.
|
178
|
+
dag.pre_mount_volumes()
|
174
179
|
for task in dag.tasks:
|
175
180
|
if task.storage_mounts is not None:
|
176
181
|
for storage in task.storage_mounts.values():
|
sky/global_user_state.py
CHANGED
@@ -111,6 +111,23 @@ storage_table = sqlalchemy.Table(
|
|
111
111
|
sqlalchemy.Column('status', sqlalchemy.Text),
|
112
112
|
)
|
113
113
|
|
114
|
+
volume_table = sqlalchemy.Table(
|
115
|
+
'volumes',
|
116
|
+
Base.metadata,
|
117
|
+
sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
|
118
|
+
sqlalchemy.Column('launched_at', sqlalchemy.Integer),
|
119
|
+
sqlalchemy.Column('handle', sqlalchemy.LargeBinary),
|
120
|
+
sqlalchemy.Column('user_hash', sqlalchemy.Text, server_default=None),
|
121
|
+
sqlalchemy.Column('workspace',
|
122
|
+
sqlalchemy.Text,
|
123
|
+
server_default=constants.SKYPILOT_DEFAULT_WORKSPACE),
|
124
|
+
sqlalchemy.Column('last_attached_at',
|
125
|
+
sqlalchemy.Integer,
|
126
|
+
server_default=None),
|
127
|
+
sqlalchemy.Column('last_use', sqlalchemy.Text),
|
128
|
+
sqlalchemy.Column('status', sqlalchemy.Text),
|
129
|
+
)
|
130
|
+
|
114
131
|
# Table for Cluster History
|
115
132
|
# usage_intervals: List[Tuple[int, int]]
|
116
133
|
# Specifies start and end timestamps of cluster.
|
@@ -1426,6 +1443,118 @@ def get_storage() -> List[Dict[str, Any]]:
|
|
1426
1443
|
return records
|
1427
1444
|
|
1428
1445
|
|
1446
|
+
@_init_db
|
1447
|
+
def get_volume_names_start_with(starts_with: str) -> List[str]:
|
1448
|
+
assert _SQLALCHEMY_ENGINE is not None
|
1449
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
1450
|
+
rows = session.query(volume_table).filter(
|
1451
|
+
volume_table.c.name.like(f'{starts_with}%')).all()
|
1452
|
+
return [row.name for row in rows]
|
1453
|
+
|
1454
|
+
|
1455
|
+
@_init_db
|
1456
|
+
def get_volumes() -> List[Dict[str, Any]]:
|
1457
|
+
assert _SQLALCHEMY_ENGINE is not None
|
1458
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
1459
|
+
rows = session.query(volume_table).all()
|
1460
|
+
records = []
|
1461
|
+
for row in rows:
|
1462
|
+
records.append({
|
1463
|
+
'name': row.name,
|
1464
|
+
'launched_at': row.launched_at,
|
1465
|
+
'handle': pickle.loads(row.handle),
|
1466
|
+
'user_hash': row.user_hash,
|
1467
|
+
'workspace': row.workspace,
|
1468
|
+
'last_attached_at': row.last_attached_at,
|
1469
|
+
'last_use': row.last_use,
|
1470
|
+
'status': status_lib.VolumeStatus[row.status],
|
1471
|
+
})
|
1472
|
+
return records
|
1473
|
+
|
1474
|
+
|
1475
|
+
@_init_db
|
1476
|
+
def get_volume_by_name(name: str) -> Optional[Dict[str, Any]]:
|
1477
|
+
assert _SQLALCHEMY_ENGINE is not None
|
1478
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
1479
|
+
row = session.query(volume_table).filter_by(name=name).first()
|
1480
|
+
if row:
|
1481
|
+
return {
|
1482
|
+
'name': row.name,
|
1483
|
+
'launched_at': row.launched_at,
|
1484
|
+
'handle': pickle.loads(row.handle),
|
1485
|
+
'user_hash': row.user_hash,
|
1486
|
+
'workspace': row.workspace,
|
1487
|
+
'last_attached_at': row.last_attached_at,
|
1488
|
+
'last_use': row.last_use,
|
1489
|
+
'status': status_lib.VolumeStatus[row.status],
|
1490
|
+
}
|
1491
|
+
return None
|
1492
|
+
|
1493
|
+
|
1494
|
+
@_init_db
|
1495
|
+
def add_volume(name: str, config: models.VolumeConfig,
|
1496
|
+
status: status_lib.VolumeStatus) -> None:
|
1497
|
+
assert _SQLALCHEMY_ENGINE is not None
|
1498
|
+
volume_launched_at = int(time.time())
|
1499
|
+
handle = pickle.dumps(config)
|
1500
|
+
last_use = common_utils.get_current_command()
|
1501
|
+
user_hash = common_utils.get_current_user().id
|
1502
|
+
active_workspace = skypilot_config.get_active_workspace()
|
1503
|
+
|
1504
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
1505
|
+
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
1506
|
+
db_utils.SQLAlchemyDialect.SQLITE.value):
|
1507
|
+
insert_func = sqlite.insert
|
1508
|
+
elif (_SQLALCHEMY_ENGINE.dialect.name ==
|
1509
|
+
db_utils.SQLAlchemyDialect.POSTGRESQL.value):
|
1510
|
+
insert_func = postgresql.insert
|
1511
|
+
else:
|
1512
|
+
raise ValueError('Unsupported database dialect')
|
1513
|
+
insert_stmnt = insert_func(volume_table).values(
|
1514
|
+
name=name,
|
1515
|
+
launched_at=volume_launched_at,
|
1516
|
+
handle=handle,
|
1517
|
+
user_hash=user_hash,
|
1518
|
+
workspace=active_workspace,
|
1519
|
+
last_attached_at=None,
|
1520
|
+
last_use=last_use,
|
1521
|
+
status=status.value,
|
1522
|
+
)
|
1523
|
+
do_update_stmt = insert_stmnt.on_conflict_do_nothing()
|
1524
|
+
session.execute(do_update_stmt)
|
1525
|
+
session.commit()
|
1526
|
+
|
1527
|
+
|
1528
|
+
@_init_db
|
1529
|
+
def update_volume(name: str, last_attached_at: int,
|
1530
|
+
status: status_lib.VolumeStatus) -> None:
|
1531
|
+
assert _SQLALCHEMY_ENGINE is not None
|
1532
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
1533
|
+
session.query(volume_table).filter_by(name=name).update({
|
1534
|
+
volume_table.c.last_attached_at: last_attached_at,
|
1535
|
+
volume_table.c.status: status.value,
|
1536
|
+
})
|
1537
|
+
session.commit()
|
1538
|
+
|
1539
|
+
|
1540
|
+
@_init_db
|
1541
|
+
def update_volume_status(name: str, status: status_lib.VolumeStatus) -> None:
|
1542
|
+
assert _SQLALCHEMY_ENGINE is not None
|
1543
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
1544
|
+
session.query(volume_table).filter_by(name=name).update({
|
1545
|
+
volume_table.c.status: status.value,
|
1546
|
+
})
|
1547
|
+
session.commit()
|
1548
|
+
|
1549
|
+
|
1550
|
+
@_init_db
|
1551
|
+
def delete_volume(name: str) -> None:
|
1552
|
+
assert _SQLALCHEMY_ENGINE is not None
|
1553
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
1554
|
+
session.query(volume_table).filter_by(name=name).delete()
|
1555
|
+
session.commit()
|
1556
|
+
|
1557
|
+
|
1429
1558
|
@_init_db
|
1430
1559
|
def get_ssh_keys(user_hash: str) -> Tuple[str, str, bool]:
|
1431
1560
|
assert _SQLALCHEMY_ENGINE is not None
|
sky/jobs/client/sdk.py
CHANGED
@@ -7,10 +7,10 @@ import webbrowser
|
|
7
7
|
import click
|
8
8
|
|
9
9
|
from sky import sky_logging
|
10
|
-
from sky.adaptors import common as adaptors_common
|
11
10
|
from sky.client import common as client_common
|
12
11
|
from sky.client import sdk
|
13
12
|
from sky.server import common as server_common
|
13
|
+
from sky.server import rest
|
14
14
|
from sky.server.requests import payloads
|
15
15
|
from sky.skylet import constants
|
16
16
|
from sky.usage import usage_lib
|
@@ -22,11 +22,7 @@ from sky.utils import dag_utils
|
|
22
22
|
if typing.TYPE_CHECKING:
|
23
23
|
import io
|
24
24
|
|
25
|
-
import requests
|
26
|
-
|
27
25
|
import sky
|
28
|
-
else:
|
29
|
-
requests = adaptors_common.LazyImport('requests')
|
30
26
|
|
31
27
|
logger = sky_logging.init_logger(__name__)
|
32
28
|
|
@@ -86,7 +82,7 @@ def launch(
|
|
86
82
|
task=dag_str,
|
87
83
|
name=name,
|
88
84
|
)
|
89
|
-
response = requests.post(
|
85
|
+
response = rest.post(
|
90
86
|
f'{server_common.get_server_url()}/jobs/launch',
|
91
87
|
json=json.loads(body.model_dump_json()),
|
92
88
|
timeout=(5, None),
|
@@ -146,7 +142,7 @@ def queue(refresh: bool,
|
|
146
142
|
all_users=all_users,
|
147
143
|
job_ids=job_ids,
|
148
144
|
)
|
149
|
-
response = requests.post(
|
145
|
+
response = rest.post(
|
150
146
|
f'{server_common.get_server_url()}/jobs/queue',
|
151
147
|
json=json.loads(body.model_dump_json()),
|
152
148
|
timeout=(5, None),
|
@@ -186,7 +182,7 @@ def cancel(
|
|
186
182
|
all=all,
|
187
183
|
all_users=all_users,
|
188
184
|
)
|
189
|
-
response = requests.post(
|
185
|
+
response = rest.post(
|
190
186
|
f'{server_common.get_server_url()}/jobs/cancel',
|
191
187
|
json=json.loads(body.model_dump_json()),
|
192
188
|
timeout=(5, None),
|
@@ -197,6 +193,7 @@ def cancel(
|
|
197
193
|
|
198
194
|
@usage_lib.entrypoint
|
199
195
|
@server_common.check_server_healthy_or_start
|
196
|
+
@rest.retry_on_server_unavailable()
|
200
197
|
def tail_logs(name: Optional[str] = None,
|
201
198
|
job_id: Optional[int] = None,
|
202
199
|
follow: bool = True,
|
@@ -236,7 +233,7 @@ def tail_logs(name: Optional[str] = None,
|
|
236
233
|
refresh=refresh,
|
237
234
|
tail=tail,
|
238
235
|
)
|
239
|
-
response = requests.post(
|
236
|
+
response = rest.post(
|
240
237
|
f'{server_common.get_server_url()}/jobs/logs',
|
241
238
|
json=json.loads(body.model_dump_json()),
|
242
239
|
stream=True,
|
@@ -244,7 +241,12 @@ def tail_logs(name: Optional[str] = None,
|
|
244
241
|
cookies=server_common.get_api_cookie_jar(),
|
245
242
|
)
|
246
243
|
request_id = server_common.get_request_id(response)
|
247
|
-
|
244
|
+
# Log request is idempotent when tail is 0, thus can resume previous
|
245
|
+
# streaming point on retry.
|
246
|
+
return sdk.stream_response(request_id=request_id,
|
247
|
+
response=response,
|
248
|
+
output_stream=output_stream,
|
249
|
+
resumable=(tail == 0))
|
248
250
|
|
249
251
|
|
250
252
|
@usage_lib.entrypoint
|
@@ -281,7 +283,7 @@ def download_logs(
|
|
281
283
|
controller=controller,
|
282
284
|
local_dir=local_dir,
|
283
285
|
)
|
284
|
-
response = requests.post(
|
286
|
+
response = rest.post(
|
285
287
|
f'{server_common.get_server_url()}/jobs/download_logs',
|
286
288
|
json=json.loads(body.model_dump_json()),
|
287
289
|
timeout=(5, None),
|
sky/jobs/server/core.py
CHANGED
@@ -145,6 +145,7 @@ def launch(
|
|
145
145
|
entrypoint = task
|
146
146
|
dag_uuid = str(uuid.uuid4().hex[:4])
|
147
147
|
dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
|
148
|
+
dag.resolve_and_validate_volumes()
|
148
149
|
# Always apply the policy again here, even though it might have been applied
|
149
150
|
# in the CLI. This is to ensure that we apply the policy to the final DAG
|
150
151
|
# and get the mutated config.
|
@@ -154,6 +155,9 @@ def launch(
|
|
154
155
|
raise ValueError('Only single-task or chain DAG is '
|
155
156
|
f'allowed for job_launch. Dag: {dag}')
|
156
157
|
dag.validate()
|
158
|
+
# TODO(aylei): use consolidated job controller instead of performing
|
159
|
+
# pre-mount operations when submitting jobs.
|
160
|
+
dag.pre_mount_volumes()
|
157
161
|
|
158
162
|
user_dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
|
159
163
|
|
sky/models.py
CHANGED
@@ -6,6 +6,8 @@ import getpass
|
|
6
6
|
import os
|
7
7
|
from typing import Any, Dict, Optional
|
8
8
|
|
9
|
+
import pydantic
|
10
|
+
|
9
11
|
from sky.skylet import constants
|
10
12
|
from sky.utils import common_utils
|
11
13
|
|
@@ -48,6 +50,8 @@ class KubernetesNodeInfo:
|
|
48
50
|
# Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
|
49
51
|
total: Dict[str, int]
|
50
52
|
free: Dict[str, int]
|
53
|
+
# IP address of the node (external IP preferred, fallback to internal IP)
|
54
|
+
ip_address: Optional[str] = None
|
51
55
|
|
52
56
|
|
53
57
|
@dataclasses.dataclass
|
@@ -76,3 +80,15 @@ class KubernetesNodesInfo:
|
|
76
80
|
},
|
77
81
|
hint=data['hint'],
|
78
82
|
)
|
83
|
+
|
84
|
+
|
85
|
+
class VolumeConfig(pydantic.BaseModel):
|
86
|
+
"""Configuration for creating a volume."""
|
87
|
+
name: str
|
88
|
+
type: str
|
89
|
+
cloud: str
|
90
|
+
region: Optional[str]
|
91
|
+
zone: Optional[str]
|
92
|
+
name_on_cloud: str
|
93
|
+
size: Optional[str]
|
94
|
+
config: Dict[str, Any] = {}
|
sky/provision/__init__.py
CHANGED
@@ -8,6 +8,7 @@ import inspect
|
|
8
8
|
import typing
|
9
9
|
from typing import Any, Dict, List, Optional, Type
|
10
10
|
|
11
|
+
from sky import models
|
11
12
|
from sky import sky_logging
|
12
13
|
# These provision.<cloud> modules should never fail even if underlying cloud SDK
|
13
14
|
# dependencies are not installed. This is ensured by using sky.adaptors inside
|
@@ -103,6 +104,31 @@ def bootstrap_instances(
|
|
103
104
|
raise NotImplementedError
|
104
105
|
|
105
106
|
|
107
|
+
@_route_to_cloud_impl
|
108
|
+
def apply_volume(provider_name: str,
|
109
|
+
config: models.VolumeConfig) -> models.VolumeConfig:
|
110
|
+
"""Create or register a volume.
|
111
|
+
|
112
|
+
This function creates or registers a volume with the provided configuration,
|
113
|
+
and returns a VolumeConfig object with updated configuration.
|
114
|
+
"""
|
115
|
+
raise NotImplementedError
|
116
|
+
|
117
|
+
|
118
|
+
@_route_to_cloud_impl
|
119
|
+
def delete_volume(provider_name: str,
|
120
|
+
config: models.VolumeConfig) -> models.VolumeConfig:
|
121
|
+
"""Delete a volume."""
|
122
|
+
raise NotImplementedError
|
123
|
+
|
124
|
+
|
125
|
+
@_route_to_cloud_impl
|
126
|
+
def get_volume_usedby(provider_name: str,
|
127
|
+
config: models.VolumeConfig) -> List[str]:
|
128
|
+
"""Get the usedby of a volume."""
|
129
|
+
raise NotImplementedError
|
130
|
+
|
131
|
+
|
106
132
|
@_route_to_cloud_impl
|
107
133
|
def run_instances(provider_name: str, region: str, cluster_name_on_cloud: str,
|
108
134
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
@@ -11,3 +11,6 @@ from sky.provision.kubernetes.instance import wait_instances
|
|
11
11
|
from sky.provision.kubernetes.network import cleanup_ports
|
12
12
|
from sky.provision.kubernetes.network import open_ports
|
13
13
|
from sky.provision.kubernetes.network import query_ports
|
14
|
+
from sky.provision.kubernetes.volume import apply_volume
|
15
|
+
from sky.provision.kubernetes.volume import delete_volume
|
16
|
+
from sky.provision.kubernetes.volume import get_volume_usedby
|
@@ -3,7 +3,6 @@ import copy
|
|
3
3
|
import json
|
4
4
|
import time
|
5
5
|
from typing import Any, Callable, Dict, List, Optional, Union
|
6
|
-
import uuid
|
7
6
|
|
8
7
|
from sky import exceptions
|
9
8
|
from sky import sky_logging
|
@@ -15,6 +14,7 @@ from sky.provision import docker_utils
|
|
15
14
|
from sky.provision.kubernetes import config as config_lib
|
16
15
|
from sky.provision.kubernetes import network_utils
|
17
16
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
17
|
+
from sky.provision.kubernetes import volume
|
18
18
|
from sky.utils import command_runner
|
19
19
|
from sky.utils import common_utils
|
20
20
|
from sky.utils import config_utils
|
@@ -240,7 +240,7 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
|
|
240
240
|
extra_msg,
|
241
241
|
details=event_message))
|
242
242
|
raise config_lib.KubernetesError(f'{timeout_err_msg} '
|
243
|
-
f'Pod status: {pod_status}'
|
243
|
+
f'Pod status: {pod_status} '
|
244
244
|
f'Details: \'{event_message}\' ')
|
245
245
|
raise config_lib.KubernetesError(f'{timeout_err_msg}')
|
246
246
|
|
@@ -673,21 +673,6 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
|
|
673
673
|
raise e
|
674
674
|
|
675
675
|
|
676
|
-
def _create_persistent_volume_claim(namespace: str, context: Optional[str],
|
677
|
-
pvc_spec: Dict[str, Any]) -> None:
|
678
|
-
"""Creates a persistent volume claim for SkyServe controller."""
|
679
|
-
try:
|
680
|
-
kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
|
681
|
-
name=pvc_spec['metadata']['name'], namespace=namespace)
|
682
|
-
return
|
683
|
-
except kubernetes.api_exception() as e:
|
684
|
-
if e.status != 404: # Not found
|
685
|
-
raise
|
686
|
-
|
687
|
-
kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
|
688
|
-
namespace=namespace, body=pvc_spec)
|
689
|
-
|
690
|
-
|
691
676
|
@timeline.event
|
692
677
|
def _wait_for_deployment_pod(context,
|
693
678
|
namespace,
|
@@ -832,9 +817,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
832
817
|
# Worker pods
|
833
818
|
pod_spec_copy['metadata']['labels'].update(
|
834
819
|
constants.WORKER_NODE_TAGS)
|
835
|
-
|
836
|
-
pod_name
|
837
|
-
|
820
|
+
pod_name = f'{cluster_name_on_cloud}-worker{i}'
|
821
|
+
if pod_name in running_pods:
|
822
|
+
# If the pod is already running, we skip creating it.
|
823
|
+
return
|
824
|
+
pod_spec_copy['metadata']['name'] = pod_name
|
825
|
+
pod_spec_copy['metadata']['labels']['component'] = pod_name
|
838
826
|
# For multi-node support, we put a soft-constraint to schedule
|
839
827
|
# worker pods on different nodes than the head pod.
|
840
828
|
# This is not set as a hard constraint because if different nodes
|
@@ -888,7 +876,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
888
876
|
]
|
889
877
|
|
890
878
|
if to_create_deployment:
|
891
|
-
|
879
|
+
volume.create_persistent_volume_claim(namespace, context, pvc_spec)
|
892
880
|
|
893
881
|
# It's safe to directly modify the template spec in the deployment spec
|
894
882
|
# because controller pod is singleton, i in [0].
|
@@ -910,6 +898,10 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
910
898
|
print('Deployment failed', e)
|
911
899
|
raise e
|
912
900
|
|
901
|
+
# Check if any PVCs with access mode ReadWriteOnce or ReadWriteOncePod
|
902
|
+
# is used by any pod in the namespace.
|
903
|
+
volume.check_pvc_usage_for_pod(context, namespace, pod_spec_copy)
|
904
|
+
|
913
905
|
return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
|
914
906
|
context)
|
915
907
|
|
@@ -1012,40 +1004,6 @@ def stop_instances(
|
|
1012
1004
|
raise NotImplementedError()
|
1013
1005
|
|
1014
1006
|
|
1015
|
-
def _delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
|
1016
|
-
resource_name: str) -> None:
|
1017
|
-
"""Helper to delete Kubernetes resources with 404 handling and retries.
|
1018
|
-
|
1019
|
-
Args:
|
1020
|
-
delete_func: Function to call to delete the resource
|
1021
|
-
resource_type: Type of resource being deleted (e.g. 'service'),
|
1022
|
-
used in logging
|
1023
|
-
resource_name: Name of the resource being deleted, used in logging
|
1024
|
-
"""
|
1025
|
-
max_retries = 3
|
1026
|
-
retry_delay = 5 # seconds
|
1027
|
-
|
1028
|
-
for attempt in range(max_retries):
|
1029
|
-
try:
|
1030
|
-
delete_func()
|
1031
|
-
return
|
1032
|
-
except kubernetes.api_exception() as e:
|
1033
|
-
if e.status == 404:
|
1034
|
-
logger.warning(
|
1035
|
-
f'terminate_instances: Tried to delete {resource_type} '
|
1036
|
-
f'{resource_name}, but the {resource_type} was not '
|
1037
|
-
'found (404).')
|
1038
|
-
return
|
1039
|
-
elif attempt < max_retries - 1:
|
1040
|
-
logger.warning(f'terminate_instances: Failed to delete '
|
1041
|
-
f'{resource_type} {resource_name} (attempt '
|
1042
|
-
f'{attempt + 1}/{max_retries}). Error: {e}. '
|
1043
|
-
f'Retrying in {retry_delay} seconds...')
|
1044
|
-
time.sleep(retry_delay)
|
1045
|
-
else:
|
1046
|
-
raise
|
1047
|
-
|
1048
|
-
|
1049
1007
|
def _delete_services(name_prefix: str, namespace: str,
|
1050
1008
|
context: Optional[str]) -> None:
|
1051
1009
|
"""Delete services with the given name prefix.
|
@@ -1061,13 +1019,14 @@ def _delete_services(name_prefix: str, namespace: str,
|
|
1061
1019
|
# TODO(andyl): Wait for
|
1062
1020
|
# https://github.com/pylint-dev/pylint/issues/5263.
|
1063
1021
|
# pylint: disable=cell-var-from-loop
|
1064
|
-
|
1065
|
-
|
1066
|
-
|
1067
|
-
|
1068
|
-
|
1069
|
-
|
1070
|
-
|
1022
|
+
kubernetes_utils.delete_k8s_resource_with_retry(
|
1023
|
+
delete_func=lambda: kubernetes.core_api(
|
1024
|
+
context).delete_namespaced_service(name=service_name,
|
1025
|
+
namespace=namespace,
|
1026
|
+
_request_timeout=config_lib.
|
1027
|
+
DELETION_TIMEOUT),
|
1028
|
+
resource_type='service',
|
1029
|
+
resource_name=service_name)
|
1071
1030
|
|
1072
1031
|
|
1073
1032
|
def _terminate_node(namespace: str,
|
@@ -1087,7 +1046,7 @@ def _terminate_node(namespace: str,
|
|
1087
1046
|
# from within the pod, e.g., for autodown.
|
1088
1047
|
# Note - some misbehaving pods may not terminate gracefully if they have
|
1089
1048
|
# open file descriptors. We force delete pods to avoid this.
|
1090
|
-
|
1049
|
+
kubernetes_utils.delete_k8s_resource_with_retry(
|
1091
1050
|
delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
|
1092
1051
|
name=pod_name,
|
1093
1052
|
namespace=namespace,
|
@@ -1105,26 +1064,28 @@ def _terminate_deployment(cluster_name: str, namespace: str,
|
|
1105
1064
|
|
1106
1065
|
# Delete deployment
|
1107
1066
|
deployment_name = _get_deployment_name(cluster_name)
|
1108
|
-
|
1109
|
-
|
1110
|
-
|
1111
|
-
|
1112
|
-
|
1113
|
-
|
1114
|
-
|
1067
|
+
kubernetes_utils.delete_k8s_resource_with_retry(
|
1068
|
+
delete_func=lambda: kubernetes.apps_api(
|
1069
|
+
context).delete_namespaced_deployment(name=deployment_name,
|
1070
|
+
namespace=namespace,
|
1071
|
+
_request_timeout=config_lib.
|
1072
|
+
DELETION_TIMEOUT),
|
1073
|
+
resource_type='deployment',
|
1074
|
+
resource_name=deployment_name)
|
1115
1075
|
|
1116
1076
|
# Delete PVCs
|
1117
1077
|
pvc_name = _get_pvc_name(
|
1118
1078
|
cluster_name,
|
1119
1079
|
kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME)
|
1120
1080
|
# pylint: disable=cell-var-from-loop
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1081
|
+
kubernetes_utils.delete_k8s_resource_with_retry(
|
1082
|
+
delete_func=lambda: kubernetes.core_api(
|
1083
|
+
context).delete_namespaced_persistent_volume_claim(
|
1084
|
+
name=pvc_name,
|
1085
|
+
namespace=namespace,
|
1086
|
+
_request_timeout=config_lib.DELETION_TIMEOUT),
|
1087
|
+
resource_type='pvc',
|
1088
|
+
resource_name=pvc_name)
|
1128
1089
|
|
1129
1090
|
|
1130
1091
|
def terminate_instances(
|