skypilot-nightly 1.0.0.dev20250523__py3-none-any.whl → 1.0.0.dev20250526__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +62 -45
- sky/backends/cloud_vm_ray_backend.py +3 -1
- sky/check.py +335 -170
- sky/cli.py +56 -13
- sky/client/cli.py +56 -13
- sky/client/sdk.py +54 -10
- sky/clouds/gcp.py +19 -3
- sky/core.py +5 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/7GEgRyZKRaSnYZCV1Jwol/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/25-062253ea41fb8eec.js +6 -0
- sky/dashboard/out/_next/static/chunks/480-5a0de8b6570ea105.js +1 -0
- sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +15 -0
- sky/dashboard/out/_next/static/chunks/498-d7722313e5e5b4e6.js +21 -0
- sky/dashboard/out/_next/static/chunks/573-f17bd89d9f9118b3.js +66 -0
- sky/dashboard/out/_next/static/chunks/578-d351125af46c293f.js +6 -0
- sky/dashboard/out/_next/static/chunks/734-a6e01d7f98904741.js +1 -0
- sky/dashboard/out/_next/static/chunks/937.f97f83652028e944.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-59956af3950b02ed.js +1 -0
- sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-96a715a6fb01e228.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-3b5aad09a25f64b7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-abb7d744ecf15109.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-48dc8d67d4b60be1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/users-b8acf6e6735323a2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-bbf436f41381e169.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7733c960685b4385.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5ed48b3201b998c8.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +1 -0
- sky/dashboard/out/_next/static/css/28558d57108b05ae.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/storage.py +1 -1
- sky/global_user_state.py +606 -543
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +72 -56
- sky/jobs/state.py +26 -5
- sky/jobs/utils.py +65 -13
- sky/optimizer.py +6 -3
- sky/provision/fluidstack/instance.py +1 -0
- sky/serve/server/core.py +9 -6
- sky/server/html/token_page.html +6 -1
- sky/server/requests/executor.py +1 -0
- sky/server/requests/payloads.py +28 -0
- sky/server/server.py +59 -5
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/constants.py +4 -1
- sky/skypilot_config.py +107 -11
- sky/utils/cli_utils/status_utils.py +18 -8
- sky/utils/db_utils.py +53 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +166 -147
- sky/utils/kubernetes/kubernetes_deploy_utils.py +49 -5
- sky/utils/kubernetes/ssh-tunnel.sh +20 -28
- sky/utils/log_utils.py +4 -0
- sky/utils/schemas.py +54 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +295 -0
- sky/workspaces/server.py +62 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/RECORD +79 -63
- sky/dashboard/out/_next/static/ECKwDNS9v9y3_IKFZ2lpp/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +0 -6
- sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +0 -1
- sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +0 -1
- sky/dashboard/out/_next/static/chunks/582-683f4f27b81996dc.js +0 -59
- sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +0 -3
- /sky/dashboard/out/_next/static/{ECKwDNS9v9y3_IKFZ2lpp → 7GEgRyZKRaSnYZCV1Jwol}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
@@ -10,14 +10,20 @@ import json
 import os
 import pathlib
 import pickle
-import sqlite3
 import time
 import typing
 from typing import Any, Dict, List, Optional, Set, Tuple
 import uuid
 
+import sqlalchemy
+from sqlalchemy import exc as sqlalchemy_exc
+from sqlalchemy import orm
+from sqlalchemy.dialects import sqlite
+from sqlalchemy.ext import declarative
+
 from sky import models
 from sky import sky_logging
+from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import context_utils
 from sky.utils import db_utils
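The headline change in this file: the hand-rolled `sqlite3` connection is replaced by SQLAlchemy. A minimal, self-contained sketch of the engine/session pattern the new code adopts (the table and database path here are illustrative, not SkyPilot's real schema):

```python
# Minimal sketch of the engine/session pattern adopted in this diff.
# 'example.db' and the 'example' table are illustrative only.
import sqlalchemy
from sqlalchemy import orm

engine = sqlalchemy.create_engine('sqlite:///example.db')
metadata = sqlalchemy.MetaData()
example_table = sqlalchemy.Table(
    'example', metadata,
    sqlalchemy.Column('key', sqlalchemy.Text, primary_key=True),
    sqlalchemy.Column('value', sqlalchemy.Text),
)
# Idempotent: only creates tables that do not already exist.
metadata.create_all(bind=engine)

with orm.Session(engine) as session:
    session.execute(example_table.insert().values(key='k', value='v'))
    session.commit()
```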
@@ -37,159 +43,215 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
 _DB_PATH = os.path.expanduser('~/.sky/state.db')
 pathlib.Path(_DB_PATH).parents[0].mkdir(parents=True, exist_ok=True)
 
-
-def create_table(cursor, conn):
+_SQLALCHEMY_ENGINE = sqlalchemy.create_engine(f'sqlite:///{_DB_PATH}')
+
+Base = declarative.declarative_base()
+
+config_table = sqlalchemy.Table(
+    'config',
+    Base.metadata,
+    sqlalchemy.Column('key', sqlalchemy.Text, primary_key=True),
+    sqlalchemy.Column('value', sqlalchemy.Text),
+)
+
+user_table = sqlalchemy.Table(
+    'users',
+    Base.metadata,
+    sqlalchemy.Column('id', sqlalchemy.Text, primary_key=True),
+    sqlalchemy.Column('name', sqlalchemy.Text),
+)
+
+cluster_table = sqlalchemy.Table(
+    'clusters',
+    Base.metadata,
+    sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
+    sqlalchemy.Column('launched_at', sqlalchemy.Integer),
+    sqlalchemy.Column('handle', sqlalchemy.LargeBinary),
+    sqlalchemy.Column('last_use', sqlalchemy.Text),
+    sqlalchemy.Column('status', sqlalchemy.Text),
+    sqlalchemy.Column('autostop', sqlalchemy.Integer, server_default='-1'),
+    sqlalchemy.Column('to_down', sqlalchemy.Integer, server_default='0'),
+    sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
+    sqlalchemy.Column('owner', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('cluster_hash', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('storage_mounts_metadata',
+                      sqlalchemy.LargeBinary,
+                      server_default=None),
+    sqlalchemy.Column('cluster_ever_up', sqlalchemy.Integer,
+                      server_default='0'),
+    sqlalchemy.Column('status_updated_at',
+                      sqlalchemy.Integer,
+                      server_default=None),
+    sqlalchemy.Column('config_hash', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('user_hash', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('workspace',
+                      sqlalchemy.Text,
+                      server_default=constants.SKYPILOT_DEFAULT_WORKSPACE),
+)
+
+storage_table = sqlalchemy.Table(
+    'storage',
+    Base.metadata,
+    sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
+    sqlalchemy.Column('launched_at', sqlalchemy.Integer),
+    sqlalchemy.Column('handle', sqlalchemy.LargeBinary),
+    sqlalchemy.Column('last_use', sqlalchemy.Text),
+    sqlalchemy.Column('status', sqlalchemy.Text),
+)
+
+# Table for Cluster History
+# usage_intervals: List[Tuple[int, int]]
+#  Specifies start and end timestamps of cluster.
+#  When the last end time is None, the cluster is still UP.
+#  Example: [(start1, end1), (start2, end2), (start3, None)]
+
+# requested_resources: Set[resource_lib.Resource]
+#  Requested resources fetched from task that user specifies.
+
+# launched_resources: Optional[resources_lib.Resources]
+#  Actual launched resources fetched from handle for cluster.
+
+# num_nodes: Optional[int] number of nodes launched.
+cluster_history_table = sqlalchemy.Table(
+    'cluster_history',
+    Base.metadata,
+    sqlalchemy.Column('cluster_hash', sqlalchemy.Text, primary_key=True),
+    sqlalchemy.Column('name', sqlalchemy.Text),
+    sqlalchemy.Column('num_nodes', sqlalchemy.Integer),
+    sqlalchemy.Column('requested_resources', sqlalchemy.LargeBinary),
+    sqlalchemy.Column('launched_resources', sqlalchemy.LargeBinary),
+    sqlalchemy.Column('usage_intervals', sqlalchemy.LargeBinary),
+    sqlalchemy.Column('user_hash', sqlalchemy.Text),
+)
+
+
+def create_table():
     # Enable WAL mode to avoid locking issues.
     # See: issue #1441 and PR #1509
     # https://github.com/microsoft/WSL/issues/2395
     # TODO(romilb): We do not enable WAL for WSL because of known issue in WSL.
     # This may cause the database locked problem from WSL issue #1441.
-    if not common_utils.is_wsl():
+    if (_SQLALCHEMY_ENGINE.dialect.name
+            == db_utils.SQLAlchemyDialect.SQLITE.value and
+            not common_utils.is_wsl()):
         try:
-            cursor.execute('PRAGMA journal_mode=WAL')
-        except sqlite3.OperationalError as e:
+            with orm.Session(_SQLALCHEMY_ENGINE) as session:
+                session.execute(sqlalchemy.text('PRAGMA journal_mode=WAL'))
+                session.commit()
+        except sqlalchemy_exc.OperationalError as e:
             if 'database is locked' not in str(e):
                 raise
             # If the database is locked, it is OK to continue, as the WAL mode
             # is not critical and is likely to be enabled by other processes.
 
-    # Table for Clusters
-        name TEXT PRIMARY KEY,
-        launched_at INTEGER,
-        handle BLOB,
-        last_use TEXT,
-        status TEXT,
-        autostop INTEGER DEFAULT -1,
-        metadata TEXT DEFAULT '{}',
-        to_down INTEGER DEFAULT 0,
-        owner TEXT DEFAULT null,
-        cluster_hash TEXT DEFAULT null,
-        storage_mounts_metadata BLOB DEFAULT null,
-        cluster_ever_up INTEGER DEFAULT 0,
-        status_updated_at INTEGER DEFAULT null,
-        config_hash TEXT DEFAULT null,
-        user_hash TEXT DEFAULT null)""")
-
-    # Table for Cluster History
-    # usage_intervals: List[Tuple[int, int]]
-    #  Specifies start and end timestamps of cluster.
-    #  When the last end time is None, the cluster is still UP.
-    #  Example: [(start1, end1), (start2, end2), (start3, None)]
-
-    # requested_resources: Set[resource_lib.Resource]
-    #  Requested resources fetched from task that user specifies.
-
-    # launched_resources: Optional[resources_lib.Resources]
-    #  Actual launched resources fetched from handle for cluster.
-
-    # num_nodes: Optional[int] number of nodes launched.
-
-    cursor.execute("""\
-        CREATE TABLE IF NOT EXISTS cluster_history (
-        cluster_hash TEXT PRIMARY KEY,
-        name TEXT,
-        num_nodes int,
-        requested_resources BLOB,
-        launched_resources BLOB,
-        usage_intervals BLOB,
-        user_hash TEXT)""")
-    # Table for configs (e.g. enabled clouds)
-    cursor.execute("""\
-        CREATE TABLE IF NOT EXISTS config (
-        key TEXT PRIMARY KEY, value TEXT)""")
-    # Table for Storage
-    cursor.execute("""\
-        CREATE TABLE IF NOT EXISTS storage (
-        name TEXT PRIMARY KEY,
-        launched_at INTEGER,
-        handle BLOB,
-        last_use TEXT,
-        status TEXT)""")
-    # Table for User
-    cursor.execute("""\
-        CREATE TABLE IF NOT EXISTS users (
-        id TEXT PRIMARY KEY,
-        name TEXT)""")
+    # Create tables if they don't exist
+    Base.metadata.create_all(bind=_SQLALCHEMY_ENGINE)
+
     # For backward compatibility.
     # TODO(zhwu): Remove this function after all users have migrated to
     # the latest version of SkyPilot.
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Add autostop column to clusters table
+        db_utils.add_column_to_table_sqlalchemy(session, 'clusters', 'autostop',
+                                                'INTEGER DEFAULT -1')
+
+        db_utils.add_column_to_table_sqlalchemy(session, 'clusters', 'metadata',
+                                                'TEXT DEFAULT \'{}\'')
+
+        db_utils.add_column_to_table_sqlalchemy(session, 'clusters', 'to_down',
+                                                'INTEGER DEFAULT 0')
+
+        # The cloud identity that created the cluster.
+        db_utils.add_column_to_table_sqlalchemy(session, 'clusters', 'owner',
+                                                'TEXT')
+
+        db_utils.add_column_to_table_sqlalchemy(session, 'clusters',
+                                                'cluster_hash',
+                                                'TEXT DEFAULT null')
+
+        db_utils.add_column_to_table_sqlalchemy(session, 'clusters',
+                                                'storage_mounts_metadata',
+                                                'BLOB DEFAULT null')
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'clusters',
+            'cluster_ever_up',
+            'INTEGER DEFAULT 0',
+            # Set the value to 1 so that all the existing clusters before #2977
+            # are considered as ever up, i.e:
+            #   existing cluster's default (null) -> 1;
+            #   new cluster's default -> 0;
+            # This is conservative for the existing clusters: even if some INIT
+            # clusters were never really UP, setting it to 1 means they won't be
+            # auto-deleted during any failover.
+            value_to_replace_existing_entries=1)
+        db_utils.add_column_to_table_sqlalchemy(session, 'clusters',
+                                                'status_updated_at',
+                                                'INTEGER DEFAULT null')
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'clusters',
+            'user_hash',
+            'TEXT DEFAULT null',
+            value_to_replace_existing_entries=common_utils.get_user_hash())
+        db_utils.add_column_to_table_sqlalchemy(session, 'clusters',
+                                                'config_hash',
+                                                'TEXT DEFAULT null')
+
+        db_utils.add_column_to_table_sqlalchemy(session, 'cluster_history',
+                                                'user_hash',
+                                                'TEXT DEFAULT null')
+
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'clusters',
+            'workspace',
+            'TEXT DEFAULT \'default\'',
+            value_to_replace_existing_entries=constants.
+            SKYPILOT_DEFAULT_WORKSPACE)
+        session.commit()
+
+
+create_table()
 
 
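The migration calls above go through a new helper, `db_utils.add_column_to_table_sqlalchemy` (added to `sky/utils/db_utils.py` in this diff, +53 lines; its implementation is not shown here). A plausible sketch of what such a helper does, assuming it mirrors the old cursor-based `add_column_to_table` — the body, signature details, and error handling below are guesses for illustration, not SkyPilot's actual code:

```python
# Hypothetical sketch of db_utils.add_column_to_table_sqlalchemy.
# The real implementation lives in sky/utils/db_utils.py and may differ.
import sqlalchemy
from sqlalchemy import exc as sqlalchemy_exc
from sqlalchemy import orm


def add_column_to_table_sqlalchemy(session: orm.Session, table_name: str,
                                   column_name: str, column_type: str,
                                   value_to_replace_existing_entries=None):
    """Idempotently add a column; optionally backfill existing rows."""
    try:
        session.execute(
            sqlalchemy.text(f'ALTER TABLE {table_name} '
                            f'ADD COLUMN {column_name} {column_type}'))
        if value_to_replace_existing_entries is not None:
            # Backfill rows that predate the column.
            session.execute(
                sqlalchemy.text(f'UPDATE {table_name} SET {column_name} = :val '
                                f'WHERE {column_name} IS NULL'),
                {'val': value_to_replace_existing_entries})
    except sqlalchemy_exc.OperationalError as e:
        if 'duplicate column name' not in str(e):
            raise
        # Otherwise the column already exists: migration already applied.
```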
 def add_or_update_user(user: models.User):
     """Store the mapping from user hash to user name for display purposes."""
     if user.name is None:
         return
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_stmnt = sqlite.insert(user_table).values(id=user.id,
+                                                            name=user.name)
+            do_update_stmt = insert_stmnt.on_conflict_do_update(
+                index_elements=[user_table.c.id],
+                set_={user_table.c.name: user.name})
+            session.execute(do_update_stmt)
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            # TODO(syang) support postgres dialect
+            session.rollback()
+            raise ValueError('Unsupported database dialect')
+        else:
+            session.rollback()
+            raise ValueError('Unsupported database dialect')
+        session.commit()
 
 
 def get_user(user_id: str) -> models.User:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(user_table).filter_by(id=user_id).first()
     if row is None:
         return models.User(id=user_id)
-    return models.User(id=row[0], name=row[1])
+    return models.User(id=row.id, name=row.name)
 
 
 def get_all_users() -> List[models.User]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(user_table).all()
+        return [models.User(id=row.id, name=row.name) for row in rows]
 
 
 def add_or_update_cluster(cluster_name: str,
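`add_or_update_user` is the first of several functions in this file that replace the old `INSERT OR REPLACE` with SQLite's dialect-specific upsert, `sqlite.insert(...).on_conflict_do_update(...)`. A runnable, self-contained demo of the pattern (the table and values are illustrative):

```python
# Demo of the SQLite upsert used above: a second write with the same
# primary key updates the row instead of raising an integrity error.
import sqlalchemy
from sqlalchemy import orm
from sqlalchemy.dialects import sqlite

engine = sqlalchemy.create_engine('sqlite:///:memory:')
metadata = sqlalchemy.MetaData()
users = sqlalchemy.Table(
    'users', metadata,
    sqlalchemy.Column('id', sqlalchemy.Text, primary_key=True),
    sqlalchemy.Column('name', sqlalchemy.Text),
)
metadata.create_all(engine)

with orm.Session(engine) as session:
    stmt = sqlite.insert(users).values(id='u1', name='alice')
    stmt = stmt.on_conflict_do_update(index_elements=[users.c.id],
                                      set_={users.c.name: 'alice'})
    session.execute(stmt)
    session.execute(stmt)  # Idempotent: conflicts become an UPDATE.
    session.commit()
```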
@@ -209,6 +271,9 @@ def add_or_update_cluster(cluster_name: str,
     is_launch: if the cluster is firstly launched. If True, the launched_at
         and last_use will be updated. Otherwise, use the old value.
     """
+    # TODO(zhwu): have to be imported here to avoid circular import.
+    from sky import skypilot_config  # pylint: disable=import-outside-toplevel
+
     # FIXME: launched_at will be changed when `sky launch -c` is called.
     handle = pickle.dumps(cluster_handle)
     cluster_launched_at = int(time.time()) if is_launch else None
@@ -242,142 +307,118 @@ def add_or_update_cluster(cluster_name: str,
         usage_intervals.append((cluster_launched_at, None))
 
     user_hash = common_utils.get_user_hash()
+    active_workspace = skypilot_config.get_active_workspace()
 
-        # number of nodes
-        '?, '
-        # usage intervals
-        '?, '
-        # user_hash
-        '?'
-        ')',
-        (
-            # hash
-            cluster_hash,
-            # name
-            cluster_name,
-            # number of nodes
-            launched_nodes,
-            # requested resources
-            pickle.dumps(requested_resources),
-            # launched resources
-            pickle.dumps(launched_resources),
-            # usage intervals
-            pickle.dumps(usage_intervals),
-            # user_hash
-            user_hash,
-        ))
-
-    _DB.conn.commit()
+    conditional_values = {}
+    if is_launch:
+        conditional_values.update({
+            'launched_at': cluster_launched_at,
+            'last_use': last_use
+        })
+
+    if int(ready) == 1:
+        conditional_values.update({
+            'cluster_ever_up': 1,
+        })
+
+    if config_hash is not None:
+        conditional_values.update({
+            'config_hash': config_hash,
+        })
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # with_for_update() locks the row until commit() or rollback()
+        # is called, or until the code escapes the with block.
+        cluster_row = session.query(cluster_table).filter_by(
+            name=cluster_name).with_for_update().first()
+        if (not cluster_row or
+                cluster_row.status == status_lib.ClusterStatus.STOPPED.value):
+            conditional_values.update({
+                'autostop': -1,
+                'to_down': 0,
+            })
+        if not cluster_row or not cluster_row.user_hash:
+            conditional_values.update({
+                'user_hash': user_hash,
+            })
+        if not cluster_row or not cluster_row.workspace:
+            conditional_values.update({
+                'workspace': active_workspace,
+            })
+
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_stmnt = sqlite.insert(cluster_table).values(
+                name=cluster_name,
+                **conditional_values,
+                handle=handle,
+                status=status.value,
+                # set metadata to server default ('{}')
+                # set owner to server default (null)
+                cluster_hash=cluster_hash,
+                # set storage_mounts_metadata to server default (null)
+                status_updated_at=status_updated_at,
+            )
+            do_update_stmt = insert_stmnt.on_conflict_do_update(
+                index_elements=[cluster_table.c.name],
+                set_={
+                    **conditional_values,
+                    cluster_table.c.handle: handle,
+                    cluster_table.c.status: status.value,
+                    # do not update metadata value
+                    # do not update owner value
+                    cluster_table.c.cluster_hash: cluster_hash,
+                    # do not update storage_mounts_metadata
+                    cluster_table.c.status_updated_at: status_updated_at,
+                    # do not update user_hash
+                })
+            session.execute(do_update_stmt)
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            # TODO(syang) support postgres dialect
+            session.rollback()
+            raise ValueError('Unsupported database dialect')
+        else:
+            session.rollback()
+            raise ValueError('Unsupported database dialect')
+
+        # Modify cluster history table
+        launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
+        launched_resources = getattr(cluster_handle, 'launched_resources', None)
+
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_stmnt = sqlite.insert(cluster_history_table).values(
+                cluster_hash=cluster_hash,
+                name=cluster_name,
+                num_nodes=launched_nodes,
+                requested_resources=pickle.dumps(requested_resources),
+                launched_resources=pickle.dumps(launched_resources),
+                usage_intervals=pickle.dumps(usage_intervals),
+                user_hash=user_hash)
+            do_update_stmt = insert_stmnt.on_conflict_do_update(
+                index_elements=[cluster_history_table.c.cluster_hash],
+                set_={
+                    cluster_history_table.c.name: cluster_name,
+                    cluster_history_table.c.num_nodes: launched_nodes,
+                    cluster_history_table.c.requested_resources:
+                        pickle.dumps(requested_resources),
+                    cluster_history_table.c.launched_resources:
+                        pickle.dumps(launched_resources),
+                    cluster_history_table.c.usage_intervals:
+                        pickle.dumps(usage_intervals),
+                    cluster_history_table.c.user_hash: user_hash
+                })
+            session.execute(do_update_stmt)
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            # TODO(syang) support postgres dialect
+            session.rollback()
+            raise ValueError('Unsupported database dialect')
+        else:
+            session.rollback()
+            raise ValueError('Unsupported database dialect')
+        session.commit()
 
 
 def _get_user_hash_or_current_user(user_hash: Optional[str]) -> str:
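The comment in the new `add_or_update_cluster` explains the locking: `with_for_update()` emits `SELECT ... FOR UPDATE`, holding the row until commit or rollback. A sketch of the same read-modify-write shape, under the caveat that SQLAlchemy's SQLite dialect does not render `FOR UPDATE` (SQLite locks at the database level), so the clause mostly matters for the Postgres support the TODOs anticipate. The function and arguments below are illustrative:

```python
# Sketch of the locked read-modify-write pattern used above.
# 'engine' and 'cluster_table' are assumed to be set up as in this module.
from sqlalchemy import orm


def bump_status(engine, cluster_table, name: str, new_status: str) -> None:
    with orm.Session(engine) as session:
        # Lock the row (where the dialect supports it) so concurrent
        # writers serialize on this cluster.
        row = session.query(cluster_table).filter_by(
            name=name).with_for_update().first()
        if row is None:
            return
        session.query(cluster_table).filter_by(name=name).update(
            {cluster_table.c.status: new_status})
        session.commit()  # Releases the lock.
```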
@@ -395,16 +436,18 @@ def _get_user_hash_or_current_user(user_hash: Optional[str]) -> str:
 def update_cluster_handle(cluster_name: str,
                           cluster_handle: 'backends.ResourceHandle'):
     handle = pickle.dumps(cluster_handle)
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(cluster_table).filter_by(name=cluster_name).update(
+            {cluster_table.c.handle: handle})
+        session.commit()
 
 
 def update_last_use(cluster_name: str):
     """Updates the last used command for the cluster."""
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(cluster_table).filter_by(name=cluster_name).update(
+            {cluster_table.c.last_use: common_utils.get_current_command()})
+        session.commit()
 
 
 def remove_cluster(cluster_name: str, terminate: bool) -> None:
@@ -412,63 +455,73 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
     cluster_hash = _get_hash_for_existing_cluster(cluster_name)
     usage_intervals = _get_cluster_usage_intervals(cluster_hash)
 
-        ))
-    _DB.conn.commit()
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # usage_intervals is not None and not empty
+        if usage_intervals:
+            assert cluster_hash is not None, cluster_name
+            start_time = usage_intervals.pop()[0]
+            end_time = int(time.time())
+            usage_intervals.append((start_time, end_time))
+            _set_cluster_usage_intervals(cluster_hash, usage_intervals)
+
+        if terminate:
+            session.query(cluster_table).filter_by(name=cluster_name).delete()
+        else:
+            handle = get_handle_from_cluster_name(cluster_name)
+            if handle is None:
+                return
+            # Must invalidate IP list to avoid directly trying to ssh into a
+            # stopped VM, which leads to timeout.
+            if hasattr(handle, 'stable_internal_external_ips'):
+                handle = typing.cast('backends.CloudVmRayResourceHandle',
+                                     handle)
+                handle.stable_internal_external_ips = None
+            current_time = int(time.time())
+            session.query(cluster_table).filter_by(name=cluster_name).update({
+                cluster_table.c.handle: pickle.dumps(handle),
+                cluster_table.c.status: status_lib.ClusterStatus.STOPPED.value,
+                cluster_table.c.status_updated_at: current_time
+            })
+        session.commit()
 
 
 def get_handle_from_cluster_name(
         cluster_name: str) -> Optional['backends.ResourceHandle']:
     assert cluster_name is not None, 'cluster_name cannot be None'
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+    if row is None:
+        return None
+    return pickle.loads(row.handle)
 
 
 def get_glob_cluster_names(cluster_name: str) -> List[str]:
     assert cluster_name is not None, 'cluster_name cannot be None'
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            rows = session.query(cluster_table).filter(
+                cluster_table.c.name.op('GLOB')(cluster_name)).all()
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            # TODO(syang) support postgres dialect
+            # postgres does not support GLOB
+            raise ValueError('Unsupported database dialect')
+        else:
+            raise ValueError('Unsupported database dialect')
+        return [row.name for row in rows]
 
 
 def set_cluster_status(cluster_name: str,
                        status: status_lib.ClusterStatus) -> None:
     current_time = int(time.time())
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update({
+                cluster_table.c.status: status.value,
+                cluster_table.c.status_updated_at: current_time
+            })
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
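`GLOB` is SQLite-specific, so `get_glob_cluster_names` reaches it through SQLAlchemy's generic `.op()` escape hatch (and raises for other dialects, as the Postgres TODO notes). A runnable illustration with a made-up table:

```python
# .op('GLOB') renders "clusters.name GLOB ?" -- SQLite's shell-style
# pattern match, which has no portable SQLAlchemy operator.
import sqlalchemy

engine = sqlalchemy.create_engine('sqlite:///:memory:')
metadata = sqlalchemy.MetaData()
clusters = sqlalchemy.Table(
    'clusters', metadata,
    sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True))
metadata.create_all(engine)

with engine.connect() as conn:
    conn.execute(clusters.insert(),
                 [{'name': 'dev-1'}, {'name': 'dev-2'}, {'name': 'prod-1'}])
    stmt = sqlalchemy.select(clusters.c.name).where(
        clusters.c.name.op('GLOB')('dev-*'))
    print([r.name for r in conn.execute(stmt)])  # ['dev-1', 'dev-2']
    conn.commit()
```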
@@ -476,46 +529,40 @@ def set_cluster_status(cluster_name: str,
 def set_cluster_autostop_value(cluster_name: str, idle_minutes: int,
                                to_down: bool) -> None:
-    _DB.conn.commit()
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update({
+                cluster_table.c.autostop: idle_minutes,
+                cluster_table.c.to_down: int(to_down)
+            })
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
 
 
 def get_cluster_launch_time(cluster_name: str) -> Optional[int]:
-        return int(launch_time)
-    return None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+    if row is None or row.launched_at is None:
+        return None
+    return int(row.launched_at)
 
 
 def get_cluster_info(cluster_name: str) -> Optional[Dict[str, Any]]:
-        return json.loads(metadata)
-    return None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+    if row is None or row.metadata is None:
+        return None
+    return json.loads(row.metadata)
 
 
 def set_cluster_info(cluster_name: str, metadata: Dict[str, Any]) -> None:
-    _DB.conn.commit()
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update(
+                {cluster_table.c.metadata: json.dumps(metadata)})
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
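Throughout this file, the return value of `Query.update()` — the number of rows matched — stands in for the old `cursor.rowcount`, preserving the `assert count <= 1` / not-found checks. A minimal demo of the pattern with an illustrative table:

```python
# Query.update() returns the matched-row count, replacing cursor.rowcount.
import sqlalchemy
from sqlalchemy import orm

engine = sqlalchemy.create_engine('sqlite:///:memory:')
metadata = sqlalchemy.MetaData()
t = sqlalchemy.Table(
    't', metadata,
    sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
    sqlalchemy.Column('status', sqlalchemy.Text))
metadata.create_all(engine)

with orm.Session(engine) as session:
    session.execute(t.insert().values(name='a', status='INIT'))
    count = session.query(t).filter_by(name='a').update(
        {t.c.status: 'UP'})
    session.commit()
    assert count == 1  # 0 would mean "no such row" -> raise, as above.
```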
@@ -523,25 +570,22 @@ def set_cluster_info(cluster_name: str, metadata: Dict[str, Any]) -> None:
 def get_cluster_storage_mounts_metadata(
         cluster_name: str) -> Optional[Dict[str, Any]]:
-        return None
-    return pickle.loads(storage_mounts_metadata)
-    return None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+    if row is None or row.storage_mounts_metadata is None:
+        return None
+    return pickle.loads(row.storage_mounts_metadata)
 
 
 def set_cluster_storage_mounts_metadata(
         cluster_name: str, storage_mounts_metadata: Dict[str, Any]) -> None:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update({
+                cluster_table.c.storage_mounts_metadata:
+                    pickle.dumps(storage_mounts_metadata)
+            })
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
@@ -552,14 +596,12 @@ def _get_cluster_usage_intervals(
 ) -> Optional[List[Tuple[int, Optional[int]]]]:
     if cluster_hash is None:
         return None
-        return pickle.loads(usage_intervals)
-    return None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_history_table).filter_by(
+            cluster_hash=cluster_hash).first()
+    if row is None or row.usage_intervals is None:
+        return None
+    return pickle.loads(row.usage_intervals)
@@ -591,15 +633,13 @@ def _get_cluster_duration(cluster_hash: str) -> int:
 def _set_cluster_usage_intervals(
         cluster_hash: str, usage_intervals: List[Tuple[int,
                                                        Optional[int]]]) -> None:
-    count = _DB.cursor.rowcount
-    _DB.conn.commit()
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_history_table).filter_by(
+            cluster_hash=cluster_hash).update({
+                cluster_history_table.c.usage_intervals:
+                    pickle.dumps(usage_intervals)
+            })
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster hash {cluster_hash} not found.')
@@ -610,38 +650,38 @@ def set_owner_identity_for_cluster(cluster_name: str,
     if owner_identity is None:
         return
     owner_identity_str = json.dumps(owner_identity)
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update(
+                {cluster_table.c.owner: owner_identity_str})
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
 
 
 def _get_hash_for_existing_cluster(cluster_name: str) -> Optional[str]:
-        return cluster_hash
-    return None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+    if row is None or row.cluster_hash is None:
+        return None
+    return row.cluster_hash
 
 
 def get_launched_resources_from_cluster_hash(
         cluster_hash: str) -> Optional[Tuple[int, Any]]:
-    if num_nodes is None or launched_resources is None:
-        return None
-    launched_resources = pickle.loads(launched_resources)
-    return num_nodes, launched_resources
-    return None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_history_table).filter_by(
+            cluster_hash=cluster_hash).first()
+    if row is None:
+        return None
+    num_nodes = row.num_nodes
+    launched_resources = row.launched_resources
+
+    if num_nodes is None or launched_resources is None:
+        return None
+    launched_resources = pickle.loads(launched_resources)
+    return num_nodes, launched_resources
@@ -675,74 +715,62 @@ def _load_storage_mounts_metadata(
 @context_utils.cancellation_guard
 def get_cluster_from_name(
         cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
-            'cluster_ever_up': bool(cluster_ever_up),
-            'status_updated_at': status_updated_at,
-            'user_hash': user_hash,
-            'user_name': get_user(user_hash).name,
-            'config_hash': config_hash,
-        }
-        return record
-    return None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+    if row is None:
+        return None
+    user_hash = _get_user_hash_or_current_user(row.user_hash)
+    # TODO: use namedtuple instead of dict
+    record = {
+        'name': row.name,
+        'launched_at': row.launched_at,
+        'handle': pickle.loads(row.handle),
+        'last_use': row.last_use,
+        'status': status_lib.ClusterStatus[row.status],
+        'autostop': row.autostop,
+        'to_down': bool(row.to_down),
+        'owner': _load_owner(row.owner),
+        'metadata': json.loads(row.metadata),
+        'cluster_hash': row.cluster_hash,
+        'storage_mounts_metadata': _load_storage_mounts_metadata(
+            row.storage_mounts_metadata),
+        'cluster_ever_up': bool(row.cluster_ever_up),
+        'status_updated_at': row.status_updated_at,
+        'user_hash': user_hash,
+        'user_name': get_user(user_hash).name,
+        'config_hash': row.config_hash,
+        'workspace': row.workspace,
+    }
+    return record
 
 
 def get_clusters() -> List[Dict[str, Any]]:
-        'cluster_ever_up, status_updated_at, config_hash, user_hash '
-        'from clusters order by launched_at desc').fetchall()
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(cluster_table).order_by(
+            sqlalchemy.desc(cluster_table.c.launched_at)).all()
     records = []
     for row in rows:
-        (name, launched_at, handle, last_use, status, autostop, metadata,
-         to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
-         status_updated_at, config_hash, user_hash) = row
-        user_hash = _get_user_hash_or_current_user(user_hash)
+        user_hash = _get_user_hash_or_current_user(row.user_hash)
         # TODO: use namedtuple instead of dict
         record = {
-            'name': name,
-            'launched_at': launched_at,
-            'handle': pickle.loads(handle),
-            'last_use': last_use,
-            'status': status_lib.ClusterStatus[status],
-            'autostop': autostop,
-            'to_down': bool(to_down),
-            'owner': _load_owner(owner),
-            'metadata': json.loads(metadata),
-            'cluster_hash': cluster_hash,
-            'storage_mounts_metadata':
-                _load_storage_mounts_metadata(storage_mounts_metadata),
-            'cluster_ever_up': bool(cluster_ever_up),
-            'status_updated_at': status_updated_at,
+            'name': row.name,
+            'launched_at': row.launched_at,
+            'handle': pickle.loads(row.handle),
+            'last_use': row.last_use,
+            'status': status_lib.ClusterStatus[row.status],
+            'autostop': row.autostop,
+            'to_down': bool(row.to_down),
+            'owner': _load_owner(row.owner),
+            'metadata': json.loads(row.metadata),
+            'cluster_hash': row.cluster_hash,
+            'storage_mounts_metadata': _load_storage_mounts_metadata(
+                row.storage_mounts_metadata),
+            'cluster_ever_up': bool(row.cluster_ever_up),
+            'status_updated_at': row.status_updated_at,
             'user_hash': user_hash,
             'user_name': get_user(user_hash).name,
-            'config_hash': config_hash,
+            'config_hash': row.config_hash,
+            'workspace': row.workspace,
         }
 
         records.append(record)
@@ -750,43 +778,30 @@ def get_clusters() -> List[Dict[str, Any]]:
 def get_clusters_from_history() -> List[Dict[str, Any]]:
-        'ON ch.cluster_hash=clusters.cluster_hash ').fetchall()
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(
+            cluster_history_table.join(cluster_table,
+                                       cluster_history_table.c.cluster_hash ==
+                                       cluster_table.c.cluster_hash,
+                                       isouter=True)).all()
 
     # '(cluster_hash, name, num_nodes, requested_resources, '
     # 'launched_resources, usage_intervals) '
     records = []
     for row in rows:
         # TODO: use namedtuple instead of dict
-        (
-            cluster_hash,
-            name,
-            num_nodes,
-            launched_resources,
-            usage_intervals,
-            status,
-            user_hash,
-        ) = row[:7]
-        user_hash = _get_user_hash_or_current_user(user_hash)
+        user_hash = _get_user_hash_or_current_user(row.user_hash)
+        status = row.status
         if status is not None:
             status = status_lib.ClusterStatus[status]
         record = {
-            'name': name,
-            'launched_at': _get_cluster_launch_time(cluster_hash),
-            'duration': _get_cluster_duration(cluster_hash),
-            'num_nodes': num_nodes,
-            'resources': pickle.loads(launched_resources),
-            'cluster_hash': cluster_hash,
-            'usage_intervals': pickle.loads(usage_intervals),
+            'name': row.name,
+            'launched_at': _get_cluster_launch_time(row.cluster_hash),
+            'duration': _get_cluster_duration(row.cluster_hash),
+            'num_nodes': row.num_nodes,
+            'resources': pickle.loads(row.launched_resources),
+            'cluster_hash': row.cluster_hash,
+            'usage_intervals': pickle.loads(row.usage_intervals),
             'status': status,
             'user_hash': user_hash,
         }
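`get_clusters_from_history`, shown above, replaces the raw `LEFT OUTER JOIN ... ON ch.cluster_hash=clusters.cluster_hash` SQL with `cluster_history_table.join(cluster_table, ..., isouter=True)`, so history rows survive even when the live cluster row is gone. A small self-contained demo with made-up tables:

```python
# LEFT OUTER JOIN via Table.join(..., isouter=True): history rows without a
# matching live row come back with the live-side columns set to None.
import sqlalchemy
from sqlalchemy import orm

engine = sqlalchemy.create_engine('sqlite:///:memory:')
metadata = sqlalchemy.MetaData()
history = sqlalchemy.Table(
    'cluster_history', metadata,
    sqlalchemy.Column('cluster_hash', sqlalchemy.Text, primary_key=True),
    sqlalchemy.Column('name', sqlalchemy.Text))
live = sqlalchemy.Table(
    'clusters', metadata,
    sqlalchemy.Column('cluster_hash', sqlalchemy.Text, primary_key=True),
    sqlalchemy.Column('status', sqlalchemy.Text))
metadata.create_all(engine)

with orm.Session(engine) as session:
    session.execute(history.insert(), [
        {'cluster_hash': 'h1', 'name': 'alive'},
        {'cluster_hash': 'h2', 'name': 'long-gone'},
    ])
    session.execute(live.insert(), [{'cluster_hash': 'h1', 'status': 'UP'}])
    rows = session.query(
        history.join(live,
                     history.c.cluster_hash == live.c.cluster_hash,
                     isouter=True)).all()
    # 'alive' pairs with its live row; 'long-gone' survives with status None.
    print([(row.name, row.status) for row in rows])
```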
@@ -799,29 +814,29 @@ def get_clusters_from_history() -> List[Dict[str, Any]]:
 def get_cluster_names_start_with(starts_with: str) -> List[str]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(cluster_table).filter(
+            cluster_table.c.name.like(f'{starts_with}%')).all()
+        return [row.name for row in rows]
 
-def get_cached_enabled_clouds(
-        cloud_capability: 'cloud.CloudCapability') -> List['clouds.Cloud']:
 
+def get_cached_enabled_clouds(cloud_capability: 'cloud.CloudCapability',
+                              workspace: str) -> List['clouds.Cloud']:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(config_table).filter_by(
+            key=_get_enabled_clouds_key(cloud_capability, workspace)).first()
     ret = []
-        ret = json.loads(value)
-        break
+    if row:
+        ret = json.loads(row.value)
     enabled_clouds: List['clouds.Cloud'] = []
     for c in ret:
         try:
             cloud = registry.CLOUD_REGISTRY.from_str(c)
         except ValueError:
-            # Handle the case for the clouds whose support has been
-            # SkyPilot, e.g., 'local' was a cloud in the past
-            # in the database for users before #3037.
-            # clouds and continue.
+            # Handle the case for the clouds whose support has been
+            # removed from SkyPilot, e.g., 'local' was a cloud in the past
+            # and may be stored in the database for users before #3037.
+            # We should ignore removed clouds and continue.
             continue
         if cloud is not None:
             enabled_clouds.append(cloud)
@@ -829,15 +844,32 @@
 def set_enabled_clouds(enabled_clouds: List[str],
-                       cloud_capability: 'cloud.CloudCapability') -> None:
+                       cloud_capability: 'cloud.CloudCapability',
+                       workspace: str) -> None:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_stmnt = sqlite.insert(config_table).values(
+                key=_get_enabled_clouds_key(cloud_capability, workspace),
+                value=json.dumps(enabled_clouds))
+            do_update_stmt = insert_stmnt.on_conflict_do_update(
+                index_elements=[config_table.c.key],
+                set_={config_table.c.value: json.dumps(enabled_clouds)})
+            session.execute(do_update_stmt)
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            # TODO(syang) support postgres dialect
+            session.rollback()
+            raise ValueError('Unsupported database dialect')
+        else:
+            session.rollback()
+            raise ValueError('Unsupported database dialect')
+        session.commit()
+
+
+def _get_enabled_clouds_key(cloud_capability: 'cloud.CloudCapability',
+                            workspace: str) -> str:
+    return _ENABLED_CLOUDS_KEY_PREFIX + workspace + '_' + cloud_capability.value
 
 
 def add_or_update_storage(storage_name: str,
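With `_get_enabled_clouds_key`, the cached enabled-clouds entry in the `config` table is now namespaced per workspace, which is what lets the new `workspace` parameter thread through `get_cached_enabled_clouds` and `set_enabled_clouds`. A tiny illustration of the key shape — note `'compute'` is an assumed `CloudCapability.value`, used here only for illustration:

```python
# Illustrative only: 'compute' is an assumed CloudCapability.value.
_ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'


def enabled_clouds_key(workspace: str, capability_value: str) -> str:
    return _ENABLED_CLOUDS_KEY_PREFIX + workspace + '_' + capability_value


assert enabled_clouds_key('default', 'compute') == \
    'enabled_clouds_default_compute'
```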
@@ -853,26 +885,48 @@ def add_or_update_storage(storage_name: str,
     if not status_check(storage_status):
         raise ValueError(f'Error in updating global state. Storage Status '
                          f'{storage_status} is passed in incorrectly')
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_stmnt = sqlite.insert(storage_table).values(
+                name=storage_name,
+                handle=handle,
+                last_use=last_use,
+                launched_at=storage_launched_at,
+                status=storage_status.value)
+            do_update_stmt = insert_stmnt.on_conflict_do_update(
+                index_elements=[storage_table.c.name],
+                set_={
+                    storage_table.c.handle: handle,
+                    storage_table.c.last_use: last_use,
+                    storage_table.c.launched_at: storage_launched_at,
+                    storage_table.c.status: storage_status.value
+                })
+            session.execute(do_update_stmt)
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            # TODO(syang) support postgres dialect
+            session.rollback()
+            raise ValueError('Unsupported database dialect')
+        else:
+            session.rollback()
+            raise ValueError('Unsupported database dialect')
+        session.commit()
 
 
 def remove_storage(storage_name: str):
     """Removes Storage from Database"""
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(storage_table).filter_by(name=storage_name).delete()
+        session.commit()
 
 
 def set_storage_status(storage_name: str,
                        status: status_lib.StorageStatus) -> None:
-    count = _DB.cursor.rowcount
-    _DB.conn.commit()
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(storage_table).filter_by(
+            name=storage_name).update({storage_table.c.status: status.value})
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Storage {storage_name} not found.')
@@ -880,21 +934,20 @@ def set_storage_status(storage_name: str,
 def get_storage_status(storage_name: str) -> Optional[status_lib.StorageStatus]:
     assert storage_name is not None, 'storage_name cannot be None'
-        return status_lib.StorageStatus[status]
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(storage_table).filter_by(name=storage_name).first()
+    if row:
+        return status_lib.StorageStatus[row.status]
     return None
 
 
 def set_storage_handle(storage_name: str,
                        handle: 'Storage.StorageMetadata') -> None:
-    _DB.conn.commit()
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(storage_table).filter_by(
+            name=storage_name).update(
+                {storage_table.c.handle: pickle.dumps(handle)})
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Storage{storage_name} not found.')
@@ -904,38 +957,48 @@ def get_handle_from_storage_name(
         storage_name: Optional[str]) -> Optional['Storage.StorageMetadata']:
     if storage_name is None:
         return None
-        return None
-    return pickle.loads(handle)
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(storage_table).filter_by(name=storage_name).first()
+    if row:
+        return pickle.loads(row.handle)
     return None
 
 
 def get_glob_storage_name(storage_name: str) -> List[str]:
     assert storage_name is not None, 'storage_name cannot be None'
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            rows = session.query(storage_table).filter(
+                storage_table.c.name.op('GLOB')(storage_name)).all()
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            # TODO(syang) support postgres dialect
+            # postgres does not support GLOB
+            raise ValueError('Unsupported database dialect')
+        else:
+            raise ValueError('Unsupported database dialect')
+        return [row.name for row in rows]
 
 
 def get_storage_names_start_with(starts_with: str) -> List[str]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(storage_table).filter(
+            storage_table.c.name.like(f'{starts_with}%')).all()
+        return [row.name for row in rows]
 
 
 def get_storage() -> List[Dict[str, Any]]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(storage_table).all()
     records = []
-    for name, launched_at, handle, last_use, status in rows:
+    for row in rows:
         # TODO: use namedtuple instead of dict
         records.append({
-            'name': name,
-            'launched_at': launched_at,
-            'handle': pickle.loads(handle),
-            'last_use': last_use,
-            'status': status_lib.StorageStatus[status],
+            'name': row.name,
+            'launched_at': row.launched_at,
+            'handle': pickle.loads(row.handle),
+            'last_use': row.last_use,
+            'status': status_lib.StorageStatus[row.status],
         })
     return records