skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250617__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- sky/__init__.py +2 -4
- sky/backends/cloud_vm_ray_backend.py +43 -60
- sky/cli.py +55 -637
- sky/client/cli.py +55 -637
- sky/clouds/kubernetes.py +3 -0
- sky/clouds/scp.py +7 -26
- sky/clouds/utils/scp_utils.py +177 -124
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
- sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +98 -31
- sky/jobs/scheduler.py +37 -29
- sky/jobs/server/core.py +36 -3
- sky/jobs/state.py +69 -9
- sky/jobs/utils.py +11 -0
- sky/provision/__init__.py +1 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +528 -0
- sky/resources.py +164 -29
- sky/skylet/constants.py +39 -0
- sky/skylet/job_lib.py +8 -0
- sky/task.py +171 -21
- sky/templates/kubernetes-ray.yml.j2 +51 -4
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/users/permission.py +19 -36
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +16 -14
- sky/utils/context.py +1 -1
- sky/utils/controller_utils.py +12 -3
- sky/utils/dag_utils.py +17 -4
- sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
- sky/utils/schemas.py +43 -5
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/RECORD +54 -57
- sky/benchmark/__init__.py +0 -0
- sky/benchmark/benchmark_state.py +0 -295
- sky/benchmark/benchmark_utils.py +0 -641
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/top_level.txt +0 -0
sky/benchmark/benchmark_state.py
DELETED
@@ -1,295 +0,0 @@
-"""Sky benchmark database, backed by sqlite."""
-import enum
-import functools
-import os
-import pathlib
-import pickle
-import sqlite3
-import threading
-import time
-import typing
-from typing import Any, Dict, List, NamedTuple, Optional, Tuple
-
-if typing.TYPE_CHECKING:
-    from sky.backends import backend as backend_lib
-
-_BENCHMARK_BUCKET_NAME_KEY = 'bucket_name'
-_BENCHMARK_BUCKET_TYPE_KEY = 'bucket_type'
-
-_BENCHMARK_DB_PATH = os.path.expanduser('~/.sky/benchmark.db')
-
-
-class _BenchmarkSQLiteConn(threading.local):
-    """Thread-local connection to the sqlite3 database.
-
-    The database has three types of tables.
-    1. Benchmark table stores the benchmark names and
-       which resources are used for benchmarking.
-    2. Benchmark Config table stores Sky Benchmark configurations
-       (e.g., benchmark bucket name).
-    3. Benchmark Results table stores the benchmark results
-       of the individual clusters used for benchmarking.
-    """
-
-    def __init__(self) -> None:
-        super().__init__()
-        self.conn = sqlite3.connect(_BENCHMARK_DB_PATH)
-        self.cursor = self.conn.cursor()
-        self._create_tables()
-
-    def _create_tables(self) -> None:
-        # Table for Benchmark
-        self.cursor.execute("""\
-            CREATE TABLE IF NOT EXISTS benchmark (
-            name TEXT PRIMARY KEY,
-            task TEXT,
-            bucket TEXT,
-            launched_at INTEGER)""")
-        # Table for Benchmark Config (e.g., benchmark bucket name)
-        self.cursor.execute("""\
-            CREATE TABLE IF NOT EXISTS benchmark_config (
-            key TEXT PRIMARY KEY, value TEXT)""")
-        # Table for Benchmark Results
-        self.cursor.execute("""\
-            CREATE TABLE IF NOT EXISTS benchmark_results (
-            cluster TEXT PRIMARY KEY,
-            status TEXT,
-            num_nodes INTEGER,
-            resources BLOB,
-            record BLOB,
-            benchmark TEXT,
-            FOREIGN KEY (benchmark)
-            REFERENCES benchmark (name)
-                ON DELETE CASCADE
-            )""")
-        self.conn.commit()
-
-
-_BENCHMARK_DB = None
-_benchmark_db_init_lock = threading.Lock()
-
-
-def _init_db(func):
-    """Initialize the database."""
-
-    @functools.wraps(func)
-    def wrapper(*args, **kwargs):
-        global _BENCHMARK_DB
-        if _BENCHMARK_DB:
-            return func(*args, **kwargs)
-        with _benchmark_db_init_lock:
-            if not _BENCHMARK_DB:
-                os.makedirs(pathlib.Path(_BENCHMARK_DB_PATH).parents[0],
-                            exist_ok=True)
-                _BENCHMARK_DB = _BenchmarkSQLiteConn()
-        return func(*args, **kwargs)
-
-    return wrapper
-
-
-class BenchmarkStatus(enum.Enum):
-    """Benchmark job status.
-
-    This is slightly different from the job status maintained by the job queue
-    in the following aspects:
-    1. The INIT state includes both INIT and PENDING states, because
-       the benchmarking job is always the first job of the cluster.
-    2. The TERMINATED state includes the CANCELLED and FAILED states, as we
-       cannot distinguish the two states when the cluster is not alive.
-    3. The SUCCEEDED state is renamed to FINISHED.
-    """
-    # Corresponding job status: INIT, PENDING.
-    INIT = 'INIT'
-
-    # Corresponding job status: RUNNING.
-    RUNNING = 'RUNNING'
-
-    # Job status: CANCELLED, FAILED.
-    # TODO(woosuk): Add KILLED state to distinguish whether the benchmarking
-    # job is killed by the user or by its own error.
-    TERMINATED = 'TERMINATED'
-
-    # Job status: SUCCEEDED.
-    # Jobs terminated with zero exit code.
-    FINISHED = 'FINISHED'
-
-    @classmethod
-    def terminal_statuses(cls) -> List['BenchmarkStatus']:
-        return (cls.TERMINATED, cls.FINISHED)
-
-    def is_terminal(self) -> bool:
-        return self in self.terminal_statuses()
-
-
-class BenchmarkRecord(NamedTuple):
-    """Benchmark record."""
-
-    # The time when the benchmarking job is launched.
-    start_time: Optional[float] = None
-
-    # The last known time. Either the job finish time or the last step time.
-    last_time: Optional[float] = None
-
-    # The number of steps taken so far.
-    num_steps_so_far: Optional[int] = None
-
-    # The average time (in secs) taken per step.
-    seconds_per_step: Optional[float] = None
-
-    # The estimated end-to-end time (in secs) of the benchmarking job.
-    estimated_total_seconds: Optional[float] = None
-
-
-@_init_db
-def add_benchmark(benchmark_name: str, task_name: Optional[str],
-                  bucket_name: str) -> None:
-    """Add a new benchmark."""
-    assert _BENCHMARK_DB is not None
-    launched_at = int(time.time())
-    _BENCHMARK_DB.cursor.execute(
-        'INSERT INTO benchmark'
-        '(name, task, bucket, launched_at) '
-        'VALUES (?, ?, ?, ?)',
-        (benchmark_name, task_name, bucket_name, launched_at))
-    _BENCHMARK_DB.conn.commit()
-
-
-@_init_db
-def add_benchmark_result(benchmark_name: str,
-                         cluster_handle: 'backend_lib.ResourceHandle') -> None:
-    assert _BENCHMARK_DB is not None
-    name = cluster_handle.cluster_name
-    num_nodes = cluster_handle.launched_nodes
-    resources = pickle.dumps(cluster_handle.launched_resources)
-    _BENCHMARK_DB.cursor.execute(
-        'INSERT INTO benchmark_results'
-        '(cluster, status, num_nodes, resources, record, benchmark) '
-        'VALUES (?, ?, ?, ?, NULL, ?)', (name, BenchmarkStatus.INIT.value,
-                                         num_nodes, resources, benchmark_name))
-    _BENCHMARK_DB.conn.commit()
-
-
-@_init_db
-def update_benchmark_result(
-        benchmark_name: str, cluster_name: str,
-        benchmark_status: BenchmarkStatus,
-        benchmark_record: Optional[BenchmarkRecord]) -> None:
-    assert _BENCHMARK_DB is not None
-    _BENCHMARK_DB.cursor.execute(
-        'UPDATE benchmark_results SET '
-        'status=(?), record=(?) WHERE benchmark=(?) AND cluster=(?)',
-        (benchmark_status.value, pickle.dumps(benchmark_record), benchmark_name,
-         cluster_name))
-    _BENCHMARK_DB.conn.commit()
-
-
-@_init_db
-def delete_benchmark(benchmark_name: str) -> None:
-    """Delete a benchmark result."""
-    assert _BENCHMARK_DB is not None
-    _BENCHMARK_DB.cursor.execute(
-        'DELETE FROM benchmark_results WHERE benchmark=(?)', (benchmark_name,))
-    _BENCHMARK_DB.cursor.execute('DELETE FROM benchmark WHERE name=(?)',
-                                 (benchmark_name,))
-    _BENCHMARK_DB.conn.commit()
-
-
-@_init_db
-def get_benchmark_from_name(benchmark_name: str) -> Optional[Dict[str, Any]]:
-    """Get a benchmark from its name."""
-    assert _BENCHMARK_DB is not None
-    rows = _BENCHMARK_DB.cursor.execute(
-        'SELECT * FROM benchmark WHERE name=(?)', (benchmark_name,))
-    for name, task, bucket, launched_at in rows:
-        record = {
-            'name': name,
-            'task': task,
-            'bucket': bucket,
-            'launched_at': launched_at,
-        }
-        return record
-
-
-@_init_db
-def get_benchmarks() -> List[Dict[str, Any]]:
-    """Get all benchmarks."""
-    assert _BENCHMARK_DB is not None
-    rows = _BENCHMARK_DB.cursor.execute('SELECT * FROM benchmark')
-    records = []
-    for name, task, bucket, launched_at in rows:
-        record = {
-            'name': name,
-            'task': task,
-            'bucket': bucket,
-            'launched_at': launched_at,
-        }
-        records.append(record)
-    return records
-
-
-@_init_db
-def set_benchmark_bucket(bucket_name: str, bucket_type: str) -> None:
-    """Save the benchmark bucket name and type."""
-    assert _BENCHMARK_DB is not None
-    _BENCHMARK_DB.cursor.execute(
-        'REPLACE INTO benchmark_config (key, value) VALUES (?, ?)',
-        (_BENCHMARK_BUCKET_NAME_KEY, bucket_name))
-    _BENCHMARK_DB.cursor.execute(
-        'REPLACE INTO benchmark_config (key, value) VALUES (?, ?)',
-        (_BENCHMARK_BUCKET_TYPE_KEY, bucket_type))
-    _BENCHMARK_DB.conn.commit()
-
-
-@_init_db
-def get_benchmark_bucket() -> Tuple[Optional[str], Optional[str]]:
-    """Get the benchmark bucket name and type."""
-    assert _BENCHMARK_DB is not None
-    rows = _BENCHMARK_DB.cursor.execute(
-        'SELECT value FROM benchmark_config WHERE key=(?)',
-        (_BENCHMARK_BUCKET_NAME_KEY,))
-    bucket_name = None
-    for (value,) in rows:
-        bucket_name = value
-        break
-
-    rows = _BENCHMARK_DB.cursor.execute(
-        'SELECT value FROM benchmark_config WHERE key=(?)',
-        (_BENCHMARK_BUCKET_TYPE_KEY,))
-    bucket_type = None
-    for (value,) in rows:
-        bucket_type = value
-        break
-    return bucket_name, bucket_type
-
-
-@_init_db
-def get_benchmark_clusters(benchmark_name: str) -> List[str]:
-    """Get all clusters for a benchmark."""
-    assert _BENCHMARK_DB is not None
-    rows = _BENCHMARK_DB.cursor.execute(
-        'SELECT cluster FROM benchmark_results WHERE benchmark=(?)',
-        (benchmark_name,))
-    return [row[0] for row in rows]
-
-
-@_init_db
-def get_benchmark_results(benchmark_name: str) -> List[Dict[str, Any]]:
-    assert _BENCHMARK_DB is not None
-    rows = _BENCHMARK_DB.cursor.execute(
-        'SELECT * FROM benchmark_results WHERE benchmark=(?)',
-        (benchmark_name,))
-    records = []
-    for (cluster, status, num_nodes, resources, benchmark_record,
-         benchmark) in rows:
-        if benchmark_record is not None:
-            benchmark_record = pickle.loads(benchmark_record)
-        record = {
-            'cluster': cluster,
-            'status': BenchmarkStatus[status],
-            'num_nodes': num_nodes,
-            'resources': pickle.loads(resources),
-            'record': benchmark_record,
-            'benchmark': benchmark,
-        }
-        records.append(record)
-    return records