skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. sky/__init__.py +2 -4
  2. sky/backends/backend_utils.py +7 -0
  3. sky/backends/cloud_vm_ray_backend.py +91 -96
  4. sky/cli.py +5 -6311
  5. sky/client/cli.py +66 -639
  6. sky/client/sdk.py +22 -2
  7. sky/clouds/kubernetes.py +8 -0
  8. sky/clouds/scp.py +7 -26
  9. sky/clouds/utils/scp_utils.py +177 -124
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
  14. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
  16. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
  18. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
  19. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  20. sky/dashboard/out/clusters/[cluster].html +1 -1
  21. sky/dashboard/out/clusters.html +1 -1
  22. sky/dashboard/out/config.html +1 -1
  23. sky/dashboard/out/index.html +1 -1
  24. sky/dashboard/out/infra/[context].html +1 -1
  25. sky/dashboard/out/infra.html +1 -1
  26. sky/dashboard/out/jobs/[job].html +1 -1
  27. sky/dashboard/out/jobs.html +1 -1
  28. sky/dashboard/out/users.html +1 -1
  29. sky/dashboard/out/workspace/new.html +1 -1
  30. sky/dashboard/out/workspaces/[name].html +1 -1
  31. sky/dashboard/out/workspaces.html +1 -1
  32. sky/global_user_state.py +50 -11
  33. sky/jobs/controller.py +98 -31
  34. sky/jobs/scheduler.py +37 -29
  35. sky/jobs/server/core.py +36 -3
  36. sky/jobs/state.py +69 -9
  37. sky/jobs/utils.py +11 -0
  38. sky/logs/__init__.py +17 -0
  39. sky/logs/agent.py +73 -0
  40. sky/logs/gcp.py +91 -0
  41. sky/models.py +1 -0
  42. sky/provision/__init__.py +1 -0
  43. sky/provision/instance_setup.py +35 -0
  44. sky/provision/provisioner.py +11 -0
  45. sky/provision/scp/__init__.py +15 -0
  46. sky/provision/scp/config.py +93 -0
  47. sky/provision/scp/instance.py +528 -0
  48. sky/resources.py +164 -29
  49. sky/server/common.py +21 -9
  50. sky/server/requests/payloads.py +19 -1
  51. sky/server/server.py +121 -29
  52. sky/setup_files/dependencies.py +11 -1
  53. sky/skylet/constants.py +48 -1
  54. sky/skylet/job_lib.py +83 -19
  55. sky/task.py +171 -21
  56. sky/templates/kubernetes-ray.yml.j2 +60 -4
  57. sky/templates/scp-ray.yml.j2 +3 -50
  58. sky/users/permission.py +47 -34
  59. sky/users/rbac.py +10 -1
  60. sky/users/server.py +274 -9
  61. sky/utils/command_runner.py +1 -1
  62. sky/utils/common_utils.py +16 -14
  63. sky/utils/context.py +1 -1
  64. sky/utils/controller_utils.py +12 -3
  65. sky/utils/dag_utils.py +17 -4
  66. sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
  67. sky/utils/schemas.py +83 -5
  68. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
  69. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
  70. sky/benchmark/__init__.py +0 -0
  71. sky/benchmark/benchmark_state.py +0 -295
  72. sky/benchmark/benchmark_utils.py +0 -641
  73. sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
  74. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
  75. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
  76. sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
  77. sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
  78. sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
  79. sky/skylet/providers/scp/__init__.py +0 -2
  80. sky/skylet/providers/scp/config.py +0 -149
  81. sky/skylet/providers/scp/node_provider.py +0 -578
  82. /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
  83. /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
  84. /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
  85. /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
  86. /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
  87. /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
  88. /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
  89. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
  90. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
  91. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
  92. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
@@ -1,295 +0,0 @@
1
- """Sky benchmark database, backed by sqlite."""
2
- import enum
3
- import functools
4
- import os
5
- import pathlib
6
- import pickle
7
- import sqlite3
8
- import threading
9
- import time
10
- import typing
11
- from typing import Any, Dict, List, NamedTuple, Optional, Tuple
12
-
13
- if typing.TYPE_CHECKING:
14
- from sky.backends import backend as backend_lib
15
-
16
- _BENCHMARK_BUCKET_NAME_KEY = 'bucket_name'
17
- _BENCHMARK_BUCKET_TYPE_KEY = 'bucket_type'
18
-
19
- _BENCHMARK_DB_PATH = os.path.expanduser('~/.sky/benchmark.db')
20
-
21
-
22
- class _BenchmarkSQLiteConn(threading.local):
23
- """Thread-local connection to the sqlite3 database.
24
-
25
- The database has three types of tables.
26
- 1. Benchmark table stores the benchmark names and
27
- which resources are used for benchmarking.
28
- 2. Benchmark Config table stores Sky Benchmark configurations
29
- (e.g., benchmark bucket name).
30
- 3. Benchmark Results table stores the benchmark results
31
- of the individual clusters used for benchmarking.
32
- """
33
-
34
- def __init__(self) -> None:
35
- super().__init__()
36
- self.conn = sqlite3.connect(_BENCHMARK_DB_PATH)
37
- self.cursor = self.conn.cursor()
38
- self._create_tables()
39
-
40
- def _create_tables(self) -> None:
41
- # Table for Benchmark
42
- self.cursor.execute("""\
43
- CREATE TABLE IF NOT EXISTS benchmark (
44
- name TEXT PRIMARY KEY,
45
- task TEXT,
46
- bucket TEXT,
47
- launched_at INTEGER)""")
48
- # Table for Benchmark Config (e.g., benchmark bucket name)
49
- self.cursor.execute("""\
50
- CREATE TABLE IF NOT EXISTS benchmark_config (
51
- key TEXT PRIMARY KEY, value TEXT)""")
52
- # Table for Benchmark Results
53
- self.cursor.execute("""\
54
- CREATE TABLE IF NOT EXISTS benchmark_results (
55
- cluster TEXT PRIMARY KEY,
56
- status TEXT,
57
- num_nodes INTEGER,
58
- resources BLOB,
59
- record BLOB,
60
- benchmark TEXT,
61
- FOREIGN KEY (benchmark)
62
- REFERENCES benchmark (name)
63
- ON DELETE CASCADE
64
- )""")
65
- self.conn.commit()
66
-
67
-
68
- _BENCHMARK_DB = None
69
- _benchmark_db_init_lock = threading.Lock()
70
-
71
-
72
- def _init_db(func):
73
- """Initialize the database."""
74
-
75
- @functools.wraps(func)
76
- def wrapper(*args, **kwargs):
77
- global _BENCHMARK_DB
78
- if _BENCHMARK_DB:
79
- return func(*args, **kwargs)
80
- with _benchmark_db_init_lock:
81
- if not _BENCHMARK_DB:
82
- os.makedirs(pathlib.Path(_BENCHMARK_DB_PATH).parents[0],
83
- exist_ok=True)
84
- _BENCHMARK_DB = _BenchmarkSQLiteConn()
85
- return func(*args, **kwargs)
86
-
87
- return wrapper
88
-
89
-
90
- class BenchmarkStatus(enum.Enum):
91
- """Benchmark job status.
92
-
93
- This is slightly different from the job status maintained by the job queue
94
- in the following aspects:
95
- 1. THE INIT state includes both INIT and PENDING states, because
96
- the benchmarking job is always the first job of the cluster.
97
- 2. The TERMINATED state includes the CANCELLED and FAILED states, as we
98
- cannot distinguish the two states when the cluster is not alive.
99
- 3. The SUCCEEDED state is renamed to FINISHED.
100
- """
101
- # Corresponding job status: INIT, PENDING.
102
- INIT = 'INIT'
103
-
104
- # Corresponding job status: RUNNING.
105
- RUNNING = 'RUNNING'
106
-
107
- # Job status: CANCELLED, FAILED.
108
- # TODO(woosuk): Add KILLED state to distinguish whether the benchmarking
109
- # job is killed by the user or by its own error.
110
- TERMINATED = 'TERMINATED'
111
-
112
- # Job status: SUCCEEDED.
113
- # Jobs terminated with zero exit code.
114
- FINISHED = 'FINISHED'
115
-
116
- @classmethod
117
- def terminal_statuses(cls) -> List['BenchmarkStatus']:
118
- return (cls.TERMINATED, cls.FINISHED)
119
-
120
- def is_terminal(self) -> bool:
121
- return self in self.terminal_statuses()
122
-
123
-
124
- class BenchmarkRecord(NamedTuple):
125
- """Benchmark record."""
126
-
127
- # The time when the benchmarking job is launched.
128
- start_time: Optional[float] = None
129
-
130
- # The last known time. Either the job finish time or the last step time.
131
- last_time: Optional[float] = None
132
-
133
- # The number of steps taken so far.
134
- num_steps_so_far: Optional[int] = None
135
-
136
- # The average time (in secs) taken per step.
137
- seconds_per_step: Optional[float] = None
138
-
139
- # The estimated end-to-end time (in secs) of the benchmarking job.
140
- estimated_total_seconds: Optional[float] = None
141
-
142
-
143
- @_init_db
144
- def add_benchmark(benchmark_name: str, task_name: Optional[str],
145
- bucket_name: str) -> None:
146
- """Add a new benchmark."""
147
- assert _BENCHMARK_DB is not None
148
- launched_at = int(time.time())
149
- _BENCHMARK_DB.cursor.execute(
150
- 'INSERT INTO benchmark'
151
- '(name, task, bucket, launched_at) '
152
- 'VALUES (?, ?, ?, ?)',
153
- (benchmark_name, task_name, bucket_name, launched_at))
154
- _BENCHMARK_DB.conn.commit()
155
-
156
-
157
- @_init_db
158
- def add_benchmark_result(benchmark_name: str,
159
- cluster_handle: 'backend_lib.ResourceHandle') -> None:
160
- assert _BENCHMARK_DB is not None
161
- name = cluster_handle.cluster_name
162
- num_nodes = cluster_handle.launched_nodes
163
- resources = pickle.dumps(cluster_handle.launched_resources)
164
- _BENCHMARK_DB.cursor.execute(
165
- 'INSERT INTO benchmark_results'
166
- '(cluster, status, num_nodes, resources, record, benchmark) '
167
- 'VALUES (?, ?, ?, ?, NULL, ?)', (name, BenchmarkStatus.INIT.value,
168
- num_nodes, resources, benchmark_name))
169
- _BENCHMARK_DB.conn.commit()
170
-
171
-
172
- @_init_db
173
- def update_benchmark_result(
174
- benchmark_name: str, cluster_name: str,
175
- benchmark_status: BenchmarkStatus,
176
- benchmark_record: Optional[BenchmarkRecord]) -> None:
177
- assert _BENCHMARK_DB is not None
178
- _BENCHMARK_DB.cursor.execute(
179
- 'UPDATE benchmark_results SET '
180
- 'status=(?), record=(?) WHERE benchmark=(?) AND cluster=(?)',
181
- (benchmark_status.value, pickle.dumps(benchmark_record), benchmark_name,
182
- cluster_name))
183
- _BENCHMARK_DB.conn.commit()
184
-
185
-
186
- @_init_db
187
- def delete_benchmark(benchmark_name: str) -> None:
188
- """Delete a benchmark result."""
189
- assert _BENCHMARK_DB is not None
190
- _BENCHMARK_DB.cursor.execute(
191
- 'DELETE FROM benchmark_results WHERE benchmark=(?)', (benchmark_name,))
192
- _BENCHMARK_DB.cursor.execute('DELETE FROM benchmark WHERE name=(?)',
193
- (benchmark_name,))
194
- _BENCHMARK_DB.conn.commit()
195
-
196
-
197
- @_init_db
198
- def get_benchmark_from_name(benchmark_name: str) -> Optional[Dict[str, Any]]:
199
- """Get a benchmark from its name."""
200
- assert _BENCHMARK_DB is not None
201
- rows = _BENCHMARK_DB.cursor.execute(
202
- 'SELECT * FROM benchmark WHERE name=(?)', (benchmark_name,))
203
- for name, task, bucket, launched_at in rows:
204
- record = {
205
- 'name': name,
206
- 'task': task,
207
- 'bucket': bucket,
208
- 'launched_at': launched_at,
209
- }
210
- return record
211
-
212
-
213
- @_init_db
214
- def get_benchmarks() -> List[Dict[str, Any]]:
215
- """Get all benchmarks."""
216
- assert _BENCHMARK_DB is not None
217
- rows = _BENCHMARK_DB.cursor.execute('SELECT * FROM benchmark')
218
- records = []
219
- for name, task, bucket, launched_at in rows:
220
- record = {
221
- 'name': name,
222
- 'task': task,
223
- 'bucket': bucket,
224
- 'launched_at': launched_at,
225
- }
226
- records.append(record)
227
- return records
228
-
229
-
230
- @_init_db
231
- def set_benchmark_bucket(bucket_name: str, bucket_type: str) -> None:
232
- """Save the benchmark bucket name and type."""
233
- assert _BENCHMARK_DB is not None
234
- _BENCHMARK_DB.cursor.execute(
235
- 'REPLACE INTO benchmark_config (key, value) VALUES (?, ?)',
236
- (_BENCHMARK_BUCKET_NAME_KEY, bucket_name))
237
- _BENCHMARK_DB.cursor.execute(
238
- 'REPLACE INTO benchmark_config (key, value) VALUES (?, ?)',
239
- (_BENCHMARK_BUCKET_TYPE_KEY, bucket_type))
240
- _BENCHMARK_DB.conn.commit()
241
-
242
-
243
- @_init_db
244
- def get_benchmark_bucket() -> Tuple[Optional[str], Optional[str]]:
245
- """Get the benchmark bucket name and type."""
246
- assert _BENCHMARK_DB is not None
247
- rows = _BENCHMARK_DB.cursor.execute(
248
- 'SELECT value FROM benchmark_config WHERE key=(?)',
249
- (_BENCHMARK_BUCKET_NAME_KEY,))
250
- bucket_name = None
251
- for (value,) in rows:
252
- bucket_name = value
253
- break
254
-
255
- rows = _BENCHMARK_DB.cursor.execute(
256
- 'SELECT value FROM benchmark_config WHERE key=(?)',
257
- (_BENCHMARK_BUCKET_TYPE_KEY,))
258
- bucket_type = None
259
- for (value,) in rows:
260
- bucket_type = value
261
- break
262
- return bucket_name, bucket_type
263
-
264
-
265
- @_init_db
266
- def get_benchmark_clusters(benchmark_name: str) -> List[str]:
267
- """Get all clusters for a benchmark."""
268
- assert _BENCHMARK_DB is not None
269
- rows = _BENCHMARK_DB.cursor.execute(
270
- 'SELECT cluster FROM benchmark_results WHERE benchmark=(?)',
271
- (benchmark_name,))
272
- return [row[0] for row in rows]
273
-
274
-
275
- @_init_db
276
- def get_benchmark_results(benchmark_name: str) -> List[Dict[str, Any]]:
277
- assert _BENCHMARK_DB is not None
278
- rows = _BENCHMARK_DB.cursor.execute(
279
- 'SELECT * FROM benchmark_results WHERE benchmark=(?)',
280
- (benchmark_name,))
281
- records = []
282
- for (cluster, status, num_nodes, resources, benchmark_record,
283
- benchmark) in rows:
284
- if benchmark_record is not None:
285
- benchmark_record = pickle.loads(benchmark_record)
286
- record = {
287
- 'cluster': cluster,
288
- 'status': BenchmarkStatus[status],
289
- 'num_nodes': num_nodes,
290
- 'resources': pickle.loads(resources),
291
- 'record': benchmark_record,
292
- 'benchmark': benchmark,
293
- }
294
- records.append(record)
295
- return records