skypilot-nightly 1.0.0.dev20251021__py3-none-any.whl → 1.0.0.dev20251023__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/client/cli/command.py +118 -30
- sky/client/cli/table_utils.py +14 -8
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/CJlKj9Z9fXGlQCmH4EpLX/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-165dc0e1553d9822.js +6 -0
- sky/dashboard/out/_next/static/chunks/2755.1ffbda43f960962b.js +26 -0
- sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.27318ad826343ea6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-434b7577d72c879b.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +117 -17
- sky/jobs/client/sdk.py +28 -9
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +7 -3
- sky/jobs/server/server.py +11 -11
- sky/jobs/state.py +307 -55
- sky/jobs/utils.py +281 -166
- sky/schemas/api/responses.py +2 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/serve/server/server.py +7 -7
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +1 -13
- sky/server/requests/executor.py +20 -20
- sky/server/requests/payloads.py +3 -0
- sky/server/requests/requests.py +51 -25
- sky/server/requests/serializers/decoders.py +23 -10
- sky/server/requests/serializers/encoders.py +5 -4
- sky/server/rest.py +35 -1
- sky/server/server.py +34 -34
- sky/setup_files/alembic.ini +4 -0
- sky/skylet/log_lib.py +8 -1
- sky/skylet/services.py +5 -5
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +4 -4
- sky/users/permission.py +4 -0
- sky/utils/db/db_utils.py +32 -3
- sky/utils/db/migration_utils.py +7 -3
- sky/utils/subprocess_utils.py +13 -1
- sky/volumes/server/server.py +3 -3
- sky/workspaces/server.py +6 -6
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/METADATA +36 -35
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/RECORD +84 -83
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-49141c317f3a9020.js +0 -6
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/webpack-66f23594d38c7f16.js +0 -1
- sky/dashboard/out/_next/static/jDc1PlRsl9Cc5FQUMLBu8/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{jDc1PlRsl9Cc5FQUMLBu8 → CJlKj9Z9fXGlQCmH4EpLX}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/top_level.txt +0 -0
sky/skylet/log_lib.py
CHANGED
|
@@ -220,7 +220,14 @@ def run_with_log(
|
|
|
220
220
|
stdin=stdin,
|
|
221
221
|
**kwargs) as proc:
|
|
222
222
|
try:
|
|
223
|
-
|
|
223
|
+
if ctx is not None:
|
|
224
|
+
# When running in a coroutine, use kill_pg if available to avoid
|
|
225
|
+
# the overhead of refreshing the process tree in the daemon.
|
|
226
|
+
subprocess_utils.kill_process_daemon(proc.pid, use_kill_pg=True)
|
|
227
|
+
else:
|
|
228
|
+
# For backward compatibility, do not specify use_kill_pg by
|
|
229
|
+
# default.
|
|
230
|
+
subprocess_utils.kill_process_daemon(proc.pid)
|
|
224
231
|
stdout = ''
|
|
225
232
|
stderr = ''
|
|
226
233
|
stdout_stream_handler = None
|
sky/skylet/services.py
CHANGED
|
@@ -408,17 +408,17 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
|
|
|
408
408
|
) -> managed_jobsv1_pb2.GetJobTableResponse:
|
|
409
409
|
try:
|
|
410
410
|
accessible_workspaces = list(request.accessible_workspaces)
|
|
411
|
-
job_ids = list(request.job_ids.ids)
|
|
411
|
+
job_ids = (list(request.job_ids.ids)
|
|
412
|
+
if request.HasField('job_ids') else None)
|
|
412
413
|
user_hashes: Optional[List[Optional[str]]] = None
|
|
413
|
-
if request.user_hashes:
|
|
414
|
+
if request.HasField('user_hashes'):
|
|
414
415
|
user_hashes = list(request.user_hashes.hashes)
|
|
415
416
|
# For backwards compatibility, we show jobs that do not have a
|
|
416
417
|
# user_hash. TODO: Remove before 0.12.0.
|
|
417
418
|
if request.show_jobs_without_user_hash:
|
|
418
419
|
user_hashes.append(None)
|
|
419
|
-
statuses = list(
|
|
420
|
-
|
|
421
|
-
|
|
420
|
+
statuses = (list(request.statuses.statuses)
|
|
421
|
+
if request.HasField('statuses') else None)
|
|
422
422
|
job_queue = managed_job_utils.get_managed_job_queue(
|
|
423
423
|
skip_finished=request.skip_finished,
|
|
424
424
|
accessible_workspaces=accessible_workspaces,
|
sky/skylet/subprocess_daemon.py
CHANGED
|
@@ -4,11 +4,16 @@ processes of proc_pid.
|
|
|
4
4
|
"""
|
|
5
5
|
import argparse
|
|
6
6
|
import os
|
|
7
|
+
import signal
|
|
7
8
|
import sys
|
|
8
9
|
import time
|
|
10
|
+
from typing import List, Optional
|
|
9
11
|
|
|
10
12
|
import psutil
|
|
11
13
|
|
|
14
|
+
# Environment variable to enable kill_pg in subprocess daemon.
|
|
15
|
+
USE_KILL_PG_ENV_VAR = 'SKYPILOT_SUBPROCESS_DAEMON_KILL_PG'
|
|
16
|
+
|
|
12
17
|
|
|
13
18
|
def daemonize():
|
|
14
19
|
"""Detaches the process from its parent process with double-forking.
|
|
@@ -38,8 +43,74 @@ def daemonize():
|
|
|
38
43
|
# This process is now fully detached from the original parent and terminal
|
|
39
44
|
|
|
40
45
|
|
|
41
|
-
|
|
42
|
-
|
|
46
|
+
def get_pgid_if_leader(pid) -> Optional[int]:
|
|
47
|
+
"""Get the process group ID of the target process if it is the leader."""
|
|
48
|
+
try:
|
|
49
|
+
pgid = os.getpgid(pid)
|
|
50
|
+
# Only use process group if the target process is the leader. This is
|
|
51
|
+
# to avoid killing the entire process group while the target process is
|
|
52
|
+
# just a subprocess in the group.
|
|
53
|
+
if pgid == pid:
|
|
54
|
+
print(f'Process group {pgid} is the leader.')
|
|
55
|
+
return pgid
|
|
56
|
+
return None
|
|
57
|
+
except Exception: # pylint: disable=broad-except
|
|
58
|
+
# Process group is only available in UNIX.
|
|
59
|
+
return None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def kill_process_group(pgid: int) -> bool:
|
|
63
|
+
"""Kill the target process group."""
|
|
64
|
+
try:
|
|
65
|
+
print(f'Terminating process group {pgid}...')
|
|
66
|
+
os.killpg(pgid, signal.SIGTERM)
|
|
67
|
+
except Exception: # pylint: disable=broad-except
|
|
68
|
+
return False
|
|
69
|
+
|
|
70
|
+
# Wait 30s for the process group to exit gracefully.
|
|
71
|
+
time.sleep(30)
|
|
72
|
+
|
|
73
|
+
try:
|
|
74
|
+
print(f'Force killing process group {pgid}...')
|
|
75
|
+
os.killpg(pgid, signal.SIGKILL)
|
|
76
|
+
except Exception: # pylint: disable=broad-except
|
|
77
|
+
pass
|
|
78
|
+
|
|
79
|
+
return True
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def kill_process_tree(process: psutil.Process,
|
|
83
|
+
children: List[psutil.Process]) -> bool:
|
|
84
|
+
"""Kill the process tree of the target process."""
|
|
85
|
+
if process is not None:
|
|
86
|
+
# Kill the target process first to avoid having more children, or fail
|
|
87
|
+
# the process due to the children being defunct.
|
|
88
|
+
children = [process] + children
|
|
89
|
+
|
|
90
|
+
if not children:
|
|
91
|
+
sys.exit()
|
|
92
|
+
|
|
93
|
+
for child in children:
|
|
94
|
+
try:
|
|
95
|
+
child.terminate()
|
|
96
|
+
except psutil.NoSuchProcess:
|
|
97
|
+
continue
|
|
98
|
+
|
|
99
|
+
# Wait 30s for the processes to exit gracefully.
|
|
100
|
+
time.sleep(30)
|
|
101
|
+
|
|
102
|
+
# SIGKILL if they're still running.
|
|
103
|
+
for child in children:
|
|
104
|
+
try:
|
|
105
|
+
child.kill()
|
|
106
|
+
except psutil.NoSuchProcess:
|
|
107
|
+
continue
|
|
108
|
+
|
|
109
|
+
return True
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def main():
|
|
113
|
+
# daemonize()
|
|
43
114
|
parser = argparse.ArgumentParser()
|
|
44
115
|
parser.add_argument('--parent-pid', type=int, required=True)
|
|
45
116
|
parser.add_argument('--proc-pid', type=int, required=True)
|
|
@@ -72,37 +143,40 @@ if __name__ == '__main__':
|
|
|
72
143
|
except (psutil.NoSuchProcess, ValueError):
|
|
73
144
|
pass
|
|
74
145
|
|
|
146
|
+
pgid: Optional[int] = None
|
|
147
|
+
if os.environ.get(USE_KILL_PG_ENV_VAR) == '1':
|
|
148
|
+
# Use kill_pg on UNIX system if allowed to reduce the resource usage.
|
|
149
|
+
# Note that both implementations might leave subprocesses uncancelled:
|
|
150
|
+
# - kill_process_tree(default): a subprocess is able to detach itself
|
|
151
|
+
# from the process tree using the same technique as daemonize(). Also,
|
|
152
|
+
# since we refresh the process tree per second, if the subprocess is
|
|
153
|
+
# launched between the [last_poll, parent_die] interval, the
|
|
154
|
+
# subprocess will not be captured and will not be killed.
|
|
155
|
+
# - kill_process_group: kill_pg will kill all the processes in the group
|
|
156
|
+
# but if a subprocess calls setpgid(0, 0) to detach itself from the
|
|
157
|
+
# process group (usually to daemonize itself), the subprocess will
|
|
158
|
+
# not be killed.
|
|
159
|
+
pgid = get_pgid_if_leader(process.pid)
|
|
160
|
+
|
|
75
161
|
if process is not None and parent_process is not None:
|
|
76
162
|
# Wait for either parent or target process to exit
|
|
77
163
|
while process.is_running() and parent_process.is_running():
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
164
|
+
if pgid is None:
|
|
165
|
+
# Refresh process tree for cleanup if process group is not
|
|
166
|
+
# available.
|
|
167
|
+
try:
|
|
168
|
+
tmp_children = process.children(recursive=True)
|
|
169
|
+
if tmp_children:
|
|
170
|
+
children = tmp_children
|
|
171
|
+
except psutil.NoSuchProcess:
|
|
172
|
+
pass
|
|
84
173
|
time.sleep(1)
|
|
85
174
|
|
|
86
|
-
if
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
175
|
+
if pgid is not None:
|
|
176
|
+
kill_process_group(pgid)
|
|
177
|
+
else:
|
|
178
|
+
kill_process_tree(process, children)
|
|
90
179
|
|
|
91
|
-
if not children:
|
|
92
|
-
sys.exit()
|
|
93
180
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
child.terminate()
|
|
97
|
-
except psutil.NoSuchProcess:
|
|
98
|
-
continue
|
|
99
|
-
|
|
100
|
-
# Wait 30s for the processes to exit gracefully.
|
|
101
|
-
time.sleep(30)
|
|
102
|
-
|
|
103
|
-
# SIGKILL if they're still running.
|
|
104
|
-
for child in children:
|
|
105
|
-
try:
|
|
106
|
-
child.kill()
|
|
107
|
-
except psutil.NoSuchProcess:
|
|
108
|
-
continue
|
|
181
|
+
if __name__ == '__main__':
|
|
182
|
+
main()
|
sky/skypilot_config.py
CHANGED
|
@@ -64,7 +64,6 @@ from sqlalchemy import orm
|
|
|
64
64
|
from sqlalchemy.dialects import postgresql
|
|
65
65
|
from sqlalchemy.dialects import sqlite
|
|
66
66
|
from sqlalchemy.ext import declarative
|
|
67
|
-
from sqlalchemy.pool import NullPool
|
|
68
67
|
|
|
69
68
|
from sky import exceptions
|
|
70
69
|
from sky import sky_logging
|
|
@@ -77,6 +76,7 @@ from sky.utils import schemas
|
|
|
77
76
|
from sky.utils import ux_utils
|
|
78
77
|
from sky.utils import yaml_utils
|
|
79
78
|
from sky.utils.db import db_utils
|
|
79
|
+
from sky.utils.db import migration_utils
|
|
80
80
|
from sky.utils.kubernetes import config_map_utils
|
|
81
81
|
|
|
82
82
|
if typing.TYPE_CHECKING:
|
|
@@ -121,7 +121,8 @@ _PROJECT_CONFIG_PATH = '.sky.yaml'
|
|
|
121
121
|
|
|
122
122
|
API_SERVER_CONFIG_KEY = 'api_server_config'
|
|
123
123
|
|
|
124
|
-
|
|
124
|
+
_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
|
|
125
|
+
_SQLALCHEMY_ENGINE_LOCK = threading.Lock()
|
|
125
126
|
|
|
126
127
|
Base = declarative.declarative_base()
|
|
127
128
|
|
|
@@ -481,7 +482,7 @@ def safe_reload_config() -> None:
|
|
|
481
482
|
reload_config()
|
|
482
483
|
|
|
483
484
|
|
|
484
|
-
def reload_config() -> None:
|
|
485
|
+
def reload_config(init_db: bool = False) -> None:
|
|
485
486
|
internal_config_path = os.environ.get(ENV_VAR_SKYPILOT_CONFIG)
|
|
486
487
|
if internal_config_path is not None:
|
|
487
488
|
# {ENV_VAR_SKYPILOT_CONFIG} is used internally.
|
|
@@ -493,7 +494,7 @@ def reload_config() -> None:
|
|
|
493
494
|
return
|
|
494
495
|
|
|
495
496
|
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
|
496
|
-
_reload_config_as_server()
|
|
497
|
+
_reload_config_as_server(init_db=init_db)
|
|
497
498
|
else:
|
|
498
499
|
_reload_config_as_client()
|
|
499
500
|
|
|
@@ -564,7 +565,43 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
|
|
|
564
565
|
_set_loaded_config_path(config_path)
|
|
565
566
|
|
|
566
567
|
|
|
567
|
-
def
|
|
568
|
+
def _create_table(engine: sqlalchemy.engine.Engine):
|
|
569
|
+
"""Initialize the config database with migrations."""
|
|
570
|
+
migration_utils.safe_alembic_upgrade(
|
|
571
|
+
engine, migration_utils.SKYPILOT_CONFIG_DB_NAME,
|
|
572
|
+
migration_utils.SKYPILOT_CONFIG_VERSION)
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def _initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
576
|
+
"""Initialize and return the config database engine.
|
|
577
|
+
|
|
578
|
+
This function should only be called by the API Server during initialization.
|
|
579
|
+
Client-side code should never call this function.
|
|
580
|
+
"""
|
|
581
|
+
assert os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None, (
|
|
582
|
+
'initialize_and_get_db() can only be called by the API Server')
|
|
583
|
+
|
|
584
|
+
global _SQLALCHEMY_ENGINE
|
|
585
|
+
|
|
586
|
+
if _SQLALCHEMY_ENGINE is not None:
|
|
587
|
+
return _SQLALCHEMY_ENGINE
|
|
588
|
+
|
|
589
|
+
with _SQLALCHEMY_ENGINE_LOCK:
|
|
590
|
+
if _SQLALCHEMY_ENGINE is not None:
|
|
591
|
+
return _SQLALCHEMY_ENGINE
|
|
592
|
+
|
|
593
|
+
# We only store config in the DB when using Postgres,
|
|
594
|
+
# so no need to pass in db_name here.
|
|
595
|
+
engine = db_utils.get_engine(None)
|
|
596
|
+
|
|
597
|
+
# Run migrations if needed
|
|
598
|
+
_create_table(engine)
|
|
599
|
+
|
|
600
|
+
_SQLALCHEMY_ENGINE = engine
|
|
601
|
+
return _SQLALCHEMY_ENGINE
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def _reload_config_as_server(init_db: bool = False) -> None:
|
|
568
605
|
# Reset the global variables, to avoid using stale values.
|
|
569
606
|
_set_loaded_config(config_utils.Config())
|
|
570
607
|
_set_loaded_config_path(None)
|
|
@@ -580,37 +617,24 @@ def _reload_config_as_server() -> None:
|
|
|
580
617
|
raise ValueError(
|
|
581
618
|
'If db config is specified, no other config is allowed')
|
|
582
619
|
logger.debug('retrieving config from database')
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
db_config = config_utils.Config(
|
|
602
|
-
yaml_utils.safe_load(row.value))
|
|
603
|
-
db_config.pop_nested(('db',), None)
|
|
604
|
-
return db_config
|
|
605
|
-
return None
|
|
606
|
-
|
|
607
|
-
db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
|
|
608
|
-
if db_config:
|
|
609
|
-
server_config = overlay_skypilot_config(server_config,
|
|
610
|
-
db_config)
|
|
611
|
-
# Close the engine to avoid connection leaks
|
|
612
|
-
if dispose_engine:
|
|
613
|
-
sqlalchemy_engine.dispose()
|
|
620
|
+
|
|
621
|
+
if init_db:
|
|
622
|
+
_initialize_and_get_db()
|
|
623
|
+
|
|
624
|
+
def _get_config_yaml_from_db(key: str) -> Optional[config_utils.Config]:
|
|
625
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
626
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
627
|
+
row = session.query(config_yaml_table).filter_by(
|
|
628
|
+
key=key).first()
|
|
629
|
+
if row:
|
|
630
|
+
db_config = config_utils.Config(yaml_utils.safe_load(row.value))
|
|
631
|
+
db_config.pop_nested(('db',), None)
|
|
632
|
+
return db_config
|
|
633
|
+
return None
|
|
634
|
+
|
|
635
|
+
db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
|
|
636
|
+
if db_config:
|
|
637
|
+
server_config = overlay_skypilot_config(server_config, db_config)
|
|
614
638
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
615
639
|
logger.debug(f'server config: \n'
|
|
616
640
|
f'{yaml_utils.dump_yaml_str(dict(server_config))}')
|
|
@@ -666,7 +690,7 @@ def loaded_config_path_serialized() -> Optional[str]:
|
|
|
666
690
|
|
|
667
691
|
|
|
668
692
|
# Load on import, synchronization is guaranteed by python interpreter.
|
|
669
|
-
reload_config()
|
|
693
|
+
reload_config(init_db=True)
|
|
670
694
|
|
|
671
695
|
|
|
672
696
|
def loaded() -> bool:
|
|
@@ -880,44 +904,32 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
|
|
|
880
904
|
if new_db_url and new_db_url != existing_db_url:
|
|
881
905
|
raise ValueError('Cannot change db url while server is running')
|
|
882
906
|
if existing_db_url:
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
do_update_stmt = insert_stmnt.on_conflict_do_update(
|
|
910
|
-
index_elements=[config_yaml_table.c.key],
|
|
911
|
-
set_={config_yaml_table.c.value: config_str})
|
|
912
|
-
session.execute(do_update_stmt)
|
|
913
|
-
session.commit()
|
|
914
|
-
|
|
915
|
-
logger.debug('saving api_server config to db')
|
|
916
|
-
_set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
|
|
917
|
-
db_updated = True
|
|
918
|
-
# Close the engine to avoid connection leaks
|
|
919
|
-
if dispose_engine:
|
|
920
|
-
sqlalchemy_engine.dispose()
|
|
907
|
+
|
|
908
|
+
def _set_config_yaml_to_db(key: str, config: config_utils.Config):
|
|
909
|
+
# reload_config(init_db=True) is called when this module is
|
|
910
|
+
# imported, so the database engine must already be initialized.
|
|
911
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
912
|
+
config_str = yaml_utils.dump_yaml_str(dict(config))
|
|
913
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
914
|
+
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
915
|
+
db_utils.SQLAlchemyDialect.SQLITE.value):
|
|
916
|
+
insert_func = sqlite.insert
|
|
917
|
+
elif (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
918
|
+
db_utils.SQLAlchemyDialect.POSTGRESQL.value):
|
|
919
|
+
insert_func = postgresql.insert
|
|
920
|
+
else:
|
|
921
|
+
raise ValueError('Unsupported database dialect')
|
|
922
|
+
insert_stmnt = insert_func(config_yaml_table).values(
|
|
923
|
+
key=key, value=config_str)
|
|
924
|
+
do_update_stmt = insert_stmnt.on_conflict_do_update(
|
|
925
|
+
index_elements=[config_yaml_table.c.key],
|
|
926
|
+
set_={config_yaml_table.c.value: config_str})
|
|
927
|
+
session.execute(do_update_stmt)
|
|
928
|
+
session.commit()
|
|
929
|
+
|
|
930
|
+
logger.debug('saving api_server config to db')
|
|
931
|
+
_set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
|
|
932
|
+
db_updated = True
|
|
921
933
|
|
|
922
934
|
if not db_updated:
|
|
923
935
|
# save to the local file (PVC in Kubernetes, local file otherwise)
|
sky/ssh_node_pools/server.py
CHANGED
|
@@ -99,7 +99,7 @@ async def deploy_ssh_node_pool(request: fastapi.Request,
|
|
|
99
99
|
"""Deploy SSH Node Pool using existing ssh_up functionality."""
|
|
100
100
|
try:
|
|
101
101
|
ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=False)
|
|
102
|
-
executor.
|
|
102
|
+
await executor.schedule_request_async(
|
|
103
103
|
request_id=request.state.request_id,
|
|
104
104
|
request_name='ssh_up',
|
|
105
105
|
request_body=ssh_up_body,
|
|
@@ -124,7 +124,7 @@ async def deploy_ssh_node_pool_general(
|
|
|
124
124
|
ssh_up_body: payloads.SSHUpBody) -> Dict[str, str]:
|
|
125
125
|
"""Deploys all SSH Node Pools."""
|
|
126
126
|
try:
|
|
127
|
-
executor.
|
|
127
|
+
await executor.schedule_request_async(
|
|
128
128
|
request_id=request.state.request_id,
|
|
129
129
|
request_name='ssh_up',
|
|
130
130
|
request_body=ssh_up_body,
|
|
@@ -150,7 +150,7 @@ async def down_ssh_node_pool(request: fastapi.Request,
|
|
|
150
150
|
"""Cleans up a SSH Node Pools."""
|
|
151
151
|
try:
|
|
152
152
|
ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=True)
|
|
153
|
-
executor.
|
|
153
|
+
await executor.schedule_request_async(
|
|
154
154
|
request_id=request.state.request_id,
|
|
155
155
|
request_name='ssh_down',
|
|
156
156
|
request_body=ssh_up_body,
|
|
@@ -178,7 +178,7 @@ async def down_ssh_node_pool_general(
|
|
|
178
178
|
try:
|
|
179
179
|
# Set cleanup=True for down operation
|
|
180
180
|
ssh_up_body.cleanup = True
|
|
181
|
-
executor.
|
|
181
|
+
await executor.schedule_request_async(
|
|
182
182
|
request_id=request.state.request_id,
|
|
183
183
|
request_name='ssh_down',
|
|
184
184
|
request_body=ssh_up_body,
|
sky/users/permission.py
CHANGED
|
@@ -14,6 +14,7 @@ from sky import models
|
|
|
14
14
|
from sky import sky_logging
|
|
15
15
|
from sky.skylet import constants
|
|
16
16
|
from sky.users import rbac
|
|
17
|
+
from sky.utils import annotations
|
|
17
18
|
from sky.utils import common_utils
|
|
18
19
|
from sky.utils.db import db_utils
|
|
19
20
|
|
|
@@ -254,6 +255,9 @@ class PermissionService:
|
|
|
254
255
|
with _policy_lock():
|
|
255
256
|
self._load_policy_no_lock()
|
|
256
257
|
|
|
258
|
+
# Right now, not a lot of users are using multiple workspaces,
|
|
259
|
+
# so 5 should be more than enough.
|
|
260
|
+
@annotations.lru_cache(scope='request', maxsize=5)
|
|
257
261
|
def check_workspace_permission(self, user_id: str,
|
|
258
262
|
workspace_name: str) -> bool:
|
|
259
263
|
"""Check workspace permission.
|
sky/utils/db/db_utils.py
CHANGED
|
@@ -358,6 +358,27 @@ class SQLiteConn(threading.local):
|
|
|
358
358
|
conn = await self._get_async_conn()
|
|
359
359
|
return await conn.execute_fetchall(sql, parameters)
|
|
360
360
|
|
|
361
|
+
async def execute_get_returning_value_async(
|
|
362
|
+
self,
|
|
363
|
+
sql: str,
|
|
364
|
+
parameters: Optional[Iterable[Any]] = None
|
|
365
|
+
) -> Optional[sqlite3.Row]:
|
|
366
|
+
conn = await self._get_async_conn()
|
|
367
|
+
|
|
368
|
+
if parameters is None:
|
|
369
|
+
parameters = []
|
|
370
|
+
|
|
371
|
+
def exec_and_get_returning_value(sql: str,
|
|
372
|
+
parameters: Optional[Iterable[Any]]):
|
|
373
|
+
# pylint: disable=protected-access
|
|
374
|
+
row = conn._conn.execute(sql, parameters).fetchone()
|
|
375
|
+
conn._conn.commit()
|
|
376
|
+
return row
|
|
377
|
+
|
|
378
|
+
# pylint: disable=protected-access
|
|
379
|
+
return await conn._execute(exec_and_get_returning_value, sql,
|
|
380
|
+
parameters)
|
|
381
|
+
|
|
361
382
|
async def close(self):
|
|
362
383
|
if self._async_conn is not None:
|
|
363
384
|
await self._async_conn.close()
|
|
@@ -382,21 +403,28 @@ def get_max_connections():
|
|
|
382
403
|
|
|
383
404
|
@typing.overload
|
|
384
405
|
def get_engine(
|
|
385
|
-
db_name: str,
|
|
406
|
+
db_name: Optional[str],
|
|
386
407
|
async_engine: Literal[False] = False) -> sqlalchemy.engine.Engine:
|
|
387
408
|
...
|
|
388
409
|
|
|
389
410
|
|
|
390
411
|
@typing.overload
|
|
391
|
-
def get_engine(db_name: str,
|
|
412
|
+
def get_engine(db_name: Optional[str],
|
|
392
413
|
async_engine: Literal[True]) -> sqlalchemy_async.AsyncEngine:
|
|
393
414
|
...
|
|
394
415
|
|
|
395
416
|
|
|
396
417
|
def get_engine(
|
|
397
|
-
db_name: str,
|
|
418
|
+
db_name: Optional[str],
|
|
398
419
|
async_engine: bool = False
|
|
399
420
|
) -> Union[sqlalchemy.engine.Engine, sqlalchemy_async.AsyncEngine]:
|
|
421
|
+
"""Get the engine for the given database name.
|
|
422
|
+
|
|
423
|
+
Args:
|
|
424
|
+
db_name: The name of the database. ONLY used for SQLite. On Postgres,
|
|
425
|
+
we use a single database, which we get from the connection string.
|
|
426
|
+
async_engine: Whether to return an async engine.
|
|
427
|
+
"""
|
|
400
428
|
conn_string = None
|
|
401
429
|
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
|
402
430
|
conn_string = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
|
|
@@ -429,6 +457,7 @@ def get_engine(
|
|
|
429
457
|
max_overflow=0))
|
|
430
458
|
engine = _postgres_engine_cache[conn_string]
|
|
431
459
|
else:
|
|
460
|
+
assert db_name is not None, 'db_name must be provided for SQLite'
|
|
432
461
|
db_path = os.path.expanduser(f'~/.sky/{db_name}.db')
|
|
433
462
|
pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
|
|
434
463
|
if async_engine:
|
sky/utils/db/migration_utils.py
CHANGED
|
@@ -19,15 +19,19 @@ DB_INIT_LOCK_TIMEOUT_SECONDS = 10
|
|
|
19
19
|
|
|
20
20
|
GLOBAL_USER_STATE_DB_NAME = 'state_db'
|
|
21
21
|
GLOBAL_USER_STATE_VERSION = '010'
|
|
22
|
-
GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.
|
|
22
|
+
GLOBAL_USER_STATE_LOCK_PATH = f'~/.sky/locks/.{GLOBAL_USER_STATE_DB_NAME}.lock'
|
|
23
23
|
|
|
24
24
|
SPOT_JOBS_DB_NAME = 'spot_jobs_db'
|
|
25
25
|
SPOT_JOBS_VERSION = '003'
|
|
26
|
-
SPOT_JOBS_LOCK_PATH = '~/.sky/locks/.
|
|
26
|
+
SPOT_JOBS_LOCK_PATH = f'~/.sky/locks/.{SPOT_JOBS_DB_NAME}.lock'
|
|
27
27
|
|
|
28
28
|
SERVE_DB_NAME = 'serve_db'
|
|
29
29
|
SERVE_VERSION = '001'
|
|
30
|
-
SERVE_LOCK_PATH = '~/.sky/locks/.
|
|
30
|
+
SERVE_LOCK_PATH = f'~/.sky/locks/.{SERVE_DB_NAME}.lock'
|
|
31
|
+
|
|
32
|
+
SKYPILOT_CONFIG_DB_NAME = 'sky_config_db'
|
|
33
|
+
SKYPILOT_CONFIG_VERSION = '001'
|
|
34
|
+
SKYPILOT_CONFIG_LOCK_PATH = f'~/.sky/locks/.{SKYPILOT_CONFIG_DB_NAME}.lock'
|
|
31
35
|
|
|
32
36
|
|
|
33
37
|
@contextlib.contextmanager
|
sky/utils/subprocess_utils.py
CHANGED
|
@@ -19,6 +19,7 @@ from sky import exceptions
|
|
|
19
19
|
from sky import sky_logging
|
|
20
20
|
from sky.adaptors import common as adaptors_common
|
|
21
21
|
from sky.skylet import log_lib
|
|
22
|
+
from sky.skylet import subprocess_daemon
|
|
22
23
|
from sky.utils import common_utils
|
|
23
24
|
from sky.utils import timeline
|
|
24
25
|
from sky.utils import ux_utils
|
|
@@ -306,11 +307,17 @@ def run_with_retries(
|
|
|
306
307
|
return returncode, stdout, stderr
|
|
307
308
|
|
|
308
309
|
|
|
309
|
-
def kill_process_daemon(process_pid: int) -> None:
|
|
310
|
+
def kill_process_daemon(process_pid: int, use_kill_pg: bool = False) -> None:
|
|
310
311
|
"""Start a daemon as a safety net to kill the process.
|
|
311
312
|
|
|
312
313
|
Args:
|
|
313
314
|
process_pid: The PID of the process to kill.
|
|
315
|
+
use_kill_pg: Whether to use kill process group to kill the process. If
|
|
316
|
+
True, the process will use os.killpg() to kill the target process
|
|
317
|
+
group on UNIX system, which is more efficient than using the daemon
|
|
318
|
+
to refresh the process tree. Note that both
|
|
319
|
+
implementations have corner cases where subprocesses might not be
|
|
320
|
+
killed. Refer to subprocess_daemon.py for more details.
|
|
314
321
|
"""
|
|
315
322
|
# Get initial children list
|
|
316
323
|
try:
|
|
@@ -337,6 +344,10 @@ def kill_process_daemon(process_pid: int) -> None:
|
|
|
337
344
|
','.join(map(str, initial_children)),
|
|
338
345
|
]
|
|
339
346
|
|
|
347
|
+
env = os.environ.copy()
|
|
348
|
+
if use_kill_pg:
|
|
349
|
+
env[subprocess_daemon.USE_KILL_PG_ENV_VAR] = '1'
|
|
350
|
+
|
|
340
351
|
# We do not need to set `start_new_session=True` here, as the
|
|
341
352
|
# daemon script will detach itself from the parent process with
|
|
342
353
|
# fork to avoid being killed by parent process. See the reason we
|
|
@@ -348,6 +359,7 @@ def kill_process_daemon(process_pid: int) -> None:
|
|
|
348
359
|
stderr=subprocess.DEVNULL,
|
|
349
360
|
# Disable input
|
|
350
361
|
stdin=subprocess.DEVNULL,
|
|
362
|
+
env=env,
|
|
351
363
|
)
|
|
352
364
|
|
|
353
365
|
|
sky/volumes/server/server.py
CHANGED
|
@@ -25,7 +25,7 @@ async def volume_list(request: fastapi.Request) -> None:
|
|
|
25
25
|
'env_vars': auth_user.to_env_vars()
|
|
26
26
|
} if auth_user else {}
|
|
27
27
|
request_body = payloads.RequestBody(**auth_user_env_vars_kwargs)
|
|
28
|
-
executor.
|
|
28
|
+
await executor.schedule_request_async(
|
|
29
29
|
request_id=request.state.request_id,
|
|
30
30
|
request_name='volume_list',
|
|
31
31
|
request_body=request_body,
|
|
@@ -38,7 +38,7 @@ async def volume_list(request: fastapi.Request) -> None:
|
|
|
38
38
|
async def volume_delete(request: fastapi.Request,
|
|
39
39
|
volume_delete_body: payloads.VolumeDeleteBody) -> None:
|
|
40
40
|
"""Deletes a volume."""
|
|
41
|
-
executor.
|
|
41
|
+
await executor.schedule_request_async(
|
|
42
42
|
request_id=request.state.request_id,
|
|
43
43
|
request_name='volume_delete',
|
|
44
44
|
request_body=volume_delete_body,
|
|
@@ -112,7 +112,7 @@ async def volume_apply(request: fastapi.Request,
|
|
|
112
112
|
raise fastapi.HTTPException(
|
|
113
113
|
status_code=400,
|
|
114
114
|
detail='Runpod network volume is only supported on Runpod')
|
|
115
|
-
executor.
|
|
115
|
+
await executor.schedule_request_async(
|
|
116
116
|
request_id=request.state.request_id,
|
|
117
117
|
request_name='volume_apply',
|
|
118
118
|
request_body=volume_apply_body,
|