skypilot-nightly 1.0.0.dev20251021__py3-none-any.whl → 1.0.0.dev20251023__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (93) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/client/cli/command.py +118 -30
  4. sky/client/cli/table_utils.py +14 -8
  5. sky/dashboard/out/404.html +1 -1
  6. sky/dashboard/out/_next/static/CJlKj9Z9fXGlQCmH4EpLX/_buildManifest.js +1 -0
  7. sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
  8. sky/dashboard/out/_next/static/chunks/1871-165dc0e1553d9822.js +6 -0
  9. sky/dashboard/out/_next/static/chunks/2755.1ffbda43f960962b.js +26 -0
  10. sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.27318ad826343ea6.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
  13. sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +31 -0
  18. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/webpack-434b7577d72c879b.js +1 -0
  29. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  30. sky/dashboard/out/clusters/[cluster].html +1 -1
  31. sky/dashboard/out/clusters.html +1 -1
  32. sky/dashboard/out/config.html +1 -1
  33. sky/dashboard/out/index.html +1 -1
  34. sky/dashboard/out/infra/[context].html +1 -1
  35. sky/dashboard/out/infra.html +1 -1
  36. sky/dashboard/out/jobs/[job].html +1 -1
  37. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  38. sky/dashboard/out/jobs.html +1 -1
  39. sky/dashboard/out/users.html +1 -1
  40. sky/dashboard/out/volumes.html +1 -1
  41. sky/dashboard/out/workspace/new.html +1 -1
  42. sky/dashboard/out/workspaces/[name].html +1 -1
  43. sky/dashboard/out/workspaces.html +1 -1
  44. sky/global_user_state.py +117 -17
  45. sky/jobs/client/sdk.py +28 -9
  46. sky/jobs/client/sdk_async.py +9 -3
  47. sky/jobs/constants.py +1 -1
  48. sky/jobs/server/core.py +7 -3
  49. sky/jobs/server/server.py +11 -11
  50. sky/jobs/state.py +307 -55
  51. sky/jobs/utils.py +281 -166
  52. sky/schemas/api/responses.py +2 -0
  53. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  54. sky/serve/server/server.py +7 -7
  55. sky/server/auth/oauth2_proxy.py +2 -5
  56. sky/server/common.py +1 -13
  57. sky/server/requests/executor.py +20 -20
  58. sky/server/requests/payloads.py +3 -0
  59. sky/server/requests/requests.py +51 -25
  60. sky/server/requests/serializers/decoders.py +23 -10
  61. sky/server/requests/serializers/encoders.py +5 -4
  62. sky/server/rest.py +35 -1
  63. sky/server/server.py +34 -34
  64. sky/setup_files/alembic.ini +4 -0
  65. sky/skylet/log_lib.py +8 -1
  66. sky/skylet/services.py +5 -5
  67. sky/skylet/subprocess_daemon.py +103 -29
  68. sky/skypilot_config.py +87 -75
  69. sky/ssh_node_pools/server.py +4 -4
  70. sky/users/permission.py +4 -0
  71. sky/utils/db/db_utils.py +32 -3
  72. sky/utils/db/migration_utils.py +7 -3
  73. sky/utils/subprocess_utils.py +13 -1
  74. sky/volumes/server/server.py +3 -3
  75. sky/workspaces/server.py +6 -6
  76. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/METADATA +36 -35
  77. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/RECORD +84 -83
  78. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  79. sky/dashboard/out/_next/static/chunks/1871-49141c317f3a9020.js +0 -6
  80. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  81. sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
  82. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  83. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  85. sky/dashboard/out/_next/static/chunks/webpack-66f23594d38c7f16.js +0 -1
  86. sky/dashboard/out/_next/static/jDc1PlRsl9Cc5FQUMLBu8/_buildManifest.js +0 -1
  87. /sky/dashboard/out/_next/static/{jDc1PlRsl9Cc5FQUMLBu8 → CJlKj9Z9fXGlQCmH4EpLX}/_ssgManifest.js +0 -0
  88. /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
  89. /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
  90. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/WHEEL +0 -0
  91. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/entry_points.txt +0 -0
  92. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/licenses/LICENSE +0 -0
  93. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/top_level.txt +0 -0
sky/skylet/log_lib.py CHANGED
@@ -220,7 +220,14 @@ def run_with_log(
220
220
  stdin=stdin,
221
221
  **kwargs) as proc:
222
222
  try:
223
- subprocess_utils.kill_process_daemon(proc.pid)
223
+ if ctx is not None:
224
+ # When runs in coroutine, use kill_pg if available to avoid
225
+ # the overhead of refreshing the process tree in the daemon.
226
+ subprocess_utils.kill_process_daemon(proc.pid, use_kill_pg=True)
227
+ else:
228
+ # For backward compatibility, do not specify use_kill_pg by
229
+ # default.
230
+ subprocess_utils.kill_process_daemon(proc.pid)
224
231
  stdout = ''
225
232
  stderr = ''
226
233
  stdout_stream_handler = None
sky/skylet/services.py CHANGED
@@ -408,17 +408,17 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
408
408
  ) -> managed_jobsv1_pb2.GetJobTableResponse:
409
409
  try:
410
410
  accessible_workspaces = list(request.accessible_workspaces)
411
- job_ids = list(request.job_ids.ids) if request.job_ids else None
411
+ job_ids = (list(request.job_ids.ids)
412
+ if request.HasField('job_ids') else None)
412
413
  user_hashes: Optional[List[Optional[str]]] = None
413
- if request.user_hashes:
414
+ if request.HasField('user_hashes'):
414
415
  user_hashes = list(request.user_hashes.hashes)
415
416
  # For backwards compatibility, we show jobs that do not have a
416
417
  # user_hash. TODO: Remove before 0.12.0.
417
418
  if request.show_jobs_without_user_hash:
418
419
  user_hashes.append(None)
419
- statuses = list(
420
- request.statuses.statuses) if request.statuses else None
421
-
420
+ statuses = (list(request.statuses.statuses)
421
+ if request.HasField('statuses') else None)
422
422
  job_queue = managed_job_utils.get_managed_job_queue(
423
423
  skip_finished=request.skip_finished,
424
424
  accessible_workspaces=accessible_workspaces,
@@ -4,11 +4,16 @@ processes of proc_pid.
4
4
  """
5
5
  import argparse
6
6
  import os
7
+ import signal
7
8
  import sys
8
9
  import time
10
+ from typing import List, Optional
9
11
 
10
12
  import psutil
11
13
 
14
+ # Environment variable to enable kill_pg in subprocess daemon.
15
+ USE_KILL_PG_ENV_VAR = 'SKYPILOT_SUBPROCESS_DAEMON_KILL_PG'
16
+
12
17
 
13
18
  def daemonize():
14
19
  """Detaches the process from its parent process with double-forking.
@@ -38,8 +43,74 @@ def daemonize():
38
43
  # This process is now fully detached from the original parent and terminal
39
44
 
40
45
 
41
- if __name__ == '__main__':
42
- daemonize()
46
+ def get_pgid_if_leader(pid) -> Optional[int]:
47
+ """Get the process group ID of the target process if it is the leader."""
48
+ try:
49
+ pgid = os.getpgid(pid)
50
+ # Only use process group if the target process is the leader. This is
51
+ # to avoid killing the entire process group while the target process is
52
+ # just a subprocess in the group.
53
+ if pgid == pid:
54
+ print(f'Process group {pgid} is the leader.')
55
+ return pgid
56
+ return None
57
+ except Exception: # pylint: disable=broad-except
58
+ # Process group is only available in UNIX.
59
+ return None
60
+
61
+
62
+ def kill_process_group(pgid: int) -> bool:
63
+ """Kill the target process group."""
64
+ try:
65
+ print(f'Terminating process group {pgid}...')
66
+ os.killpg(pgid, signal.SIGTERM)
67
+ except Exception: # pylint: disable=broad-except
68
+ return False
69
+
70
+ # Wait 30s for the process group to exit gracefully.
71
+ time.sleep(30)
72
+
73
+ try:
74
+ print(f'Force killing process group {pgid}...')
75
+ os.killpg(pgid, signal.SIGKILL)
76
+ except Exception: # pylint: disable=broad-except
77
+ pass
78
+
79
+ return True
80
+
81
+
82
+ def kill_process_tree(process: psutil.Process,
83
+ children: List[psutil.Process]) -> bool:
84
+ """Kill the process tree of the target process."""
85
+ if process is not None:
86
+ # Kill the target process first to avoid having more children, or fail
87
+ # the process due to the children being defunct.
88
+ children = [process] + children
89
+
90
+ if not children:
91
+ sys.exit()
92
+
93
+ for child in children:
94
+ try:
95
+ child.terminate()
96
+ except psutil.NoSuchProcess:
97
+ continue
98
+
99
+ # Wait 30s for the processes to exit gracefully.
100
+ time.sleep(30)
101
+
102
+ # SIGKILL if they're still running.
103
+ for child in children:
104
+ try:
105
+ child.kill()
106
+ except psutil.NoSuchProcess:
107
+ continue
108
+
109
+ return True
110
+
111
+
112
+ def main():
113
+ # daemonize()
43
114
  parser = argparse.ArgumentParser()
44
115
  parser.add_argument('--parent-pid', type=int, required=True)
45
116
  parser.add_argument('--proc-pid', type=int, required=True)
@@ -72,37 +143,40 @@ if __name__ == '__main__':
72
143
  except (psutil.NoSuchProcess, ValueError):
73
144
  pass
74
145
 
146
+ pgid: Optional[int] = None
147
+ if os.environ.get(USE_KILL_PG_ENV_VAR) == '1':
148
+ # Use kill_pg on UNIX system if allowed to reduce the resource usage.
149
+ # Note that both implementations might leave subprocesses uncancelled:
150
+ # - kill_process_tree(default): a subprocess is able to detach itself
151
+ # from the process tree using the same technique as daemonize(). Also,
152
+ # since we refresh the process tree per second, if the subprocess is
153
+ # launched between the [last_poll, parent_die] interval, the
154
+ # subprocess will not be captured and will not be killed.
155
+ # - kill_process_group: kill_pg will kill all the processes in the group
156
+ # but if a subprocess calls setpgid(0, 0) to detach itself from the
157
+ # process group (usually to daemonize itself), the subprocess will
158
+ # not be killed.
159
+ pgid = get_pgid_if_leader(process.pid)
160
+
75
161
  if process is not None and parent_process is not None:
76
162
  # Wait for either parent or target process to exit
77
163
  while process.is_running() and parent_process.is_running():
78
- try:
79
- tmp_children = process.children(recursive=True)
80
- if tmp_children:
81
- children = tmp_children
82
- except psutil.NoSuchProcess:
83
- pass
164
+ if pgid is None:
165
+ # Refresh process tree for cleanup if process group is not
166
+ # available.
167
+ try:
168
+ tmp_children = process.children(recursive=True)
169
+ if tmp_children:
170
+ children = tmp_children
171
+ except psutil.NoSuchProcess:
172
+ pass
84
173
  time.sleep(1)
85
174
 
86
- if process is not None:
87
- # Kill the target process first to avoid having more children, or fail
88
- # the process due to the children being defunct.
89
- children = [process] + children
175
+ if pgid is not None:
176
+ kill_process_group(pgid)
177
+ else:
178
+ kill_process_tree(process, children)
90
179
 
91
- if not children:
92
- sys.exit()
93
180
 
94
- for child in children:
95
- try:
96
- child.terminate()
97
- except psutil.NoSuchProcess:
98
- continue
99
-
100
- # Wait 30s for the processes to exit gracefully.
101
- time.sleep(30)
102
-
103
- # SIGKILL if they're still running.
104
- for child in children:
105
- try:
106
- child.kill()
107
- except psutil.NoSuchProcess:
108
- continue
181
+ if __name__ == '__main__':
182
+ main()
sky/skypilot_config.py CHANGED
@@ -64,7 +64,6 @@ from sqlalchemy import orm
64
64
  from sqlalchemy.dialects import postgresql
65
65
  from sqlalchemy.dialects import sqlite
66
66
  from sqlalchemy.ext import declarative
67
- from sqlalchemy.pool import NullPool
68
67
 
69
68
  from sky import exceptions
70
69
  from sky import sky_logging
@@ -77,6 +76,7 @@ from sky.utils import schemas
77
76
  from sky.utils import ux_utils
78
77
  from sky.utils import yaml_utils
79
78
  from sky.utils.db import db_utils
79
+ from sky.utils.db import migration_utils
80
80
  from sky.utils.kubernetes import config_map_utils
81
81
 
82
82
  if typing.TYPE_CHECKING:
@@ -121,7 +121,8 @@ _PROJECT_CONFIG_PATH = '.sky.yaml'
121
121
 
122
122
  API_SERVER_CONFIG_KEY = 'api_server_config'
123
123
 
124
- _DB_USE_LOCK = threading.Lock()
124
+ _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
125
+ _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
125
126
 
126
127
  Base = declarative.declarative_base()
127
128
 
@@ -481,7 +482,7 @@ def safe_reload_config() -> None:
481
482
  reload_config()
482
483
 
483
484
 
484
- def reload_config() -> None:
485
+ def reload_config(init_db: bool = False) -> None:
485
486
  internal_config_path = os.environ.get(ENV_VAR_SKYPILOT_CONFIG)
486
487
  if internal_config_path is not None:
487
488
  # {ENV_VAR_SKYPILOT_CONFIG} is used internally.
@@ -493,7 +494,7 @@ def reload_config() -> None:
493
494
  return
494
495
 
495
496
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
496
- _reload_config_as_server()
497
+ _reload_config_as_server(init_db=init_db)
497
498
  else:
498
499
  _reload_config_as_client()
499
500
 
@@ -564,7 +565,43 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
564
565
  _set_loaded_config_path(config_path)
565
566
 
566
567
 
567
- def _reload_config_as_server() -> None:
568
+ def _create_table(engine: sqlalchemy.engine.Engine):
569
+ """Initialize the config database with migrations."""
570
+ migration_utils.safe_alembic_upgrade(
571
+ engine, migration_utils.SKYPILOT_CONFIG_DB_NAME,
572
+ migration_utils.SKYPILOT_CONFIG_VERSION)
573
+
574
+
575
+ def _initialize_and_get_db() -> sqlalchemy.engine.Engine:
576
+ """Initialize and return the config database engine.
577
+
578
+ This function should only be called by the API Server during initialization.
579
+ Client-side code should never call this function.
580
+ """
581
+ assert os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None, (
582
+ 'initialize_and_get_db() can only be called by the API Server')
583
+
584
+ global _SQLALCHEMY_ENGINE
585
+
586
+ if _SQLALCHEMY_ENGINE is not None:
587
+ return _SQLALCHEMY_ENGINE
588
+
589
+ with _SQLALCHEMY_ENGINE_LOCK:
590
+ if _SQLALCHEMY_ENGINE is not None:
591
+ return _SQLALCHEMY_ENGINE
592
+
593
+ # We only store config in the DB when using Postgres,
594
+ # so no need to pass in db_name here.
595
+ engine = db_utils.get_engine(None)
596
+
597
+ # Run migrations if needed
598
+ _create_table(engine)
599
+
600
+ _SQLALCHEMY_ENGINE = engine
601
+ return _SQLALCHEMY_ENGINE
602
+
603
+
604
+ def _reload_config_as_server(init_db: bool = False) -> None:
568
605
  # Reset the global variables, to avoid using stale values.
569
606
  _set_loaded_config(config_utils.Config())
570
607
  _set_loaded_config_path(None)
@@ -580,37 +617,24 @@ def _reload_config_as_server() -> None:
580
617
  raise ValueError(
581
618
  'If db config is specified, no other config is allowed')
582
619
  logger.debug('retrieving config from database')
583
- with _DB_USE_LOCK:
584
- dispose_engine = False
585
- if db_utils.get_max_connections() == 0:
586
- dispose_engine = True
587
- sqlalchemy_engine = sqlalchemy.create_engine(db_url,
588
- poolclass=NullPool)
589
- else:
590
- sqlalchemy_engine = db_utils.get_engine('config')
591
- db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata,
592
- sqlalchemy_engine)
593
-
594
- def _get_config_yaml_from_db(
595
- key: str) -> Optional[config_utils.Config]:
596
- assert sqlalchemy_engine is not None
597
- with orm.Session(sqlalchemy_engine) as session:
598
- row = session.query(config_yaml_table).filter_by(
599
- key=key).first()
600
- if row:
601
- db_config = config_utils.Config(
602
- yaml_utils.safe_load(row.value))
603
- db_config.pop_nested(('db',), None)
604
- return db_config
605
- return None
606
-
607
- db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
608
- if db_config:
609
- server_config = overlay_skypilot_config(server_config,
610
- db_config)
611
- # Close the engine to avoid connection leaks
612
- if dispose_engine:
613
- sqlalchemy_engine.dispose()
620
+
621
+ if init_db:
622
+ _initialize_and_get_db()
623
+
624
+ def _get_config_yaml_from_db(key: str) -> Optional[config_utils.Config]:
625
+ assert _SQLALCHEMY_ENGINE is not None
626
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
627
+ row = session.query(config_yaml_table).filter_by(
628
+ key=key).first()
629
+ if row:
630
+ db_config = config_utils.Config(yaml_utils.safe_load(row.value))
631
+ db_config.pop_nested(('db',), None)
632
+ return db_config
633
+ return None
634
+
635
+ db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
636
+ if db_config:
637
+ server_config = overlay_skypilot_config(server_config, db_config)
614
638
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
615
639
  logger.debug(f'server config: \n'
616
640
  f'{yaml_utils.dump_yaml_str(dict(server_config))}')
@@ -666,7 +690,7 @@ def loaded_config_path_serialized() -> Optional[str]:
666
690
 
667
691
 
668
692
  # Load on import, synchronization is guaranteed by python interpreter.
669
- reload_config()
693
+ reload_config(init_db=True)
670
694
 
671
695
 
672
696
  def loaded() -> bool:
@@ -880,44 +904,32 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
880
904
  if new_db_url and new_db_url != existing_db_url:
881
905
  raise ValueError('Cannot change db url while server is running')
882
906
  if existing_db_url:
883
- with _DB_USE_LOCK:
884
- dispose_engine = False
885
- if db_utils.get_max_connections() == 0:
886
- dispose_engine = True
887
- sqlalchemy_engine = sqlalchemy.create_engine(
888
- existing_db_url, poolclass=NullPool)
889
- else:
890
- sqlalchemy_engine = db_utils.get_engine('config')
891
- db_utils.add_all_tables_to_db_sqlalchemy(
892
- Base.metadata, sqlalchemy_engine)
893
-
894
- def _set_config_yaml_to_db(key: str,
895
- config: config_utils.Config):
896
- assert sqlalchemy_engine is not None
897
- config_str = yaml_utils.dump_yaml_str(dict(config))
898
- with orm.Session(sqlalchemy_engine) as session:
899
- if (sqlalchemy_engine.dialect.name ==
900
- db_utils.SQLAlchemyDialect.SQLITE.value):
901
- insert_func = sqlite.insert
902
- elif (sqlalchemy_engine.dialect.name ==
903
- db_utils.SQLAlchemyDialect.POSTGRESQL.value):
904
- insert_func = postgresql.insert
905
- else:
906
- raise ValueError('Unsupported database dialect')
907
- insert_stmnt = insert_func(config_yaml_table).values(
908
- key=key, value=config_str)
909
- do_update_stmt = insert_stmnt.on_conflict_do_update(
910
- index_elements=[config_yaml_table.c.key],
911
- set_={config_yaml_table.c.value: config_str})
912
- session.execute(do_update_stmt)
913
- session.commit()
914
-
915
- logger.debug('saving api_server config to db')
916
- _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
917
- db_updated = True
918
- # Close the engine to avoid connection leaks
919
- if dispose_engine:
920
- sqlalchemy_engine.dispose()
907
+
908
+ def _set_config_yaml_to_db(key: str, config: config_utils.Config):
909
+ # reload_config(init_db=True) is called when this module is
910
+ # imported, so the database engine must already be initialized.
911
+ assert _SQLALCHEMY_ENGINE is not None
912
+ config_str = yaml_utils.dump_yaml_str(dict(config))
913
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
914
+ if (_SQLALCHEMY_ENGINE.dialect.name ==
915
+ db_utils.SQLAlchemyDialect.SQLITE.value):
916
+ insert_func = sqlite.insert
917
+ elif (_SQLALCHEMY_ENGINE.dialect.name ==
918
+ db_utils.SQLAlchemyDialect.POSTGRESQL.value):
919
+ insert_func = postgresql.insert
920
+ else:
921
+ raise ValueError('Unsupported database dialect')
922
+ insert_stmnt = insert_func(config_yaml_table).values(
923
+ key=key, value=config_str)
924
+ do_update_stmt = insert_stmnt.on_conflict_do_update(
925
+ index_elements=[config_yaml_table.c.key],
926
+ set_={config_yaml_table.c.value: config_str})
927
+ session.execute(do_update_stmt)
928
+ session.commit()
929
+
930
+ logger.debug('saving api_server config to db')
931
+ _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
932
+ db_updated = True
921
933
 
922
934
  if not db_updated:
923
935
  # save to the local file (PVC in Kubernetes, local file otherwise)
@@ -99,7 +99,7 @@ async def deploy_ssh_node_pool(request: fastapi.Request,
99
99
  """Deploy SSH Node Pool using existing ssh_up functionality."""
100
100
  try:
101
101
  ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=False)
102
- executor.schedule_request(
102
+ await executor.schedule_request_async(
103
103
  request_id=request.state.request_id,
104
104
  request_name='ssh_up',
105
105
  request_body=ssh_up_body,
@@ -124,7 +124,7 @@ async def deploy_ssh_node_pool_general(
124
124
  ssh_up_body: payloads.SSHUpBody) -> Dict[str, str]:
125
125
  """Deploys all SSH Node Pools."""
126
126
  try:
127
- executor.schedule_request(
127
+ await executor.schedule_request_async(
128
128
  request_id=request.state.request_id,
129
129
  request_name='ssh_up',
130
130
  request_body=ssh_up_body,
@@ -150,7 +150,7 @@ async def down_ssh_node_pool(request: fastapi.Request,
150
150
  """Cleans up a SSH Node Pools."""
151
151
  try:
152
152
  ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=True)
153
- executor.schedule_request(
153
+ await executor.schedule_request_async(
154
154
  request_id=request.state.request_id,
155
155
  request_name='ssh_down',
156
156
  request_body=ssh_up_body,
@@ -178,7 +178,7 @@ async def down_ssh_node_pool_general(
178
178
  try:
179
179
  # Set cleanup=True for down operation
180
180
  ssh_up_body.cleanup = True
181
- executor.schedule_request(
181
+ await executor.schedule_request_async(
182
182
  request_id=request.state.request_id,
183
183
  request_name='ssh_down',
184
184
  request_body=ssh_up_body,
sky/users/permission.py CHANGED
@@ -14,6 +14,7 @@ from sky import models
14
14
  from sky import sky_logging
15
15
  from sky.skylet import constants
16
16
  from sky.users import rbac
17
+ from sky.utils import annotations
17
18
  from sky.utils import common_utils
18
19
  from sky.utils.db import db_utils
19
20
 
@@ -254,6 +255,9 @@ class PermissionService:
254
255
  with _policy_lock():
255
256
  self._load_policy_no_lock()
256
257
 
258
+ # Right now, not a lot of users are using multiple workspaces,
259
+ # so 5 should be more than enough.
260
+ @annotations.lru_cache(scope='request', maxsize=5)
257
261
  def check_workspace_permission(self, user_id: str,
258
262
  workspace_name: str) -> bool:
259
263
  """Check workspace permission.
sky/utils/db/db_utils.py CHANGED
@@ -358,6 +358,27 @@ class SQLiteConn(threading.local):
358
358
  conn = await self._get_async_conn()
359
359
  return await conn.execute_fetchall(sql, parameters)
360
360
 
361
+ async def execute_get_returning_value_async(
362
+ self,
363
+ sql: str,
364
+ parameters: Optional[Iterable[Any]] = None
365
+ ) -> Optional[sqlite3.Row]:
366
+ conn = await self._get_async_conn()
367
+
368
+ if parameters is None:
369
+ parameters = []
370
+
371
+ def exec_and_get_returning_value(sql: str,
372
+ parameters: Optional[Iterable[Any]]):
373
+ # pylint: disable=protected-access
374
+ row = conn._conn.execute(sql, parameters).fetchone()
375
+ conn._conn.commit()
376
+ return row
377
+
378
+ # pylint: disable=protected-access
379
+ return await conn._execute(exec_and_get_returning_value, sql,
380
+ parameters)
381
+
361
382
  async def close(self):
362
383
  if self._async_conn is not None:
363
384
  await self._async_conn.close()
@@ -382,21 +403,28 @@ def get_max_connections():
382
403
 
383
404
  @typing.overload
384
405
  def get_engine(
385
- db_name: str,
406
+ db_name: Optional[str],
386
407
  async_engine: Literal[False] = False) -> sqlalchemy.engine.Engine:
387
408
  ...
388
409
 
389
410
 
390
411
  @typing.overload
391
- def get_engine(db_name: str,
412
+ def get_engine(db_name: Optional[str],
392
413
  async_engine: Literal[True]) -> sqlalchemy_async.AsyncEngine:
393
414
  ...
394
415
 
395
416
 
396
417
  def get_engine(
397
- db_name: str,
418
+ db_name: Optional[str],
398
419
  async_engine: bool = False
399
420
  ) -> Union[sqlalchemy.engine.Engine, sqlalchemy_async.AsyncEngine]:
421
+ """Get the engine for the given database name.
422
+
423
+ Args:
424
+ db_name: The name of the database. ONLY used for SQLite. On Postgres,
425
+ we use a single database, which we get from the connection string.
426
+ async_engine: Whether to return an async engine.
427
+ """
400
428
  conn_string = None
401
429
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
402
430
  conn_string = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
@@ -429,6 +457,7 @@ def get_engine(
429
457
  max_overflow=0))
430
458
  engine = _postgres_engine_cache[conn_string]
431
459
  else:
460
+ assert db_name is not None, 'db_name must be provided for SQLite'
432
461
  db_path = os.path.expanduser(f'~/.sky/{db_name}.db')
433
462
  pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
434
463
  if async_engine:
@@ -19,15 +19,19 @@ DB_INIT_LOCK_TIMEOUT_SECONDS = 10
19
19
 
20
20
  GLOBAL_USER_STATE_DB_NAME = 'state_db'
21
21
  GLOBAL_USER_STATE_VERSION = '010'
22
- GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
22
+ GLOBAL_USER_STATE_LOCK_PATH = f'~/.sky/locks/.{GLOBAL_USER_STATE_DB_NAME}.lock'
23
23
 
24
24
  SPOT_JOBS_DB_NAME = 'spot_jobs_db'
25
25
  SPOT_JOBS_VERSION = '003'
26
- SPOT_JOBS_LOCK_PATH = '~/.sky/locks/.spot_jobs_db.lock'
26
+ SPOT_JOBS_LOCK_PATH = f'~/.sky/locks/.{SPOT_JOBS_DB_NAME}.lock'
27
27
 
28
28
  SERVE_DB_NAME = 'serve_db'
29
29
  SERVE_VERSION = '001'
30
- SERVE_LOCK_PATH = '~/.sky/locks/.serve_db.lock'
30
+ SERVE_LOCK_PATH = f'~/.sky/locks/.{SERVE_DB_NAME}.lock'
31
+
32
+ SKYPILOT_CONFIG_DB_NAME = 'sky_config_db'
33
+ SKYPILOT_CONFIG_VERSION = '001'
34
+ SKYPILOT_CONFIG_LOCK_PATH = f'~/.sky/locks/.{SKYPILOT_CONFIG_DB_NAME}.lock'
31
35
 
32
36
 
33
37
  @contextlib.contextmanager
@@ -19,6 +19,7 @@ from sky import exceptions
19
19
  from sky import sky_logging
20
20
  from sky.adaptors import common as adaptors_common
21
21
  from sky.skylet import log_lib
22
+ from sky.skylet import subprocess_daemon
22
23
  from sky.utils import common_utils
23
24
  from sky.utils import timeline
24
25
  from sky.utils import ux_utils
@@ -306,11 +307,17 @@ def run_with_retries(
306
307
  return returncode, stdout, stderr
307
308
 
308
309
 
309
- def kill_process_daemon(process_pid: int) -> None:
310
+ def kill_process_daemon(process_pid: int, use_kill_pg: bool = False) -> None:
310
311
  """Start a daemon as a safety net to kill the process.
311
312
 
312
313
  Args:
313
314
  process_pid: The PID of the process to kill.
315
+ use_kill_pg: Whether to use kill process group to kill the process. If
316
+ True, the process will use os.killpg() to kill the target process
317
+ group on UNIX system, which is more efficient than using the daemon
318
+ to refresh the process tree in the daemon. Note that both
319
+ implementations have corner cases where subprocesses might not be
320
+ killed. Refer to subprocess_daemon.py for more details.
314
321
  """
315
322
  # Get initial children list
316
323
  try:
@@ -337,6 +344,10 @@ def kill_process_daemon(process_pid: int) -> None:
337
344
  ','.join(map(str, initial_children)),
338
345
  ]
339
346
 
347
+ env = os.environ.copy()
348
+ if use_kill_pg:
349
+ env[subprocess_daemon.USE_KILL_PG_ENV_VAR] = '1'
350
+
340
351
  # We do not need to set `start_new_session=True` here, as the
341
352
  # daemon script will detach itself from the parent process with
342
353
  # fork to avoid being killed by parent process. See the reason we
@@ -348,6 +359,7 @@ def kill_process_daemon(process_pid: int) -> None:
348
359
  stderr=subprocess.DEVNULL,
349
360
  # Disable input
350
361
  stdin=subprocess.DEVNULL,
362
+ env=env,
351
363
  )
352
364
 
353
365
 
@@ -25,7 +25,7 @@ async def volume_list(request: fastapi.Request) -> None:
25
25
  'env_vars': auth_user.to_env_vars()
26
26
  } if auth_user else {}
27
27
  request_body = payloads.RequestBody(**auth_user_env_vars_kwargs)
28
- executor.schedule_request(
28
+ await executor.schedule_request_async(
29
29
  request_id=request.state.request_id,
30
30
  request_name='volume_list',
31
31
  request_body=request_body,
@@ -38,7 +38,7 @@ async def volume_list(request: fastapi.Request) -> None:
38
38
  async def volume_delete(request: fastapi.Request,
39
39
  volume_delete_body: payloads.VolumeDeleteBody) -> None:
40
40
  """Deletes a volume."""
41
- executor.schedule_request(
41
+ await executor.schedule_request_async(
42
42
  request_id=request.state.request_id,
43
43
  request_name='volume_delete',
44
44
  request_body=volume_delete_body,
@@ -112,7 +112,7 @@ async def volume_apply(request: fastapi.Request,
112
112
  raise fastapi.HTTPException(
113
113
  status_code=400,
114
114
  detail='Runpod network volume is only supported on Runpod')
115
- executor.schedule_request(
115
+ await executor.schedule_request_async(
116
116
  request_id=request.state.request_id,
117
117
  request_name='volume_apply',
118
118
  request_body=volume_apply_body,