skypilot-nightly 1.0.0.dev20250708__py3-none-any.whl → 1.0.0.dev20250710__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +6 -4
  3. sky/clouds/kubernetes.py +2 -2
  4. sky/dashboard/out/404.html +1 -1
  5. sky/dashboard/out/_next/static/{O3wBEOmvYEVEqZxAP7Czn → P2Di1JdUlHuKN2lBws4Mr}/_buildManifest.js +1 -1
  6. sky/dashboard/out/_next/static/chunks/8969-13bb52ce3cffa4e3.js +1 -0
  7. sky/dashboard/out/_next/static/chunks/{webpack-9a81ea998672c303.js → webpack-fd62f17bd9ce1fcc.js} +1 -1
  8. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  9. sky/dashboard/out/clusters/[cluster].html +1 -1
  10. sky/dashboard/out/clusters.html +1 -1
  11. sky/dashboard/out/config.html +1 -1
  12. sky/dashboard/out/index.html +1 -1
  13. sky/dashboard/out/infra/[context].html +1 -1
  14. sky/dashboard/out/infra.html +1 -1
  15. sky/dashboard/out/jobs/[job].html +1 -1
  16. sky/dashboard/out/jobs.html +1 -1
  17. sky/dashboard/out/users.html +1 -1
  18. sky/dashboard/out/volumes.html +1 -1
  19. sky/dashboard/out/workspace/new.html +1 -1
  20. sky/dashboard/out/workspaces/[name].html +1 -1
  21. sky/dashboard/out/workspaces.html +1 -1
  22. sky/global_user_state.py +7 -4
  23. sky/jobs/constants.py +1 -1
  24. sky/jobs/controller.py +7 -0
  25. sky/jobs/server/core.py +2 -1
  26. sky/jobs/server/utils.py +81 -0
  27. sky/jobs/state.py +55 -33
  28. sky/jobs/utils.py +34 -3
  29. sky/provision/kubernetes/instance.py +17 -0
  30. sky/provision/kubernetes/utils.py +5 -0
  31. sky/provision/provisioner.py +20 -0
  32. sky/server/metrics.py +2 -3
  33. sky/server/requests/executor.py +2 -5
  34. sky/server/requests/payloads.py +1 -0
  35. sky/server/requests/requests.py +94 -4
  36. sky/server/server.py +19 -5
  37. sky/server/uvicorn.py +4 -1
  38. sky/skylet/constants.py +1 -6
  39. sky/skylet/job_lib.py +30 -8
  40. sky/skypilot_config.py +4 -2
  41. sky/task.py +17 -0
  42. sky/users/permission.py +3 -0
  43. sky/utils/common_utils.py +13 -0
  44. sky/utils/db_utils.py +16 -0
  45. sky/utils/schemas.py +6 -0
  46. sky/utils/ux_utils.py +2 -4
  47. {skypilot_nightly-1.0.0.dev20250708.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/METADATA +1 -1
  48. {skypilot_nightly-1.0.0.dev20250708.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/RECORD +53 -52
  49. sky/dashboard/out/_next/static/chunks/8969-909d53833da080cb.js +0 -1
  50. /sky/dashboard/out/_next/static/{O3wBEOmvYEVEqZxAP7Czn → P2Di1JdUlHuKN2lBws4Mr}/_ssgManifest.js +0 -0
  51. {skypilot_nightly-1.0.0.dev20250708.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/WHEEL +0 -0
  52. {skypilot_nightly-1.0.0.dev20250708.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/entry_points.txt +0 -0
  53. {skypilot_nightly-1.0.0.dev20250708.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/licenses/LICENSE +0 -0
  54. {skypilot_nightly-1.0.0.dev20250708.dist-info → skypilot_nightly-1.0.0.dev20250710.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -77,6 +77,7 @@ spot_table = sqlalchemy.Table(
77
77
  sqlalchemy.Column('task_name', sqlalchemy.Text),
78
78
  sqlalchemy.Column('specs', sqlalchemy.Text),
79
79
  sqlalchemy.Column('local_log_file', sqlalchemy.Text, server_default=None),
80
+ sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
80
81
  )
81
82
 
82
83
  job_info_table = sqlalchemy.Table(
@@ -131,7 +132,7 @@ def create_table():
131
132
  # is not critical and is likely to be enabled by other processes.
132
133
 
133
134
  # Create tables if they don't exist
134
- Base.metadata.create_all(bind=_SQLALCHEMY_ENGINE)
135
+ db_utils.add_tables_to_db_sqlalchemy(Base.metadata, _SQLALCHEMY_ENGINE)
135
136
 
136
137
  # Backward compatibility: add columns that not exist in older databases
137
138
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
@@ -170,6 +171,14 @@ def create_table():
170
171
  sqlalchemy.Text(),
171
172
  default_statement='DEFAULT NULL')
172
173
 
174
+ db_utils.add_column_to_table_sqlalchemy(
175
+ session,
176
+ 'spot',
177
+ 'metadata',
178
+ sqlalchemy.Text(),
179
+ default_statement='DEFAULT \'{}\'',
180
+ value_to_replace_existing_entries='{}')
181
+
173
182
  db_utils.add_column_to_table_sqlalchemy(session, 'job_info',
174
183
  'schedule_state',
175
184
  sqlalchemy.Text())
@@ -219,7 +228,8 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
219
228
  conn_string = skypilot_config.get_nested(('db',), None)
220
229
  if conn_string:
221
230
  logger.debug(f'using db URI from {conn_string}')
222
- _SQLALCHEMY_ENGINE = sqlalchemy.create_engine(conn_string)
231
+ _SQLALCHEMY_ENGINE = sqlalchemy.create_engine(
232
+ conn_string, poolclass=sqlalchemy.NullPool)
223
233
  else:
224
234
  db_path = os.path.expanduser('~/.sky/spot_jobs.db')
225
235
  pathlib.Path(db_path).parents[0].mkdir(parents=True,
@@ -272,6 +282,7 @@ def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
272
282
  'task_name': r['task_name'],
273
283
  'specs': r['specs'],
274
284
  'local_log_file': r['local_log_file'],
285
+ 'metadata': r['metadata'],
275
286
  # columns from job_info table (some may be None for legacy jobs)
276
287
  '_job_info_job_id': r[job_info_table.c.spot_job_id
277
288
  ], # ambiguous, use table.column
@@ -544,20 +555,28 @@ def set_job_info_without_job_id(name: str, workspace: str,
544
555
  if (_SQLALCHEMY_ENGINE.dialect.name ==
545
556
  db_utils.SQLAlchemyDialect.SQLITE.value):
546
557
  result = session.execute(insert_stmt)
558
+ ret = result.lastrowid
547
559
  session.commit()
548
- return result.lastrowid
560
+ return ret
549
561
  elif (_SQLALCHEMY_ENGINE.dialect.name ==
550
562
  db_utils.SQLAlchemyDialect.POSTGRESQL.value):
551
563
  result = session.execute(
552
564
  insert_stmt.returning(job_info_table.c.spot_job_id))
565
+ ret = result.scalar()
553
566
  session.commit()
554
- return result.scalar()
567
+ return ret
555
568
  else:
556
569
  raise ValueError('Unsupported database dialect')
557
570
 
558
571
 
559
572
  @_init_db
560
- def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
573
+ def set_pending(
574
+ job_id: int,
575
+ task_id: int,
576
+ task_name: str,
577
+ resources_str: str,
578
+ metadata: str,
579
+ ):
561
580
  """Set the task to pending state."""
562
581
  assert _SQLALCHEMY_ENGINE is not None
563
582
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
@@ -567,6 +586,7 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
567
586
  task_id=task_id,
568
587
  task_name=task_name,
569
588
  resources=resources_str,
589
+ metadata=metadata,
570
590
  status=ManagedJobStatus.PENDING.value,
571
591
  ))
572
592
  session.commit()
@@ -1192,38 +1212,40 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
1192
1212
  # Note: we will get the user_hash here, but don't try to call
1193
1213
  # global_user_state.get_user() on it. This runs on the controller, which may
1194
1214
  # not have the user info. Prefer to do it on the API server side.
1215
+ query = sqlalchemy.select(spot_table, job_info_table).select_from(
1216
+ spot_table.outerjoin(
1217
+ job_info_table,
1218
+ spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
1219
+ if job_id is not None:
1220
+ query = query.where(spot_table.c.spot_job_id == job_id)
1221
+ query = query.order_by(spot_table.c.spot_job_id.desc(),
1222
+ spot_table.c.task_id.asc())
1223
+ rows = None
1195
1224
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1196
- query = sqlalchemy.select(spot_table, job_info_table).select_from(
1197
- spot_table.outerjoin(
1198
- job_info_table,
1199
- spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
1200
- if job_id is not None:
1201
- query = query.where(spot_table.c.spot_job_id == job_id)
1202
- query = query.order_by(spot_table.c.spot_job_id.desc(),
1203
- spot_table.c.task_id.asc())
1204
1225
  rows = session.execute(query).fetchall()
1205
- jobs = []
1206
- for row in rows:
1207
- job_dict = _get_jobs_dict(row._mapping) # pylint: disable=protected-access
1208
- job_dict['status'] = ManagedJobStatus(job_dict['status'])
1209
- job_dict['schedule_state'] = ManagedJobScheduleState(
1210
- job_dict['schedule_state'])
1211
- if job_dict['job_name'] is None:
1212
- job_dict['job_name'] = job_dict['task_name']
1213
-
1214
- # Add user YAML content for managed jobs.
1215
- yaml_path = job_dict.get('original_user_yaml_path')
1216
- if yaml_path:
1217
- try:
1218
- with open(yaml_path, 'r', encoding='utf-8') as f:
1219
- job_dict['user_yaml'] = f.read()
1220
- except (FileNotFoundError, IOError, OSError):
1221
- job_dict['user_yaml'] = None
1222
- else:
1226
+ jobs = []
1227
+ for row in rows:
1228
+ job_dict = _get_jobs_dict(row._mapping) # pylint: disable=protected-access
1229
+ job_dict['status'] = ManagedJobStatus(job_dict['status'])
1230
+ job_dict['schedule_state'] = ManagedJobScheduleState(
1231
+ job_dict['schedule_state'])
1232
+ if job_dict['job_name'] is None:
1233
+ job_dict['job_name'] = job_dict['task_name']
1234
+ job_dict['metadata'] = json.loads(job_dict['metadata'])
1235
+
1236
+ # Add user YAML content for managed jobs.
1237
+ yaml_path = job_dict.get('original_user_yaml_path')
1238
+ if yaml_path:
1239
+ try:
1240
+ with open(yaml_path, 'r', encoding='utf-8') as f:
1241
+ job_dict['user_yaml'] = f.read()
1242
+ except (FileNotFoundError, IOError, OSError):
1223
1243
  job_dict['user_yaml'] = None
1244
+ else:
1245
+ job_dict['user_yaml'] = None
1224
1246
 
1225
- jobs.append(job_dict)
1226
- return jobs
1247
+ jobs.append(job_dict)
1248
+ return jobs
1227
1249
 
1228
1250
 
1229
1251
  @_init_db
sky/jobs/utils.py CHANGED
@@ -1249,7 +1249,14 @@ def format_job_table(
1249
1249
  ]
1250
1250
  if show_all:
1251
1251
  # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
1252
- columns += ['STARTED', 'INFRA', 'RESOURCES', 'SCHED. STATE', 'DETAILS']
1252
+ columns += [
1253
+ 'STARTED',
1254
+ 'INFRA',
1255
+ 'RESOURCES',
1256
+ 'SCHED. STATE',
1257
+ 'DETAILS',
1258
+ 'GIT_COMMIT',
1259
+ ]
1253
1260
  if tasks_have_k8s_user:
1254
1261
  columns.insert(0, 'USER')
1255
1262
  job_table = log_utils.create_table(columns)
@@ -1362,6 +1369,7 @@ def format_job_table(
1362
1369
  '-',
1363
1370
  job_tasks[0]['schedule_state'],
1364
1371
  generate_details(details, failure_reason),
1372
+ job_tasks[0].get('metadata', {}).get('git_commit', '-'),
1365
1373
  ])
1366
1374
  if tasks_have_k8s_user:
1367
1375
  job_values.insert(0, job_tasks[0].get('user', '-'))
@@ -1427,6 +1435,8 @@ def format_job_table(
1427
1435
  generate_details(task.get('details'),
1428
1436
  task['failure_reason']),
1429
1437
  ])
1438
+
1439
+ values.append(task.get('metadata', {}).get('git_commit', '-'))
1430
1440
  if tasks_have_k8s_user:
1431
1441
  values.insert(0, task.get('user', '-'))
1432
1442
  job_table.add_row(values)
@@ -1511,6 +1521,22 @@ class ManagedJobCodeGen:
1511
1521
  """)
1512
1522
  return cls._build(code)
1513
1523
 
1524
@classmethod
def get_version_and_job_table(cls) -> str:
    """Generate code that reports the controller version and job table.

    The generated snippet first prints a ``controller_version:<value>`` line
    taken from ``SKYLET_VERSION``, then prints the output of
    ``utils.dump_managed_job_queue()``.

    Returns:
        The result of passing the generated snippet to ``cls._build``.
    """
    return cls._build(
        textwrap.dedent("""\
    from sky.skylet import constants as controller_constants

    # Get controller version
    controller_version = controller_constants.SKYLET_VERSION
    print(f"controller_version:{controller_version}", flush=True)

    # Get and print raw job table (load_managed_job_queue can parse this directly)
    job_table = utils.dump_managed_job_queue()
    print(job_table, flush=True)
    """))
1539
+
1514
1540
  @classmethod
1515
1541
  def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
1516
1542
  code = textwrap.dedent(f"""\
@@ -1565,8 +1591,13 @@ class ManagedJobCodeGen:
1565
1591
  resources_str = backend_utils.get_task_resources_str(
1566
1592
  task, is_managed_job=True)
1567
1593
  code += textwrap.dedent(f"""\
1568
- managed_job_state.set_pending({job_id}, {task_id},
1569
- {task.name!r}, {resources_str!r})
1594
+ if managed_job_version < 7:
1595
+ managed_job_state.set_pending({job_id}, {task_id},
1596
+ {task.name!r}, {resources_str!r})
1597
+ else:
1598
+ managed_job_state.set_pending({job_id}, {task_id},
1599
+ {task.name!r}, {resources_str!r},
1600
+ {task.metadata_json!r})
1570
1601
  """)
1571
1602
  return cls._build(code)
1572
1603
 
@@ -825,6 +825,23 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
825
825
  return
826
826
  pod_spec_copy['metadata']['name'] = pod_name
827
827
  pod_spec_copy['metadata']['labels']['component'] = pod_name
828
+
829
+ # We need to keep the following fields in the pod spec to be same for
830
+ # head and worker pods.
831
+ # So that Kueue can merge them into a single PodSet when creating
832
+ # ProvisioningRequest to trigger scale up of the cluster autoscaler,
833
+ # this is especially required for DWS queued provisioning mode in GKE.
834
+ # spec.containers[*].resources.requests
835
+ # spec.initContainers[*].resources.requests
836
+ # spec.resources
837
+ # spec.nodeSelector
838
+ # spec.tolerations
839
+ # spec.affinity
840
+ # resourceClaims
841
+ # Refer to the following links for more details:
842
+ # https://cloud.google.com/kubernetes-engine/docs/how-to/provisioningrequest#define_a_provisioningrequest_object # pylint: disable=line-too-long
843
+ # https://kueue.sigs.k8s.io/docs/admission-check-controllers/provisioning/#podset-merge-policy # pylint: disable=line-too-long
844
+ if config.count > 1:
828
845
  # For multi-node support, we put a soft-constraint to schedule
829
846
  # worker pods on different nodes than the head pod.
830
847
  # This is not set as a hard constraint because if different nodes
@@ -313,6 +313,9 @@ def get_gke_accelerator_name(accelerator: str) -> str:
313
313
  # A100-80GB, L4, H100-80GB and H100-MEGA-80GB
314
314
  # have a different name pattern.
315
315
  return 'nvidia-{}'.format(accelerator.lower())
316
+ elif accelerator == 'H200':
317
+ # H200s on GCP use this label format
318
+ return 'nvidia-h200-141gb'
316
319
  elif accelerator.startswith('tpu-'):
317
320
  return accelerator
318
321
  else:
@@ -482,6 +485,8 @@ class GKELabelFormatter(GPULabelFormatter):
482
485
  # we map H100 ---> H100-80GB and keep H100-MEGA-80GB
483
486
  # to distinguish between a3-high and a3-mega instances
484
487
  return 'H100'
488
+ elif acc == 'H200-141GB':
489
+ return 'H200'
485
490
  return acc
486
491
  elif is_tpu_on_gke(value):
487
492
  return value
@@ -22,11 +22,13 @@ from sky import sky_logging
22
22
  from sky import skypilot_config
23
23
  from sky.adaptors import aws
24
24
  from sky.backends import backend_utils
25
+ from sky.jobs.server import utils as server_jobs_utils
25
26
  from sky.provision import common as provision_common
26
27
  from sky.provision import instance_setup
27
28
  from sky.provision import logging as provision_logging
28
29
  from sky.provision import metadata_utils
29
30
  from sky.skylet import constants
31
+ from sky.utils import common
30
32
  from sky.utils import common_utils
31
33
  from sky.utils import message_utils
32
34
  from sky.utils import resources_utils
@@ -502,6 +504,24 @@ def _post_provision_setup(
502
504
  logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
503
505
  f'Docker container is up.{colorama.Style.RESET_ALL}')
504
506
 
507
+ # Check version compatibility for jobs controller clusters
508
+ if cluster_name.display_name.startswith(common.JOB_CONTROLLER_PREFIX):
509
+ # TODO(zeping): remove this in v0.12.0
510
+ # This only happens in upgrade from <0.9.3 to > 0.10.0
511
+ # After 0.10.0 no incompatibility issue
512
+ # See https://github.com/skypilot-org/skypilot/pull/6096
513
+ # For more details
514
+ status.update(
515
+ ux_utils.spinner_message(
516
+ 'Checking controller version compatibility'))
517
+ try:
518
+ server_jobs_utils.check_version_mismatch_and_non_terminal_jobs()
519
+ except exceptions.ClusterNotUpError:
520
+ # Controller is not up yet during initial provisioning, that
521
+ # also means no non-terminal jobs, so no incompatibility in
522
+ # this case.
523
+ pass
524
+
505
525
  # We mount the metadata with sky wheel for speedup.
506
526
  # NOTE: currently we mount all credentials for all nodes, because
507
527
  # (1) jobs controllers need permission to launch/down nodes of
sky/server/metrics.py CHANGED
@@ -1,6 +1,5 @@
1
1
  """Instrumentation for the API server."""
2
2
 
3
- import asyncio
4
3
  import os
5
4
  import time
6
5
 
@@ -50,7 +49,7 @@ async def metrics() -> fastapi.Response:
50
49
  headers={'Cache-Control': 'no-cache'})
51
50
 
52
51
 
53
- def run_metrics_server(host: str, port: int):
52
+ def build_metrics_server(host: str, port: int) -> uvicorn.Server:
54
53
  metrics_config = uvicorn.Config(
55
54
  'sky.server.metrics:metrics_app',
56
55
  host=host,
@@ -58,7 +57,7 @@ def run_metrics_server(host: str, port: int):
58
57
  workers=1,
59
58
  )
60
59
  metrics_server_instance = uvicorn.Server(metrics_config)
61
- asyncio.run(metrics_server_instance.serve())
60
+ return metrics_server_instance
62
61
 
63
62
 
64
63
  def _get_status_code_group(status_code: int) -> str:
@@ -399,11 +399,8 @@ def _request_execution_wrapper(request_id: str,
399
399
  f'{common_utils.format_exception(e)}')
400
400
  return
401
401
  else:
402
- with api_requests.update_request(request_id) as request_task:
403
- assert request_task is not None, request_id
404
- request_task.status = api_requests.RequestStatus.SUCCEEDED
405
- if not ignore_return_value:
406
- request_task.set_return_value(return_value)
402
+ api_requests.set_request_succeeded(
403
+ request_id, return_value if not ignore_return_value else None)
407
404
  _restore_output(original_stdout, original_stderr)
408
405
  logger.info(f'Request {request_id} finished')
409
406
 
@@ -710,3 +710,4 @@ class RequestPayload(BasePayload):
710
710
  cluster_name: Optional[str] = None
711
711
  status_msg: Optional[str] = None
712
712
  should_retry: bool = False
713
+ finished_at: Optional[float] = None
@@ -1,4 +1,5 @@
1
1
  """Utilities for REST API."""
2
+ import asyncio
2
3
  import contextlib
3
4
  import dataclasses
4
5
  import enum
@@ -20,6 +21,7 @@ import filelock
20
21
  from sky import exceptions
21
22
  from sky import global_user_state
22
23
  from sky import sky_logging
24
+ from sky import skypilot_config
23
25
  from sky.server import common as server_common
24
26
  from sky.server import constants as server_constants
25
27
  from sky.server.requests import payloads
@@ -29,6 +31,7 @@ from sky.utils import common
29
31
  from sky.utils import common_utils
30
32
  from sky.utils import db_utils
31
33
  from sky.utils import env_options
34
+ from sky.utils import subprocess_utils
32
35
  from sky.utils import ux_utils
33
36
 
34
37
  logger = sky_logging.init_logger(__name__)
@@ -39,8 +42,11 @@ COL_CLUSTER_NAME = 'cluster_name'
39
42
  COL_USER_ID = 'user_id'
40
43
  COL_STATUS_MSG = 'status_msg'
41
44
  COL_SHOULD_RETRY = 'should_retry'
45
+ COL_FINISHED_AT = 'finished_at'
42
46
  REQUEST_LOG_PATH_PREFIX = '~/sky_logs/api_server/requests'
43
47
 
48
+ DEFAULT_REQUESTS_RETENTION_HOURS = 24 # 1 day
49
+
44
50
  # TODO(zhwu): For scalability, there are several TODOs:
45
51
  # [x] Have a way to queue requests.
46
52
  # [ ] Move logs to persistent place.
@@ -64,6 +70,10 @@ class RequestStatus(enum.Enum):
64
70
  color = _STATUS_TO_COLOR[self]
65
71
  return f'{color}{self.value}{colorama.Style.RESET_ALL}'
66
72
 
73
@classmethod
def finished_status(cls) -> List['RequestStatus']:
    """Return the statuses that mark a request as finished.

    Returns:
        A new list holding SUCCEEDED, FAILED and CANCELLED, in that order.
    """
    finished = (cls.SUCCEEDED, cls.FAILED, cls.CANCELLED)
    return list(finished)
76
+
67
77
 
68
78
  _STATUS_TO_COLOR = {
69
79
  RequestStatus.PENDING: colorama.Fore.BLUE,
@@ -88,6 +98,7 @@ REQUEST_COLUMNS = [
88
98
  COL_USER_ID,
89
99
  COL_STATUS_MSG,
90
100
  COL_SHOULD_RETRY,
101
+ COL_FINISHED_AT,
91
102
  ]
92
103
 
93
104
 
@@ -120,6 +131,8 @@ class Request:
120
131
  status_msg: Optional[str] = None
121
132
  # Whether the request should be retried.
122
133
  should_retry: bool = False
134
+ # When the request finished.
135
+ finished_at: Optional[float] = None
123
136
 
124
137
  @property
125
138
  def log_path(self) -> pathlib.Path:
@@ -206,6 +219,7 @@ class Request:
206
219
  cluster_name=self.cluster_name,
207
220
  status_msg=self.status_msg,
208
221
  should_retry=self.should_retry,
222
+ finished_at=self.finished_at,
209
223
  )
210
224
 
211
225
  def encode(self) -> payloads.RequestPayload:
@@ -228,6 +242,7 @@ class Request:
228
242
  cluster_name=self.cluster_name,
229
243
  status_msg=self.status_msg,
230
244
  should_retry=self.should_retry,
245
+ finished_at=self.finished_at,
231
246
  )
232
247
  except (TypeError, ValueError) as e:
233
248
  # The error is unexpected, so we don't suppress the stack trace.
@@ -260,6 +275,7 @@ class Request:
260
275
  cluster_name=payload.cluster_name,
261
276
  status_msg=payload.status_msg,
262
277
  should_retry=payload.should_retry,
278
+ finished_at=payload.finished_at,
263
279
  )
264
280
  except (TypeError, ValueError) as e:
265
281
  logger.error(
@@ -439,6 +455,7 @@ def kill_requests(request_ids: Optional[List[str]] = None,
439
455
  # process for each request.
440
456
  os.kill(request_record.pid, signal.SIGTERM)
441
457
  request_record.status = RequestStatus.CANCELLED
458
+ request_record.finished_at = time.time()
442
459
  cancelled_request_ids.append(request_id)
443
460
  return cancelled_request_ids
444
461
 
@@ -474,13 +491,16 @@ def create_table(cursor, conn):
474
491
  schedule_type TEXT,
475
492
  {COL_USER_ID} TEXT,
476
493
  {COL_STATUS_MSG} TEXT,
477
- {COL_SHOULD_RETRY} INTEGER
494
+ {COL_SHOULD_RETRY} INTEGER,
495
+ {COL_FINISHED_AT} REAL
478
496
  )""")
479
497
 
480
498
  db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_STATUS_MSG,
481
499
  'TEXT')
482
500
  db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_SHOULD_RETRY,
483
501
  'INTEGER')
502
+ db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_FINISHED_AT,
503
+ 'REAL')
484
504
 
485
505
 
486
506
  _DB = None
@@ -583,6 +603,7 @@ def get_request_tasks(
583
603
  user_id: Optional[str] = None,
584
604
  exclude_request_names: Optional[List[str]] = None,
585
605
  include_request_names: Optional[List[str]] = None,
606
+ finished_before: Optional[float] = None,
586
607
  ) -> List[Request]:
587
608
  """Get a list of requests that match the given filters.
588
609
 
@@ -595,6 +616,8 @@ def get_request_tasks(
595
616
  If None, all users are included.
596
617
  include_request_names: a list of request names to filter on.
597
618
  Mutually exclusive with exclude_request_names.
619
+ finished_before: if provided, only include requests finished before this
620
+ timestamp.
598
621
 
599
622
  Raises:
600
623
  ValueError: If both exclude_request_names and include_request_names are
@@ -606,7 +629,7 @@ def get_request_tasks(
606
629
  'provided, not both.')
607
630
 
608
631
  filters = []
609
- filter_params = []
632
+ filter_params: List[Any] = []
610
633
  if status is not None:
611
634
  status_list_str = ','.join(repr(status.value) for status in status)
612
635
  filters.append(f'status IN ({status_list_str})')
@@ -624,6 +647,9 @@ def get_request_tasks(
624
647
  request_names_str = ','.join(
625
648
  repr(name) for name in include_request_names)
626
649
  filters.append(f'name IN ({request_names_str})')
650
+ if finished_before is not None:
651
+ filters.append('finished_at < ?')
652
+ filter_params.append(finished_before)
627
653
  assert _DB is not None
628
654
  with _DB.conn:
629
655
  cursor = _DB.conn.cursor()
@@ -665,19 +691,83 @@ def set_request_failed(request_id: str, e: BaseException) -> None:
665
691
  with update_request(request_id) as request_task:
666
692
  assert request_task is not None, request_id
667
693
  request_task.status = RequestStatus.FAILED
694
+ request_task.finished_at = time.time()
668
695
  request_task.set_error(e)
669
696
 
670
697
 
671
def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
    """Mark a request as succeeded, stamp its finish time, store the result.

    Args:
        request_id: ID of the request to update.
        result: Return value to record on the request. ``None`` leaves the
            request's return value untouched.
    """
    with update_request(request_id) as succeeded_task:
        assert succeeded_task is not None, request_id
        succeeded_task.status = RequestStatus.SUCCEEDED
        succeeded_task.finished_at = time.time()
        if result is None:
            return
        succeeded_task.set_return_value(result)
677
706
 
678
707
 
679
708
def set_request_cancelled(request_id: str) -> None:
    """Mark a request as cancelled and stamp its finish time.

    Args:
        request_id: ID of the request to update.
    """
    with update_request(request_id) as cancelled_task:
        assert cancelled_task is not None, request_id
        cancelled_task.finished_at = time.time()
        cancelled_task.status = RequestStatus.CANCELLED
714
+
715
+
716
@init_db
def _delete_requests(requests: List[Request]):
    """Delete the rows of the given requests from the requests table.

    Args:
        requests: Requests whose rows (matched on ``request_id``) are
            removed. An empty list is a no-op.
    """
    if not requests:
        # Nothing to delete; avoid issuing a statement with an empty ID set.
        return
    assert _DB is not None
    # Bind the IDs as parameters rather than splicing repr() output into the
    # SQL text: repr() applies Python quoting rules, not SQL ones. Using one
    # single-parameter statement per row also keeps each statement within the
    # database's bound-variable limit regardless of how many rows are deleted.
    id_params = [(req.request_id,) for req in requests]
    # The connection context manager wraps all deletions in one transaction.
    with _DB.conn:
        cursor = _DB.conn.cursor()
        cursor.executemany(
            f'DELETE FROM {REQUEST_TABLE} WHERE request_id = ?', id_params)
725
+
726
+
727
def clean_finished_requests_with_retention(retention_seconds: int):
    """Clean up finished requests older than the retention period.

    This function removes old finished requests (SUCCEEDED, FAILED, CANCELLED)
    from the database and cleans up their associated log files.

    Args:
        retention_seconds: Requests that finished more than this many seconds
            ago will be deleted.
    """
    # Upper bound on the threads used to unlink log files, so a large backlog
    # does not translate into one thread per request.
    max_unlink_threads = 32

    reqs = get_request_tasks(status=RequestStatus.finished_status(),
                             finished_before=time.time() - retention_seconds)

    if reqs:
        # To avoid leakage of the log file, logs must be deleted before the
        # request task in the database.
        subprocess_utils.run_in_parallel(
            func=lambda req: req.log_path.unlink(missing_ok=True),
            args=reqs,
            num_threads=min(len(reqs), max_unlink_threads))

        _delete_requests(reqs)

    logger.info(f'Cleaned up {len(reqs)} finished requests '
                f'older than {retention_seconds} seconds')
751
+
752
+
753
async def requests_gc_daemon():
    """Garbage collect finished requests periodically.

    Each round reloads the config, reads
    ``api_server.requests_retention_hours`` (falling back to
    ``DEFAULT_REQUESTS_RETENTION_HOURS``) and, unless the value is negative,
    cleans up finished requests older than that. The daemon then sleeps for
    the retention period, but never less than one hour.

    Raises:
        asyncio.CancelledError: Re-raised after logging when the task is
            cancelled, so the task ends in the cancelled state.
    """
    min_interval_seconds = 3600
    try:
        while True:
            logger.info('Running requests GC daemon...')
            # Used when a round fails before the retention is known.
            sleep_seconds = min_interval_seconds
            try:
                # Use the latest config.
                skypilot_config.reload_config()
                retention_seconds = skypilot_config.get_nested(
                    ('api_server', 'requests_retention_hours'),
                    DEFAULT_REQUESTS_RETENTION_HOURS) * 3600
                sleep_seconds = max(retention_seconds, min_interval_seconds)
                # Negative value disables the requests GC
                if retention_seconds >= 0:
                    clean_finished_requests_with_retention(retention_seconds)
            except asyncio.CancelledError:
                # On Python versions where CancelledError derives from
                # Exception, keep the broad handler below from swallowing it.
                raise
            except Exception as e:  # pylint: disable=broad-except
                # A failed round (including a config error) must not kill the
                # daemon; log it and retry after the sleep.
                logger.error(f'Error running requests GC daemon: {e}')
            # Run the daemon at most once every hour to avoid too frequent
            # cleanup. Cancellation is delivered at this await.
            await asyncio.sleep(sleep_seconds)
    except asyncio.CancelledError:
        logger.info('Requests GC daemon cancelled')
        raise
sky/server/server.py CHANGED
@@ -26,6 +26,7 @@ import fastapi
26
26
  from fastapi.middleware import cors
27
27
  from passlib.hash import apr_md5_crypt
28
28
  import starlette.middleware.base
29
+ import uvloop
29
30
 
30
31
  import sky
31
32
  from sky import catalog
@@ -1461,6 +1462,12 @@ async def stream(
1461
1462
  raise fastapi.HTTPException(
1462
1463
  status_code=404, detail=f'Request {request_id!r} not found')
1463
1464
  log_path_to_stream = request_task.log_path
1465
+ if not log_path_to_stream.exists():
1466
+ # The log file might be deleted by the request GC daemon but the
1467
+ # request task is still in the database.
1468
+ raise fastapi.HTTPException(
1469
+ status_code=404,
1470
+ detail=f'Log of request {request_id!r} has been deleted')
1464
1471
  else:
1465
1472
  assert log_path is not None, (request_id, log_path)
1466
1473
  if log_path == constants.API_SERVER_LOGS:
@@ -1775,13 +1782,18 @@ if __name__ == '__main__':
1775
1782
 
1776
1783
  queue_server: Optional[multiprocessing.Process] = None
1777
1784
  workers: List[executor.RequestWorker] = []
1785
+ # Global background tasks that will be scheduled in a separate event loop.
1786
+ global_tasks: List[asyncio.Task] = []
1778
1787
  try:
1788
+ background = uvloop.new_event_loop()
1779
1789
  if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
1780
- metrics_thread = threading.Thread(target=metrics.run_metrics_server,
1781
- args=(cmd_args.host,
1782
- cmd_args.metrics_port),
1783
- daemon=True)
1784
- metrics_thread.start()
1790
+ metrics_server = metrics.build_metrics_server(
1791
+ cmd_args.host, cmd_args.metrics_port)
1792
+ global_tasks.append(background.create_task(metrics_server.serve()))
1793
+ global_tasks.append(
1794
+ background.create_task(requests_lib.requests_gc_daemon()))
1795
+ threading.Thread(target=background.run_forever, daemon=True).start()
1796
+
1785
1797
  queue_server, workers = executor.start(config)
1786
1798
 
1787
1799
  logger.info(f'Starting SkyPilot API server, workers={num_workers}')
@@ -1799,6 +1811,8 @@ if __name__ == '__main__':
1799
1811
  finally:
1800
1812
  logger.info('Shutting down SkyPilot API server...')
1801
1813
 
1814
+ for gt in global_tasks:
1815
+ gt.cancel()
1802
1816
  subprocess_utils.run_in_parallel(lambda worker: worker.cancel(),
1803
1817
  workers,
1804
1818
  num_threads=len(workers))
sky/server/uvicorn.py CHANGED
@@ -150,7 +150,10 @@ class Server(uvicorn.Server):
150
150
  if req is None:
151
151
  return
152
152
  if req.pid is not None:
153
- os.kill(req.pid, signal.SIGTERM)
153
+ try:
154
+ os.kill(req.pid, signal.SIGTERM)
155
+ except ProcessLookupError:
156
+ logger.debug(f'Process {req.pid} already finished.')
154
157
  req.status = requests_lib.RequestStatus.CANCELLED
155
158
  req.should_retry = True
156
159
  logger.info(
sky/skylet/constants.py CHANGED
@@ -89,18 +89,13 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
89
89
  # cluster yaml is updated.
90
90
  #
91
91
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
92
- SKYLET_VERSION = '14'
92
+ SKYLET_VERSION = '15'
93
93
  # The version of the lib files that skylet/jobs use. Whenever there is an API
94
94
  # change for the job_lib or log_lib, we need to bump this version, so that the
95
95
  # user can be notified to update their SkyPilot version on the remote cluster.
96
96
  SKYLET_LIB_VERSION = 3
97
97
  SKYLET_VERSION_FILE = '~/.sky/skylet_version'
98
98
 
99
- # `sky jobs dashboard`-related
100
- #
101
- # Port on the remote jobs controller that the dashboard is running on.
102
- SPOT_DASHBOARD_REMOTE_PORT = 5000
103
-
104
99
  # Docker default options
105
100
  DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
106
101
  DEFAULT_DOCKER_PORT = 10022