skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl

This diff shows the content of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two published versions.

This version of skypilot-nightly has been flagged as potentially problematic.

Files changed (114)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/backends/backend_utils.py +9 -6
  5. sky/backends/cloud_vm_ray_backend.py +2 -3
  6. sky/check.py +25 -13
  7. sky/client/cli/command.py +52 -24
  8. sky/cloud_stores.py +73 -0
  9. sky/clouds/aws.py +59 -11
  10. sky/core.py +7 -5
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
  13. sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
  15. sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
  28. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  29. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  30. sky/dashboard/out/clusters/[cluster].html +1 -1
  31. sky/dashboard/out/clusters.html +1 -1
  32. sky/dashboard/out/config.html +1 -1
  33. sky/dashboard/out/index.html +1 -1
  34. sky/dashboard/out/infra/[context].html +1 -1
  35. sky/dashboard/out/infra.html +1 -1
  36. sky/dashboard/out/jobs/[job].html +1 -1
  37. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  38. sky/dashboard/out/jobs.html +1 -1
  39. sky/dashboard/out/users.html +1 -1
  40. sky/dashboard/out/volumes.html +1 -1
  41. sky/dashboard/out/workspace/new.html +1 -1
  42. sky/dashboard/out/workspaces/[name].html +1 -1
  43. sky/dashboard/out/workspaces.html +1 -1
  44. sky/data/data_utils.py +92 -1
  45. sky/data/mounting_utils.py +71 -2
  46. sky/data/storage.py +166 -9
  47. sky/global_user_state.py +14 -18
  48. sky/jobs/constants.py +2 -0
  49. sky/jobs/controller.py +62 -67
  50. sky/jobs/file_content_utils.py +80 -0
  51. sky/jobs/log_gc.py +201 -0
  52. sky/jobs/scheduler.py +15 -2
  53. sky/jobs/server/core.py +85 -13
  54. sky/jobs/server/server.py +14 -13
  55. sky/jobs/server/utils.py +28 -10
  56. sky/jobs/state.py +216 -40
  57. sky/jobs/utils.py +65 -28
  58. sky/metrics/utils.py +18 -0
  59. sky/optimizer.py +1 -1
  60. sky/provision/kubernetes/instance.py +88 -19
  61. sky/provision/kubernetes/volume.py +2 -2
  62. sky/schemas/api/responses.py +3 -5
  63. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  64. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  65. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  66. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  67. sky/serve/replica_managers.py +2 -2
  68. sky/serve/serve_utils.py +9 -2
  69. sky/serve/server/server.py +8 -7
  70. sky/server/common.py +21 -15
  71. sky/server/constants.py +1 -1
  72. sky/server/daemons.py +23 -17
  73. sky/server/requests/executor.py +7 -3
  74. sky/server/requests/payloads.py +2 -0
  75. sky/server/requests/request_names.py +80 -0
  76. sky/server/requests/requests.py +137 -102
  77. sky/server/requests/serializers/decoders.py +0 -6
  78. sky/server/requests/serializers/encoders.py +33 -6
  79. sky/server/server.py +105 -36
  80. sky/server/stream_utils.py +56 -13
  81. sky/setup_files/dependencies.py +2 -0
  82. sky/skylet/constants.py +6 -1
  83. sky/skylet/events.py +7 -0
  84. sky/skylet/services.py +18 -7
  85. sky/ssh_node_pools/server.py +5 -4
  86. sky/task.py +14 -42
  87. sky/templates/kubernetes-ray.yml.j2 +1 -1
  88. sky/templates/nebius-ray.yml.j2 +1 -0
  89. sky/templates/websocket_proxy.py +140 -12
  90. sky/users/permission.py +4 -1
  91. sky/utils/cli_utils/status_utils.py +8 -2
  92. sky/utils/context_utils.py +13 -1
  93. sky/utils/db/migration_utils.py +1 -1
  94. sky/utils/resource_checker.py +4 -1
  95. sky/utils/resources_utils.py +53 -29
  96. sky/utils/schemas.py +23 -4
  97. sky/volumes/server/server.py +4 -3
  98. sky/workspaces/server.py +7 -6
  99. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
  100. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
  101. sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
  102. sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
  107. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
  108. sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
  109. /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
  110. /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
  111. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
  112. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
  113. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
  114. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -93,6 +93,7 @@ spot_table = sqlalchemy.Table(
     sqlalchemy.Column('specs', sqlalchemy.Text),
     sqlalchemy.Column('local_log_file', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
+    sqlalchemy.Column('logs_cleaned_at', sqlalchemy.Float, server_default=None),
 )

 job_info_table = sqlalchemy.Table(
@@ -108,6 +109,8 @@ job_info_table = sqlalchemy.Table(
                       server_default=None),
     sqlalchemy.Column('dag_yaml_path', sqlalchemy.Text),
     sqlalchemy.Column('env_file_path', sqlalchemy.Text),
+    sqlalchemy.Column('dag_yaml_content', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('env_file_content', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('user_hash', sqlalchemy.Text),
     sqlalchemy.Column('workspace', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('priority',
@@ -117,6 +120,9 @@ job_info_table = sqlalchemy.Table(
     sqlalchemy.Column('original_user_yaml_path',
                       sqlalchemy.Text,
                       server_default=None),
+    sqlalchemy.Column('original_user_yaml_content',
+                      sqlalchemy.Text,
+                      server_default=None),
     sqlalchemy.Column('pool', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('current_cluster_name',
                       sqlalchemy.Text,
@@ -125,6 +131,9 @@ job_info_table = sqlalchemy.Table(
                       sqlalchemy.Integer,
                       server_default=None),
     sqlalchemy.Column('pool_hash', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('controller_logs_cleaned_at',
+                      sqlalchemy.Float,
+                      server_default=None),
 )

 ha_recovery_script_table = sqlalchemy.Table(
@@ -313,6 +322,8 @@ async def _describe_task_transition_failure(session: sql_async.AsyncSession,
 # column names in the DB and it corresponds to the combined view
 # by joining the spot and job_info tables.
 def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
+    # WARNING: If you update these you may also need to update GetJobTable in
+    # the skylet ManagedJobsServiceImpl.
     return {
         '_job_id': r.get('job_id'),  # from spot table
         '_task_name': r.get('job_name'),  # deprecated, from spot table
@@ -339,13 +350,18 @@ def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
         'job_name': r.get('name'),  # from job_info table
         'schedule_state': r.get('schedule_state'),
         'controller_pid': r.get('controller_pid'),
+        # the _path columns are for backwards compatibility, use the _content
+        # columns instead
         'dag_yaml_path': r.get('dag_yaml_path'),
         'env_file_path': r.get('env_file_path'),
+        'dag_yaml_content': r.get('dag_yaml_content'),
+        'env_file_content': r.get('env_file_content'),
         'user_hash': r.get('user_hash'),
         'workspace': r.get('workspace'),
         'priority': r.get('priority'),
         'entrypoint': r.get('entrypoint'),
         'original_user_yaml_path': r.get('original_user_yaml_path'),
+        'original_user_yaml_content': r.get('original_user_yaml_content'),
         'pool': r.get('pool'),
         'current_cluster_name': r.get('current_cluster_name'),
         'job_id_on_pool_cluster': r.get('job_id_on_pool_cluster'),
@@ -1076,7 +1092,8 @@ def _get_all_task_ids_statuses(

 @_init_db
 def get_all_task_ids_names_statuses_logs(
-        job_id: int) -> List[Tuple[int, str, ManagedJobStatus, str]]:
+    job_id: int
+) -> List[Tuple[int, str, ManagedJobStatus, str, Optional[float]]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         id_names = session.execute(
@@ -1085,9 +1102,10 @@ def get_all_task_ids_names_statuses_logs(
                 spot_table.c.task_name,
                 spot_table.c.status,
                 spot_table.c.local_log_file,
+                spot_table.c.logs_cleaned_at,
             ).where(spot_table.c.spot_job_id == job_id).order_by(
                 spot_table.c.task_id.asc())).fetchall()
-        return [(row[0], row[1], ManagedJobStatus(row[2]), row[3])
+        return [(row[0], row[1], ManagedJobStatus(row[2]), row[3], row[4])
                 for row in id_names]

@@ -1152,8 +1170,8 @@ def get_failure_reason(job_id: int) -> Optional[str]:


 @_init_db
-def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
-    """Get managed jobs from the database."""
+def get_managed_job_tasks(job_id: int) -> List[Dict[str, Any]]:
+    """Get managed job tasks for a specific managed job id from the database."""
     assert _SQLALCHEMY_ENGINE is not None

     # Join spot and job_info tables to get the job name for each task.
@@ -1168,10 +1186,8 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
         spot_table.outerjoin(
             job_info_table,
             spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
-    if job_id is not None:
-        query = query.where(spot_table.c.spot_job_id == job_id)
-    query = query.order_by(spot_table.c.spot_job_id.desc(),
-                           spot_table.c.task_id.asc())
+    query = query.where(spot_table.c.spot_job_id == job_id)
+    query = query.order_by(spot_table.c.task_id.asc())
     rows = None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         rows = session.execute(query).fetchall()
@@ -1186,15 +1202,17 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
         job_dict['metadata'] = json.loads(job_dict['metadata'])

         # Add user YAML content for managed jobs.
-        yaml_path = job_dict.get('original_user_yaml_path')
-        if yaml_path:
-            try:
-                with open(yaml_path, 'r', encoding='utf-8') as f:
-                    job_dict['user_yaml'] = f.read()
-            except (FileNotFoundError, IOError, OSError):
-                job_dict['user_yaml'] = None
-        else:
-            job_dict['user_yaml'] = None
+        job_dict['user_yaml'] = job_dict.get('original_user_yaml_content')
+        if job_dict['user_yaml'] is None:
+            # Backwards compatibility - try to read from file path
+            yaml_path = job_dict.get('original_user_yaml_path')
+            if yaml_path:
+                try:
+                    with open(yaml_path, 'r', encoding='utf-8') as f:
+                        job_dict['user_yaml'] = f.read()
+                except (FileNotFoundError, IOError, OSError) as e:
+                    logger.debug('Failed to read original user YAML for job '
+                                 f'{job_id} from {yaml_path}: {e}')

         jobs.append(job_dict)
     return jobs
@@ -1408,7 +1426,13 @@ def get_managed_jobs_with_filters(
         page: Optional[int] = None,
         limit: Optional[int] = None,
 ) -> Tuple[List[Dict[str, Any]], int]:
-    """Get managed jobs from the database with filters."""
+    """Get managed jobs from the database with filters.
+
+    Returns:
+        A tuple containing
+        - the list of managed jobs
+        - the total number of managed jobs
+    """
     assert _SQLALCHEMY_ENGINE is not None

     count_query = build_managed_jobs_with_filters_query(
@@ -1447,7 +1471,8 @@ def get_managed_jobs_with_filters(
     jobs = []
     for row in rows:
         job_dict = _get_jobs_dict(row._mapping)  # pylint: disable=protected-access
-        job_dict['status'] = ManagedJobStatus(job_dict['status'])
+        if job_dict.get('status') is not None:
+            job_dict['status'] = ManagedJobStatus(job_dict['status'])
         if job_dict.get('schedule_state') is not None:
             job_dict['schedule_state'] = ManagedJobScheduleState(
                 job_dict['schedule_state'])
@@ -1457,15 +1482,22 @@ def get_managed_jobs_with_filters(
         job_dict['metadata'] = json.loads(job_dict['metadata'])

         # Add user YAML content for managed jobs.
-        yaml_path = job_dict.get('original_user_yaml_path')
-        if (not fields or 'user_yaml' in fields) and yaml_path:
-            try:
-                with open(yaml_path, 'r', encoding='utf-8') as f:
-                    job_dict['user_yaml'] = f.read()
-            except (FileNotFoundError, IOError, OSError):
-                job_dict['user_yaml'] = None
-        else:
-            job_dict['user_yaml'] = None
+        job_dict['user_yaml'] = job_dict.get('original_user_yaml_content')
+        if job_dict['user_yaml'] is None:
+            # Backwards compatibility - try to read from file path
+            yaml_path = job_dict.get('original_user_yaml_path')
+            if yaml_path:
+                try:
+                    with open(yaml_path, 'r', encoding='utf-8') as f:
+                        job_dict['user_yaml'] = f.read()
+                except (FileNotFoundError, IOError, OSError) as e:
+                    job_id = job_dict.get('job_id')
+                    if job_id is not None:
+                        logger.debug('Failed to read original user YAML for '
+                                     f'job {job_id} from {yaml_path}: {e}')
+                    else:
+                        logger.debug('Failed to read original user YAML from '
+                                     f'{yaml_path}: {e}')

         jobs.append(job_dict)
     return jobs, total
@@ -1511,9 +1543,9 @@ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:


 @_init_db
-def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
-                          original_user_yaml_path: str, env_file_path: str,
-                          priority: int):
+def scheduler_set_waiting(job_id: int, dag_yaml_content: str,
+                          original_user_yaml_content: str,
+                          env_file_content: str, priority: int):
     """Do not call without holding the scheduler lock.

     Returns: Whether this is a recovery run or not.
@@ -1525,19 +1557,48 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         updated_count = session.query(job_info_table).filter(
-            sqlalchemy.and_(job_info_table.c.spot_job_id == job_id,)
-        ).update({
-            job_info_table.c.schedule_state:
-                ManagedJobScheduleState.WAITING.value,
-            job_info_table.c.dag_yaml_path: dag_yaml_path,
-            job_info_table.c.original_user_yaml_path: original_user_yaml_path,
-            job_info_table.c.env_file_path: env_file_path,
-            job_info_table.c.priority: priority,
-        })
+            sqlalchemy.and_(job_info_table.c.spot_job_id == job_id,)).update({
+                job_info_table.c.schedule_state:
+                    ManagedJobScheduleState.WAITING.value,
+                job_info_table.c.dag_yaml_content: dag_yaml_content,
+                job_info_table.c.original_user_yaml_content:
+                    (original_user_yaml_content),
+                job_info_table.c.env_file_content: env_file_content,
+                job_info_table.c.priority: priority,
+            })
         session.commit()
         assert updated_count <= 1, (job_id, updated_count)


+@_init_db
+def get_job_file_contents(job_id: int) -> Dict[str, Optional[str]]:
+    """Return file information and stored contents for a managed job."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.execute(
+            sqlalchemy.select(
+                job_info_table.c.dag_yaml_path,
+                job_info_table.c.env_file_path,
+                job_info_table.c.dag_yaml_content,
+                job_info_table.c.env_file_content,
+            ).where(job_info_table.c.spot_job_id == job_id)).fetchone()
+
+    if row is None:
+        return {
+            'dag_yaml_path': None,
+            'env_file_path': None,
+            'dag_yaml_content': None,
+            'env_file_content': None,
+        }
+
+    return {
+        'dag_yaml_path': row[0],
+        'env_file_path': row[1],
+        'dag_yaml_content': row[2],
+        'env_file_content': row[3],
+    }
+
+
 @_init_db
 def get_pool_from_job_id(job_id: int) -> Optional[str]:
     """Get the pool from the job id."""
@@ -2331,3 +2392,118 @@ def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
         rows = session.execute(query).fetchall()
         job_ids = [row[0] for row in rows if row[0] is not None]
         return job_ids
+
+
+@_init_db_async
+async def get_task_logs_to_clean_async(retention_seconds: int,
+                                       batch_size) -> List[Dict[str, Any]]:
+    """Get the logs of job tasks to clean.
+
+    The logs of a task will only be cleaned when:
+    - the job schedule state is DONE
+    - AND the end time of the task is older than the retention period
+    """
+
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        now = time.time()
+        result = await session.execute(
+            sqlalchemy.select(
+                spot_table.c.spot_job_id,
+                spot_table.c.task_id,
+                spot_table.c.local_log_file,
+            ).select_from(
+                spot_table.join(
+                    job_info_table,
+                    spot_table.c.spot_job_id == job_info_table.c.spot_job_id,
+                )).
+            where(
+                sqlalchemy.and_(
+                    job_info_table.c.schedule_state.is_(
+                        ManagedJobScheduleState.DONE.value),
+                    spot_table.c.end_at.isnot(None),
+                    spot_table.c.end_at < (now - retention_seconds),
+                    spot_table.c.logs_cleaned_at.is_(None),
+                    # The local log file is set AFTER the task is finished,
+                    # add this condition to ensure the entire log file has
+                    # been written.
+                    spot_table.c.local_log_file.isnot(None),
+                )).limit(batch_size))
+        rows = result.fetchall()
+        return [{
+            'job_id': row[0],
+            'task_id': row[1],
+            'local_log_file': row[2]
+        } for row in rows]
+
+
+@_init_db_async
+async def get_controller_logs_to_clean_async(
+        retention_seconds: int, batch_size: int) -> List[Dict[str, Any]]:
+    """Get the controller logs to clean.
+
+    The controller logs will only be cleaned when:
+    - the job schedule state is DONE
+    - AND the end time of the latest task is older than the retention period
+    """
+
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        now = time.time()
+
+        result = await session.execute(
+            sqlalchemy.select(job_info_table.c.spot_job_id,).select_from(
+                job_info_table.join(
+                    spot_table,
+                    job_info_table.c.spot_job_id == spot_table.c.spot_job_id,
+                )).where(
+                    sqlalchemy.and_(
+                        job_info_table.c.schedule_state.is_(
+                            ManagedJobScheduleState.DONE.value),
+                        spot_table.c.local_log_file.isnot(None),
+                        job_info_table.c.controller_logs_cleaned_at.is_(None),
+                    )).group_by(
+                        job_info_table.c.spot_job_id,
+                        job_info_table.c.current_cluster_name,
+                    ).having(
+                        sqlalchemy.func.max(
+                            spot_table.c.end_at).isnot(None),).having(
+                                sqlalchemy.func.max(spot_table.c.end_at) <
+                                (now - retention_seconds)).limit(batch_size))
+        rows = result.fetchall()
+        return [{'job_id': row[0]} for row in rows]
+
+
+@_init_db_async
+async def set_task_logs_cleaned_async(tasks: List[Tuple[int, int]],
+                                      logs_cleaned_at: float):
+    """Set the task logs cleaned at."""
+    if not tasks:
+        return
+    # Deduplicate
+    task_keys = list(dict.fromkeys(tasks))
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        await session.execute(
+            sqlalchemy.update(spot_table).where(
+                sqlalchemy.tuple_(spot_table.c.spot_job_id,
+                                  spot_table.c.task_id).in_(task_keys)).values(
+                                      logs_cleaned_at=logs_cleaned_at))
+        await session.commit()
+
+
+@_init_db_async
+async def set_controller_logs_cleaned_async(job_ids: List[int],
+                                            logs_cleaned_at: float):
+    """Set the controller logs cleaned at."""
+    if not job_ids:
+        return
+    # Deduplicate
+    job_ids = list(dict.fromkeys(job_ids))
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        await session.execute(
+            sqlalchemy.update(job_info_table).where(
+                job_info_table.c.spot_job_id.in_(job_ids)).values(
+                    controller_logs_cleaned_at=logs_cleaned_at))
+        await session.commit()
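
Taken together, the new logs_cleaned_at / controller_logs_cleaned_at columns and the async helpers above support a retention-based log garbage collector (the new sky/jobs/log_gc.py in the file list). The GC loop itself is not shown in this diff; the sketch below only illustrates how the two task-log helpers could be combined, with the retention window and batch size chosen arbitrarily for illustration.

import asyncio
import os
import time

from sky.jobs import state as managed_job_state

# Hypothetical values for illustration; the real policy lives in
# sky/jobs/log_gc.py, which is not shown in this diff.
RETENTION_SECONDS = 7 * 24 * 3600
BATCH_SIZE = 100


async def gc_task_logs_once() -> None:
    """Delete one batch of expired task log files and mark them cleaned."""
    tasks = await managed_job_state.get_task_logs_to_clean_async(
        RETENTION_SECONDS, BATCH_SIZE)
    cleaned = []
    for task in tasks:
        log_file = task['local_log_file']
        if log_file:
            path = os.path.expanduser(log_file)
            if os.path.exists(path):
                os.remove(path)
        cleaned.append((task['job_id'], task['task_id']))
    if cleaned:
        # Record the cleanup time so the rows are not selected again.
        await managed_job_state.set_task_logs_cleaned_async(
            cleaned, logs_cleaned_at=time.time())


if __name__ == '__main__':
    asyncio.run(gc_task_logs_once())
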
sky/jobs/utils.py CHANGED
@@ -6,7 +6,7 @@ ManagedJobCodeGen.
 """
 import asyncio
 import collections
-import datetime
+from datetime import datetime
 import enum
 import os
 import pathlib
@@ -195,8 +195,8 @@ def _validate_consolidation_mode_config(
                 'terminate the controller cluster first.'
                 f'{colorama.Style.RESET_ALL}')
     else:
-        all_jobs = managed_job_state.get_managed_jobs()
-        if all_jobs:
+        total_jobs = managed_job_state.get_managed_jobs_total()
+        if total_jobs > 0:
             nonterminal_jobs = (
                 managed_job_state.get_nonterminal_job_ids_by_name(
                     None, None, all_users=True))
@@ -211,7 +211,7 @@ def _validate_consolidation_mode_config(
             else:
                 logger.warning(
                     f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
-                    f'but there are {len(all_jobs)} jobs from previous '
+                    f'but there are {total_jobs} jobs from previous '
                     'consolidation mode. Reset the `jobs.controller.'
                     'consolidation_mode` to `true` and run `sky jobs queue` '
                     'to see those jobs. Switching to normal mode will '
@@ -266,6 +266,12 @@ def is_consolidation_mode(on_api_restart: bool = False) -> bool:

 def ha_recovery_for_consolidation_mode():
     """Recovery logic for HA mode."""
+    # Touch the signal file here to avoid conflict with
+    # update_managed_jobs_statuses. Although we run this first and then start
+    # the daemon, this function is also called in cancel_jobs_by_id.
+    signal_file = pathlib.Path(
+        constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE).expanduser()
+    signal_file.touch()
     # No setup recovery is needed in consolidation mode, as the API server
     # already has all runtime installed. Directly start jobs recovery here.
     # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
@@ -276,7 +282,9 @@ def ha_recovery_for_consolidation_mode():
               encoding='utf-8') as f:
         start = time.time()
         f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
-        for job in managed_job_state.get_managed_jobs():
+        jobs, _ = managed_job_state.get_managed_jobs_with_filters(
+            fields=['job_id', 'controller_pid', 'schedule_state', 'status'])
+        for job in jobs:
             job_id = job['job_id']
             controller_pid = job['controller_pid']

@@ -312,6 +320,7 @@ def ha_recovery_for_consolidation_mode():
                         f'{datetime.datetime.now()}\n')
         f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
         f.write(f'Total recovery time: {time.time() - start} seconds\n')
+    signal_file.unlink()


 async def get_job_status(
@@ -456,7 +465,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
     """
     managed_job_state.remove_ha_recovery_script(job_id)
     error_msg = None
-    tasks = managed_job_state.get_managed_jobs(job_id)
+    tasks = managed_job_state.get_managed_job_tasks(job_id)
     for task in tasks:
         pool = task.get('pool', None)
         if pool is None:
@@ -525,7 +534,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):

     for job_id in job_ids:
         assert job_id is not None
-        tasks = managed_job_state.get_managed_jobs(job_id)
+        tasks = managed_job_state.get_managed_job_tasks(job_id)
         # Note: controller_pid and schedule_state are in the job_info table
         # which is joined to the spot table, so all tasks with the same job_id
         # will have the same value for these columns. This is what lets us just
@@ -545,9 +554,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
         if schedule_state == managed_job_state.ManagedJobScheduleState.DONE:
             # There are two cases where we could get a job that is DONE.
             # 1. At query time (get_jobs_to_check_status), the job was not yet
-            # DONE, but since then (before get_managed_jobs is called) it has
-            # hit a terminal status, marked itself done, and exited. This is
-            # fine.
+            # DONE, but since then (before get_managed_job_tasks is called)
+            # it has hit a terminal status, marked itself done, and exited.
+            # This is fine.
             # 2. The job is DONE, but in a non-terminal status. This is
             # unexpected. For instance, the task status is RUNNING, but the
             # job schedule_state is DONE.
@@ -901,6 +910,14 @@ def cancel_jobs_by_pool(pool_name: str,
     return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)


+def controller_log_file_for_job(job_id: int,
+                                create_if_not_exists: bool = False) -> str:
+    log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+    if create_if_not_exists:
+        os.makedirs(log_dir, exist_ok=True)
+    return os.path.join(log_dir, f'{job_id}.log')
+
+
 def stream_logs_by_id(job_id: int,
                       follow: bool = True,
                       tail: Optional[int] = None) -> Tuple[str, int]:
@@ -933,13 +950,20 @@ def stream_logs_by_id(job_id: int,
             if managed_job_status.is_failed():
                 job_msg = ('\nFailure reason: '
                            f'{managed_job_state.get_failure_reason(job_id)}')
-            log_file_exists = False
+            log_file_ever_existed = False
             task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
                 job_id)
             num_tasks = len(task_info)
-            for task_id, task_name, task_status, log_file in task_info:
+            for (task_id, task_name, task_status, log_file,
+                 logs_cleaned_at) in task_info:
                 if log_file:
-                    log_file_exists = True
+                    log_file_ever_existed = True
+                    if logs_cleaned_at is not None:
+                        ts_str = datetime.fromtimestamp(
+                            logs_cleaned_at).strftime('%Y-%m-%d %H:%M:%S')
+                        print(f'Task {task_name}({task_id}) log has been '
+                              f'cleaned at {ts_str}.')
+                        continue
                     task_str = (f'Task {task_name}({task_id})'
                                 if task_name else f'Task {task_id}')
                     if num_tasks > 1:
@@ -974,7 +998,7 @@ def stream_logs_by_id(job_id: int,
                                     f'{task_str} finished '
                                     f'(status: {task_status.value}).'),
                       flush=True)
-            if log_file_exists:
+            if log_file_ever_existed:
                 # Add the "Job finished" message for terminal states
                 if managed_job_status.is_terminal():
                     print(ux_utils.finishing_message(
@@ -1202,7 +1226,8 @@ def stream_logs(job_id: Optional[int],
     if controller:
         if job_id is None:
             assert job_name is not None
-            managed_jobs = managed_job_state.get_managed_jobs()
+            managed_jobs, _ = managed_job_state.get_managed_jobs_with_filters(
+                name_match=job_name, fields=['job_id', 'job_name', 'status'])
             # We manually filter the jobs by name, instead of using
             # get_nonterminal_job_ids_by_name, as with `controller=True`, we
             # should be able to show the logs for jobs in terminal states.
@@ -1225,9 +1250,7 @@ def stream_logs(job_id: Optional[int],
             job_id = managed_job_ids.pop()
             assert job_id is not None, (job_id, job_name)

-        controller_log_path = os.path.join(
-            os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
-            f'{job_id}.log')
+        controller_log_path = controller_log_file_for_job(job_id)
         job_status = None

         # Wait for the log file to be written
@@ -1378,9 +1401,11 @@ def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
         new_fields.append('priority')
     if 'failure_reason' not in new_fields:
         new_fields.append('failure_reason')
-    if ('user_yaml' in new_fields and
-            'original_user_yaml_path' not in new_fields):
-        new_fields.append('original_user_yaml_path')
+    if 'user_yaml' in new_fields:
+        if 'original_user_yaml_path' not in new_fields:
+            new_fields.append('original_user_yaml_path')
+        if 'original_user_yaml_content' not in new_fields:
+            new_fields.append('original_user_yaml_content')
     if cluster_handle_required:
         if 'task_name' not in new_fields:
             new_fields.append('task_name')
@@ -1522,12 +1547,11 @@ def get_managed_job_queue(
         handle = cluster_name_to_handle.get(
             cluster_name, None) if cluster_name is not None else None
         if isinstance(handle, backends.CloudVmRayResourceHandle):
-            resources_str = resources_utils.get_readable_resources_repr(
-                handle, simplify=True)
-            resources_str_full = (
-                resources_utils.get_readable_resources_repr(handle,
-                                                            simplify=False))
-            job['cluster_resources'] = resources_str
+            resources_str_simple, resources_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    handle, simplified_only=False))
+            assert resources_str_full is not None
+            job['cluster_resources'] = resources_str_simple
             job['cluster_resources_full'] = resources_str_full
             job['cloud'] = str(handle.launched_resources.cloud)
             job['region'] = handle.launched_resources.region
@@ -2110,7 +2134,8 @@ def _job_proto_to_dict(
         # and Protobuf encodes int64 as decimal strings in JSON,
         # so we need to convert them back to ints.
         # https://protobuf.dev/programming-guides/json/#field-representation
-        if field.type == descriptor.FieldDescriptor.TYPE_INT64:
+        if (field.type == descriptor.FieldDescriptor.TYPE_INT64 and
+                job_dict.get(field.name) is not None):
             job_dict[field.name] = int(job_dict[field.name])
     job_dict['status'] = managed_job_state.ManagedJobStatus.from_protobuf(
         job_dict['status'])
@@ -2265,6 +2290,18 @@ class ManagedJobCodeGen:
         """)
         return cls._build(code)

+    @classmethod
+    def get_version(cls) -> str:
+        """Generate code to get controller version."""
+        code = textwrap.dedent("""\
+        from sky.skylet import constants as controller_constants
+
+        # Get controller version
+        controller_version = controller_constants.SKYLET_VERSION
+        print(f"controller_version:{controller_version}", flush=True)
+        """)
+        return cls._build(code)
+
     @classmethod
     def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
         code = textwrap.dedent(f"""\
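The new ManagedJobCodeGen.get_version() classmethod generates a snippet that is executed on the jobs controller and prints its SKYLET_VERSION on stdout. How the caller consumes that output is not part of this excerpt; the helper below is only a hypothetical sketch of parsing the printed line.

from typing import Optional


def parse_controller_version(output: str) -> Optional[str]:
    """Extract the version from the output of the get_version() snippet."""
    # The generated code prints a line of the form
    # 'controller_version:<SKYLET_VERSION>'; scan stdout for it.
    for line in output.splitlines():
        if line.startswith('controller_version:'):
            return line.split(':', maxsplit=1)[1].strip()
    return None
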
sky/metrics/utils.py CHANGED
@@ -143,6 +143,24 @@ SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
     'RSS increment after requests', ['name'],
     buckets=_MEM_BUCKETS)

+SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS = prom.Histogram(
+    'sky_apiserver_websocket_ssh_latency_seconds',
+    ('Time taken for ssh message to go from client to API server and back '
+     'to the client. This does not include: latency to reach the pod, '
+     'overhead from sending through the k8s port-forward tunnel, or '
+     'ssh server lag on the destination pod.'),
+    ['pid'],
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
+             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
+             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
+             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
+             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
+             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
+             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
+             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
+             960.0, 980.0, 1000.0, float('inf')),
+)
+

 @contextlib.contextmanager
 def time_it(name: str, group: str = 'default'):
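The new metric is a standard prometheus_client Histogram labeled by process id. Its call sites are not shown in this excerpt (the websocket proxy changes in sky/templates/websocket_proxy.py, +140 -12 above, are the likely consumer); the sketch below only illustrates how such a histogram is typically observed, with the function name and timing variable being assumptions.

import os
import time

from sky.metrics import utils as metrics_utils


def record_ssh_roundtrip(start_time: float) -> None:
    """Record one client -> API server -> client round trip, in seconds."""
    elapsed = time.time() - start_time
    metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(
        pid=str(os.getpid())).observe(elapsed)
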
sky/optimizer.py CHANGED
@@ -1019,7 +1019,7 @@ class Optimizer:
                     if res.instance_type is not None
                 ])
                 candidate_str = resources_utils.format_resource(
-                    best_resources, simplify=True)
+                    best_resources, simplified_only=True)[0]

                 logger.info(
                     f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
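Both this call site and the jobs-queue hunk above reflect an apparent signature change in sky/utils/resources_utils.py (+53 -29 in the file list): the old boolean simplify flag, which returned a single string, is replaced by simplified_only, and the helpers now return a pair. That implementation is not shown here; the snippet below is a hypothetical, self-contained illustration of the (simple, full) return convention the call sites imply.

from typing import Optional, Tuple


def readable_repr(resources: str,
                  simplified_only: bool = True) -> Tuple[str, Optional[str]]:
    """Hypothetical stand-in mirroring the (simple, full) return convention."""
    simple = resources.split(',', maxsplit=1)[0]
    # The full form is only computed (and guaranteed non-None) when the caller
    # asks for it, which is why the diff asserts on it after
    # simplified_only=False.
    full = None if simplified_only else resources
    return simple, full


simple, full = readable_repr('1x[CPU:4], disk=256GB', simplified_only=False)
assert full is not None
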