skypilot-nightly 1.0.0.dev20250818__py3-none-any.whl → 1.0.0.dev20250819__py3-none-any.whl
- sky/__init__.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +2 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/3015-bf218e4973bf5c8f.js +1 -0
- sky/dashboard/out/_next/static/chunks/{8969-6cb1af4ec7fb1e19.js → 8969-23c8fbdb8b397d59.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-a46c8b62df807ec1.js → webpack-008593a02784a2df.js} +1 -1
- sky/dashboard/out/_next/static/{D7_ocVBIBwyxtvXYWggqV → tYn7R2be3cQPYJfTxxE09}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +42 -33
- sky/jobs/server/utils.py +2 -1
- sky/jobs/utils.py +56 -9
- sky/provision/provisioner.py +10 -6
- sky/server/requests/payloads.py +1 -0
- sky/server/requests/serializers/encoders.py +15 -3
- sky/server/server.py +4 -1
- sky/setup_files/MANIFEST.in +1 -0
- sky/skylet/ray_patches/__init__.py +18 -4
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/RECORD +46 -39
- sky/dashboard/out/_next/static/chunks/3015-471d67c9302d4027.js +0 -1
- /sky/dashboard/out/_next/static/{D7_ocVBIBwyxtvXYWggqV → tYn7R2be3cQPYJfTxxE09}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250819.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py
CHANGED
@@ -514,7 +514,7 @@ def queue_from_kubernetes_pod(
     except exceptions.CommandError as e:
         raise RuntimeError(str(e)) from e
 
-    jobs, _, result_type = managed_job_utils.load_managed_job_queue(
+    jobs, _, result_type, _, _ = managed_job_utils.load_managed_job_queue(
         job_table_payload)
 
     if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
@@ -587,31 +587,36 @@ def queue(
     pool_match: Optional[str] = None,
     page: Optional[int] = None,
     limit: Optional[int] = None,
+    statuses: Optional[List[str]] = None,
+) -> Tuple[List[Dict[str, Any]], int, Dict[str, int], int]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets statuses of managed jobs.
 
     Please refer to sky.cli.job_queue for documentation.
 
     Returns:
+        jobs: List[Dict[str, Any]]
+        [
+            {
+                'job_id': int,
+                'job_name': str,
+                'resources': str,
+                'submitted_at': (float) timestamp of submission,
+                'end_at': (float) timestamp of end,
+                'job_duration': (float) duration in seconds,
+                'recovery_count': (int) Number of retries,
+                'status': (sky.jobs.ManagedJobStatus) of the job,
+                'cluster_resources': (str) resources of the cluster,
+                'region': (str) region of the cluster,
+                'user_name': (Optional[str]) job creator's user name,
+                'user_hash': (str) job creator's user hash,
+                'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
+                'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
+            }
+        ]
+        total: int, total number of jobs after filter
+        status_counts: Dict[str, int], status counts after filter
+        total_no_filter: int, total number of jobs before filter
     Raises:
         sky.exceptions.ClusterNotUpError: the jobs controller is not up or
             does not exist.
@@ -645,13 +650,13 @@ def queue(
     elif user_match is not None:
         users = global_user_state.get_user_by_name_match(user_match)
         if not users:
-            return [], 0
+            return [], 0, {}, 0
         user_hashes = [user.id for user in users]
 
     accessible_workspaces = list(workspaces_core.get_workspaces().keys())
     code = managed_job_utils.ManagedJobCodeGen.get_job_table(
         skip_finished, accessible_workspaces, job_ids, workspace_match,
-        name_match, pool_match, page, limit, user_hashes)
+        name_match, pool_match, page, limit, user_hashes, statuses)
     returncode, job_table_payload, stderr = backend.run_on_head(
         handle,
         code,
@@ -664,11 +669,11 @@ def queue(
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
                            f'{returncode}.\n{job_table_payload + stderr}')
 
-    jobs, total, result_type = managed_job_utils.load_managed_job_queue(
-        job_table_payload)
+    (jobs, total, result_type, total_no_filter, status_counts
+    ) = managed_job_utils.load_managed_job_queue(job_table_payload)
 
     if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
-        return jobs, total
+        return jobs, total, status_counts, total_no_filter
 
     # Backward compatibility for old jobs controller without filtering
     # TODO(hailong): remove this after 0.12.0
@@ -702,14 +707,18 @@ def queue(
     if job_ids:
         jobs = [job for job in jobs if job['job_id'] in job_ids]
 
+    filtered_jobs, total, status_counts = managed_job_utils.filter_jobs(
+        jobs,
+        workspace_match,
+        name_match,
+        pool_match,
+        page=page,
+        limit=limit,
+        user_match=user_match,
+        enable_user_match=True,
+        statuses=statuses,
+    )
+    return filtered_jobs, total, status_counts, total_no_filter
 
 
 @usage_lib.entrypoint
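For reference, a minimal sketch of how a caller might consume the new queue() return contract shown above, i.e. the (jobs, total, status_counts, total_no_filter) 4-tuple; the helper name and the sample data are illustrative only, not part of SkyPilot:

# Illustrative only: mirrors the 4-tuple contract added to sky.jobs.server.core.queue().
from typing import Any, Dict, List, Tuple

def summarize_queue(
        result: Tuple[List[Dict[str, Any]], int, Dict[str, int], int]) -> str:
    jobs, total, status_counts, total_no_filter = result
    shown = len(jobs)  # the page slice after filtering and pagination
    counts = ', '.join(f'{k}={v}' for k, v in sorted(status_counts.items()))
    return (f'{shown} shown / {total} matched / {total_no_filter} total; '
            f'status counts: {counts or "n/a"}')

# Hand-made data in the documented shape.
fake = ([{'job_id': 1, 'job_name': 'train', 'status': 'RUNNING'}], 1,
        {'RUNNING': 1}, 5)
print(summarize_queue(fake))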
sky/jobs/server/utils.py
CHANGED
@@ -62,7 +62,8 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     version_matches = controller_version == local_version
 
     # Load and filter jobs locally using existing method
-    jobs, _, _ = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
     non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
     has_non_terminal_jobs = len(non_terminal_jobs) > 0
 
sky/jobs/utils.py
CHANGED
@@ -768,6 +768,13 @@ def stream_logs_by_id(job_id: int,
                     assert tail > 0
                     # Read only the last 'tail' lines using deque
                     read_from = collections.deque(f, maxlen=tail)
+                    # We set start_streaming to True here in case
+                    # truncating the log file removes the line that
+                    # contains LOG_FILE_START_STREAMING_AT. This does
+                    # not cause issues for log files shorter than tail
+                    # because tail_logs in sky/skylet/log_lib.py also
+                    # handles LOG_FILE_START_STREAMING_AT.
+                    start_streaming = True
                 for line in read_from:
                     if log_lib.LOG_FILE_START_STREAMING_AT in line:
                         start_streaming = True
@@ -1133,6 +1140,7 @@ def dump_managed_job_queue(
     page: Optional[int] = None,
     limit: Optional[int] = None,
     user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
 ) -> str:
     # Make sure to get all jobs - some logic below (e.g. high priority job
     # detection) requires a full view of the jobs table.
@@ -1160,6 +1168,8 @@ def dump_managed_job_queue(
         if priority is not None and priority > highest_blocking_priority:
             highest_blocking_priority = priority
 
+    total_no_filter = len(jobs)
+
     if user_hashes:
         jobs = [
             job for job in jobs if job.get('user_hash', None) in user_hashes
@@ -1183,8 +1193,13 @@ def dump_managed_job_queue(
     if job_ids:
         jobs = [job for job in jobs if job['job_id'] in job_ids]
 
-    jobs, total = filter_jobs(jobs,
+    jobs, total, status_counts = filter_jobs(jobs,
+                                             workspace_match,
+                                             name_match,
+                                             pool_match,
+                                             page,
+                                             limit,
+                                             statuses=statuses)
     for job in jobs:
         end_at = job['end_at']
         if end_at is None:
@@ -1258,7 +1273,12 @@ def dump_managed_job_queue(
         else:
             job['details'] = None
 
-    return message_utils.encode_payload({
+    return message_utils.encode_payload({
+        'jobs': jobs,
+        'total': total,
+        'total_no_filter': total_no_filter,
+        'status_counts': status_counts
+    })
 
 
 def filter_jobs(
@@ -1270,7 +1290,8 @@ def filter_jobs(
     limit: Optional[int],
     user_match: Optional[str] = None,
     enable_user_match: bool = False,
+    statuses: Optional[List[str]] = None,
+) -> Tuple[List[Dict[str, Any]], int, Dict[str, int]]:
     """Filter jobs based on the given criteria.
 
     Args:
@@ -1282,9 +1303,12 @@ def filter_jobs(
         limit: Limit to filter.
         user_match: User name to filter.
         enable_user_match: Whether to enable user match.
+        statuses: Statuses to filter.
 
     Returns:
-        List of filtered jobs
+        List of filtered jobs
+        Total number of jobs
+        Dictionary of status counts
     """
 
     # TODO(hailong): refactor the whole function including the
@@ -1314,6 +1338,7 @@ def filter_jobs(
             end = min(start + limit, len(result))
             return result[start:end]
 
+    status_counts: Dict[str, int] = collections.defaultdict(int)
    result = []
    checks = [
        ('workspace', workspace_match),
@@ -1327,25 +1352,34 @@ def filter_jobs(
        if not all(
                _pattern_matches(job, key, pattern) for key, pattern in checks):
            continue
+       status_counts[job['status'].value] += 1
+       if statuses:
+           if job['status'].value not in statuses:
+               continue
        result.append(job)
 
    total = len(result)
 
-   return _handle_page_and_limit(result, page, limit), total
+   return _handle_page_and_limit(result, page, limit), total, status_counts
 
 
 def load_managed_job_queue(
     payload: str
-) -> Tuple[List[Dict[str, Any]], int, ManagedJobQueueResultType]:
+) -> Tuple[List[Dict[str, Any]], int, ManagedJobQueueResultType, int, Dict[
+        str, int]]:
     """Load job queue from json string."""
     result = message_utils.decode_payload(payload)
     result_type = ManagedJobQueueResultType.DICT
+    status_counts = {}
     if isinstance(result, dict):
         jobs = result['jobs']
         total = result['total']
+        status_counts = result.get('status_counts', {})
+        total_no_filter = result.get('total_no_filter', total)
     else:
         jobs = result
         total = len(jobs)
+        total_no_filter = total
         result_type = ManagedJobQueueResultType.LIST
 
     for job in jobs:
@@ -1355,7 +1389,7 @@ def load_managed_job_queue(
             # TODO(cooperc): Remove check before 0.12.0.
             user = global_user_state.get_user(job['user_hash'])
             job['user_name'] = user.name if user is not None else None
-    return jobs, total, result_type
+    return jobs, total, result_type, total_no_filter, status_counts
 
 
 def _get_job_status_from_tasks(
@@ -1713,6 +1747,7 @@ class ManagedJobCodeGen:
             page: Optional[int] = None,
             limit: Optional[int] = None,
             user_hashes: Optional[List[Optional[str]]] = None,
+            statuses: Optional[List[str]] = None,
     ) -> str:
         code = textwrap.dedent(f"""\
             if managed_job_version < 9:
@@ -1720,7 +1755,7 @@ class ManagedJobCodeGen:
                 # before #6652.
                 # TODO(hailong): Remove compatibility before 0.12.0
                 job_table = utils.dump_managed_job_queue()
-            else:
+            elif managed_job_version < 10:
                 job_table = utils.dump_managed_job_queue(
                     skip_finished={skip_finished},
                     accessible_workspaces={accessible_workspaces!r},
@@ -1731,6 +1766,18 @@ class ManagedJobCodeGen:
                     page={page!r},
                     limit={limit!r},
                     user_hashes={user_hashes!r})
+            else:
+                job_table = utils.dump_managed_job_queue(
+                    skip_finished={skip_finished},
+                    accessible_workspaces={accessible_workspaces!r},
+                    job_ids={job_ids!r},
+                    workspace_match={workspace_match!r},
+                    name_match={name_match!r},
+                    pool_match={pool_match!r},
+                    page={page!r},
+                    limit={limit!r},
+                    user_hashes={user_hashes!r},
+                    statuses={statuses!r})
             print(job_table, flush=True)
             """)
         return cls._build(code)
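To make the filter_jobs() behavior above concrete: status_counts is accumulated from jobs that pass the pattern filters, before the statuses filter is applied, so a UI can still show counts for statuses that are currently hidden. A toy re-implementation of just that counting logic (plain string statuses instead of SkyPilot's enum, no pagination), for illustration only:

# Toy sketch of the counting-then-filtering order used by filter_jobs().
import collections
from typing import Any, Dict, List, Optional, Tuple

def count_and_filter(
        jobs: List[Dict[str, Any]],
        statuses: Optional[List[str]] = None
) -> Tuple[List[Dict[str, Any]], int, Dict[str, int]]:
    status_counts: Dict[str, int] = collections.defaultdict(int)
    result = []
    for job in jobs:
        status_counts[job['status']] += 1      # counted before the status filter
        if statuses and job['status'] not in statuses:
            continue
        result.append(job)
    return result, len(result), dict(status_counts)

jobs = [{'status': 'RUNNING'}, {'status': 'SUCCEEDED'}, {'status': 'RUNNING'}]
print(count_and_filter(jobs, statuses=['RUNNING']))
# ([{'status': 'RUNNING'}, {'status': 'RUNNING'}], 2, {'RUNNING': 2, 'SUCCEEDED': 1})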
sky/provision/provisioner.py
CHANGED
@@ -167,7 +167,7 @@ def bulk_provision(
         # This error is a user error instead of a provisioning failure.
         # And there is no possibility to fix it by teardown.
         raise
-    except Exception:  # pylint: disable=broad-except
+    except Exception as exc:  # pylint: disable=broad-except
         zone_str = 'all zones'
         if zones:
             zone_str = ','.join(zone.name for zone in zones)
@@ -189,14 +189,18 @@ def bulk_provision(
                     provider_config=original_config['provider'])
                 break
             except NotImplementedError as e:
+                assert not terminate, (
+                    'Terminating must be supported by all clouds')
+                exc_msg = common_utils.format_exception(exc).replace(
+                    '\n', ' ')
                 # If the underlying cloud does not support stopping
                 # instances, we should stop failover as well.
                 raise provision_common.StopFailoverError(
+                    f'Provisioning cluster {cluster_name.display_name} '
+                    f'failed: {exc_msg}. Failover is stopped for safety '
+                    'because the cluster was previously in UP state but '
+                    f'{cloud} does not support stopping instances to '
+                    'preserve the cluster state. Please try launching the '
                     'cluster again, or terminate it with: '
                     f'sky down {cluster_name.display_name}') from e
             except Exception as e:  # pylint: disable=broad-except
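The pattern in the hunk above is to capture the outer provisioning failure as exc so the StopFailoverError raised from the inner handler can embed a single-line description of it. A minimal sketch of that message-building step, using a stand-in formatter in place of SkyPilot's common_utils.format_exception (assumed here to return a human-readable, possibly multi-line string):

# Sketch only; _format_exception is a stand-in, and the cluster name is made up.
def _format_exception(e: BaseException) -> str:
    return f'{type(e).__name__}: {e}'

try:
    raise RuntimeError('quota exceeded\nin zone us-east-1a')
except Exception as exc:  # the outer provisioning failure
    # Flatten newlines so the failover error reads as a single line.
    exc_msg = _format_exception(exc).replace('\n', ' ')
    print(f'Provisioning cluster my-cluster failed: {exc_msg}. '
          'Failover is stopped for safety ...')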
sky/server/requests/payloads.py
CHANGED
sky/server/requests/serializers/encoders.py
CHANGED
@@ -113,8 +113,15 @@ def encode_status_kubernetes(
 @register_encoder('jobs.queue')
 def encode_jobs_queue(jobs_or_tuple):
     # Support returning either a plain jobs list or a (jobs, total) tuple
+    status_counts = {}
+    if isinstance(jobs_or_tuple, tuple):
+        if len(jobs_or_tuple) == 2:
+            jobs, total = jobs_or_tuple
+            total_no_filter = total
+        elif len(jobs_or_tuple) == 4:
+            jobs, total, status_counts, total_no_filter = jobs_or_tuple
+        else:
+            raise ValueError(f'Invalid jobs tuple: {jobs_or_tuple}')
     else:
         jobs = jobs_or_tuple
         total = None
@@ -122,7 +129,12 @@ def encode_jobs_queue(jobs_or_tuple):
         job['status'] = job['status'].value
     if total is None:
         return jobs
-    return {
+    return {
+        'jobs': jobs,
+        'total': total,
+        'total_no_filter': total_no_filter,
+        'status_counts': status_counts
+    }
 
 
 def _encode_serve_status(
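The encoder above now emits a dict with jobs, total, total_no_filter, and status_counts, while load_managed_job_queue reads the two new keys with .get() defaults so payloads from older controllers still load. A small sketch of that round trip, using json as a stand-in for message_utils encode/decode (an assumption; only the key handling is the point):

# Sketch of the dict payload round trip and its backward-compatible defaults.
import json

def decode(payload: str):
    result = json.loads(payload)  # stand-in for message_utils.decode_payload
    jobs = result['jobs']
    total = result['total']
    status_counts = result.get('status_counts', {})
    total_no_filter = result.get('total_no_filter', total)
    return jobs, total, total_no_filter, status_counts

old_payload = json.dumps({'jobs': [], 'total': 3})  # pre-upgrade shape
new_payload = json.dumps({'jobs': [], 'total': 3,
                          'total_no_filter': 7, 'status_counts': {'RUNNING': 3}})
print(decode(old_payload))  # ([], 3, 3, {})
print(decode(new_payload))  # ([], 3, 7, {'RUNNING': 3})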
sky/server/server.py
CHANGED
@@ -1650,7 +1650,10 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
     await websocket.accept()
     logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
 
+    # Run core.status in another thread to avoid blocking the event loop.
+    cluster_records = await context_utils.to_thread(core.status,
+                                                    cluster_name,
+                                                    all_users=True)
     cluster_record = cluster_records[0]
     if cluster_record['status'] != status_lib.ClusterStatus.UP:
         raise fastapi.HTTPException(
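The change above offloads the blocking core.status() call to a worker thread so the async endpoint does not stall the event loop. A minimal sketch of the same pattern using the standard library's asyncio.to_thread as an analogue for SkyPilot's context_utils.to_thread (assumed to behave similarly); the blocking function and cluster name are made up:

# Offloading a blocking call from an async handler (Python 3.9+).
import asyncio
import time

def blocking_status(cluster_name: str, all_users: bool = True):
    time.sleep(0.1)  # stands in for a slow, synchronous status lookup
    return [{'name': cluster_name, 'status': 'UP'}]

async def handler():
    # The event loop stays free to serve other requests while this runs.
    records = await asyncio.to_thread(blocking_status, 'my-cluster',
                                      all_users=True)
    return records[0]

print(asyncio.run(handler()))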
sky/setup_files/MANIFEST.in
CHANGED
@@ -9,6 +9,7 @@ include sky/skylet/providers/ibm/*
 include sky/skylet/providers/scp/*
 include sky/skylet/providers/*.py
 include sky/skylet/ray_patches/*.patch
+include sky/skylet/ray_patches/*.diff
 include sky/jobs/dashboard/*
 include sky/jobs/dashboard/templates/*
 include sky/jobs/dashboard/static/*
sky/skylet/ray_patches/__init__.py
CHANGED
@@ -40,15 +40,29 @@ def _run_patch(target_file,
     """Applies a patch if it has not been applied already."""
     # .orig is the original file that is not patched.
     orig_file = os.path.abspath(f'{target_file}-v{version}.orig')
+    # Get diff filename by replacing .patch with .diff
+    diff_file = patch_file.replace('.patch', '.diff')
+
     script = f"""\
-        which patch >/dev/null 2>&1 || sudo yum install -y patch ||
-        which patch >/dev/null 2>&1 || (echo "`patch` is not found. Failed to setup ray." && exit 1)
+        which patch >/dev/null 2>&1 || sudo yum install -y patch || true
         if [ ! -f {orig_file} ]; then
             echo Create backup file {orig_file}
             cp {target_file} {orig_file}
         fi
+        if which patch >/dev/null 2>&1; then
+            # System patch command is available, use it
+            # It is ok to patch again from the original file.
+            patch {orig_file} -i {patch_file} -o {target_file}
+        else
+            # System patch command not available, use Python patch library
+            echo "System patch command not available, using Python patch library..."
+            python -m pip install patch
+            # Get target directory
+            target_dir="$(dirname {target_file})"
+            # Execute python patch command
+            echo "Executing python -m patch -d $target_dir {diff_file}"
+            python -m patch -d "$target_dir" "{diff_file}"
+        fi
         """
     subprocess.run(script, shell=True, check=True)
 
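The updated script above prefers the system patch binary and only falls back to a Python-based patcher when it cannot be installed. A sketch of that same decision pattern in plain Python; apply_with_python_patch is a hypothetical placeholder, not SkyPilot's implementation, and the argument names simply mirror the script:

# "Use the system tool if available, fall back to Python otherwise" pattern.
import shutil
import subprocess

def apply_patch(orig_file: str, patch_file: str, target_file: str,
                diff_file: str, target_dir: str) -> None:
    if shutil.which('patch'):
        # System patch is present: re-apply from the pristine backup.
        subprocess.run(
            ['patch', orig_file, '-i', patch_file, '-o', target_file],
            check=True)
    else:
        # No system patch: hand the .diff to a pure-Python patcher.
        apply_with_python_patch(diff_file, target_dir)  # hypothetical helper

def apply_with_python_patch(diff_file: str, target_dir: str) -> None:
    raise NotImplementedError('placeholder for a pure-Python patch step')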
sky/skylet/ray_patches/autoscaler.py.diff
ADDED
--- a/autoscaler.py
+++ b/autoscaler.py
@@ -1,3 +1,6 @@
+# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/autoscaler.py
+# Sky patch changes:
+# - enable upscaling_speed to be 0.0
 import copy
 import logging
 import math
@@ -1071,7 +1074,7 @@
         upscaling_speed = self.config.get("upscaling_speed")
         aggressive = self.config.get("autoscaling_mode") == "aggressive"
         target_utilization_fraction = self.config.get("target_utilization_fraction")
-        if upscaling_speed:
+        if upscaling_speed is not None:  # NOTE(sky): enable 0.0
             upscaling_speed = float(upscaling_speed)
         # TODO(ameer): consider adding (if users ask) an option of
         # initial_upscaling_num_workers.
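The one-line change in this patch matters because a configured upscaling_speed of 0.0 is falsy, so the original truthiness check silently ignored it. A two-line illustration:

# Why the patched check matters: 0.0 is falsy in Python.
upscaling_speed = 0.0
print(bool(upscaling_speed))        # False -> original `if upscaling_speed:` skipped
print(upscaling_speed is not None)  # True  -> patched branch taken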
sky/skylet/ray_patches/cli.py.diff
ADDED
--- a/cli.py
+++ b/cli.py
@@ -1,3 +1,7 @@
+# Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/dashboard/modules/job/cli.py
+# Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/26514
+# Otherwise, the output redirection ">" will not work.
+
 import json
 import os
 import sys
@@ -270,7 +274,7 @@
         working_dir=working_dir,
     )
     job_id = client.submit_job(
-        entrypoint=list2cmdline(entrypoint),
+        entrypoint=" ".join(entrypoint),
         submission_id=submission_id,
         runtime_env=final_runtime_env,
         metadata=metadata_json,
sky/skylet/ray_patches/command_runner.py.diff
ADDED
--- a/command_runner.py
+++ b/command_runner.py
@@ -1,3 +1,5 @@
+# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/command_runner.py
+
 import hashlib
 import json
 import logging
@@ -137,7 +139,7 @@
             {
                 "ControlMaster": "auto",
                 "ControlPath": "{}/%C".format(control_path),
-                "ControlPersist": "10s",
+                "ControlPersist": "300s",
             }
         )
         self.arg_dict.update(kwargs)
sky/skylet/ray_patches/log_monitor.py.diff
ADDED
--- a/log_monitor.py
+++ b/log_monitor.py
@@ -1,3 +1,7 @@
+# Original file https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/log_monitor.py
+# Fixed the problem for progress bar, as the latest version does not preserve \r for progress bar.
+# We change the newline handling back to https://github.com/ray-project/ray/blob/ray-1.10.0/python/ray/_private/log_monitor.py#L299-L300
+
 import argparse
 import errno
 import glob
@@ -374,7 +378,8 @@
                     next_line = next_line.decode("utf-8", "replace")
                 if next_line == "":
                     break
-                next_line = next_line.rstrip("\r\n")
+                if next_line.endswith("\n"):
+                    next_line = next_line[:-1]
 
                 if next_line.startswith(ray_constants.LOG_PREFIX_ACTOR_NAME):
                     flush()  # Possible change of task/actor name.
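This patch strips only the trailing newline so a carriage return used by progress bars survives, where rstrip("\r\n") removed both. A tiny demonstration with a made-up log line:

# '\r' preserved by the patched handling, lost by the original rstrip.
line = 'Epoch 3/10  [=====>....]\r\n'
print(repr(line.rstrip('\r\n')))                      # carriage return lost
stripped = line[:-1] if line.endswith('\n') else line
print(repr(stripped))                                  # carriage return preserved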
sky/skylet/ray_patches/resource_demand_scheduler.py.diff
ADDED
--- a/resource_demand_scheduler.py
+++ b/resource_demand_scheduler.py
@@ -1,3 +1,8 @@
+# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/resource_demand_scheduler.py
+# Sky patch changes:
+# - no new nodes are allowed to be launched when the upscaling_speed is 0
+# - comment out "assert not unfulfilled": this seems a buggy assert
+
 """Implements multi-node-type autoscaling.
 
 This file implements an autoscaling algorithm that is aware of multiple node
@@ -448,7 +453,10 @@
                 + placement_group_nodes.get(node_type, 0),
             )
 
-            if upper_bound > 0:
+            # NOTE(sky): do not autoscale when upsclaing speed is 0.
+            if self.upscaling_speed == 0:
+                upper_bound = 0
+            if upper_bound >= 0:
                 updated_nodes_to_launch[node_type] = min(
                     upper_bound, to_launch[node_type]
                 )
@@ -592,7 +600,7 @@
             unfulfilled, including_reserved = get_bin_pack_residual(
                 new_node_resources, unfulfilled, strict_spread=True
             )
-            assert not unfulfilled
+            # assert not unfulfilled  # NOTE(sky): buggy assert.
             node_resources += including_reserved
         return to_add, node_resources, node_type_counts
 
sky/skylet/ray_patches/updater.py.diff
ADDED
--- a/updater.py
+++ b/updater.py
@@ -1,3 +1,7 @@
+# From https://github.com/ray-project/ray/blob/releases/2.9.3/python/ray/autoscaler/_private/updater.py
+# Sky patch changes:
+# - Ensure the node state is refreshed before checking the node is terminated.
+
 import logging
 import os
 import subprocess
@@ -325,6 +329,7 @@
             )
 
             time.sleep(READY_CHECK_INTERVAL)
+            self.provider.non_terminated_nodes({})
 
     def do_update(self):
         self.provider.set_node_tags(
sky/skylet/ray_patches/worker.py.diff
ADDED
--- a/worker.py
+++ b/worker.py
@@ -1,3 +1,7 @@
+# Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py
+# Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/9233
+# Tracked in PR https://github.com/ray-project/ray/pull/21977/files.
+
 import atexit
 import faulthandler
 import functools
@@ -2020,6 +2024,14 @@
     pid = data.get("pid")
     lines = data.get("lines", [])
 
+    def end_for(line: str) -> str:
+        if sys.platform == "win32":
+            return "\n"
+        if line.endswith("\r"):
+            return ""
+        return "\n"
+
+
     if data.get("ip") == data.get("localhost"):
         for line in lines:
             if RAY_TQDM_MAGIC in line:
@@ -2035,6 +2047,7 @@
                     message_for(data, line),
                 ),
                 file=print_file,
+                end=end_for(line),
             )
     else:
         for line in lines:
@@ -2052,6 +2065,7 @@
                     message_for(data, line),
                 ),
                 file=print_file,
+                end=end_for(line),
            )
     # Restore once at end of batch to avoid excess hiding/unhiding of tqdm.
     restore_tqdm()
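The end_for() helper added by this patch chooses the print terminator so that lines already ending in "\r" are not forced onto a new row (except on Windows), which keeps progress bars updating in place. A standalone copy of the helper with a small demo (the sample lines are made up):

# Standalone copy of end_for() plus a demo of in-place progress output.
import sys

def end_for(line: str) -> str:
    if sys.platform == 'win32':
        return '\n'
    if line.endswith('\r'):
        return ''
    return '\n'

for line in ['step 1/3\r', 'step 2/3\r', 'done']:
    print(line, end=end_for(line))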