skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in that registry.
Potentially problematic release: the registry flags this version of skypilot-nightly as possibly problematic.
- sky/__init__.py +4 -2
- sky/backends/backend_utils.py +69 -6
- sky/backends/cloud_vm_ray_backend.py +156 -25
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +40 -77
- sky/client/common.py +1 -1
- sky/client/sdk.py +19 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +14 -0
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-7fd0cf9dbecff10f.js → webpack-00c0a51d21157453.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +63 -7
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +3 -3
- sky/jobs/scheduler.py +23 -68
- sky/jobs/server/core.py +18 -12
- sky/jobs/state.py +6 -2
- sky/jobs/utils.py +8 -0
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +9 -0
- sky/provision/aws/instance.py +36 -13
- sky/provision/azure/instance.py +2 -0
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +2 -0
- sky/provision/do/instance.py +2 -0
- sky/provision/fluidstack/instance.py +2 -0
- sky/provision/gcp/instance.py +2 -0
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/kubernetes/instance.py +133 -0
- sky/provision/lambda_cloud/instance.py +2 -0
- sky/provision/nebius/instance.py +2 -0
- sky/provision/oci/instance.py +2 -0
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/runpod/instance.py +2 -0
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +2 -0
- sky/provision/vast/instance.py +2 -0
- sky/provision/vsphere/instance.py +2 -0
- sky/resources.py +1 -2
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +15 -16
- sky/serve/serve_state.py +10 -0
- sky/serve/serve_utils.py +21 -20
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +31 -16
- sky/server/server.py +20 -14
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +73 -0
- sky/utils/db/db_utils.py +17 -0
- sky/utils/schemas.py +3 -0
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +102 -94
- /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
sky/dashboard/out/users.html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-00c0a51d21157453.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c2ea34fda4f1f8c8.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-7ed36e44e779d5c7.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/users","query":{},"buildId":"Y0eNlwi85qGRecLTin11y","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>

sky/dashboard/out/volumes.html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-00c0a51d21157453.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c2ea34fda4f1f8c8.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"Y0eNlwi85qGRecLTin11y","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>

sky/dashboard/out/workspace/new.html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-00c0a51d21157453.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c2ea34fda4f1f8c8.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"Y0eNlwi85qGRecLTin11y","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>

sky/dashboard/out/workspaces/[name].html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-00c0a51d21157453.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c2ea34fda4f1f8c8.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1559-6c00e20454194859.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-37611fe6b86d274d.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-c9686994ddafcf01.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-0f886f16e0d55ff8.js" defer=""></script><script src="/dashboard/_next/static/chunks/8056-5bdeda81199c0def.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-85426374db04811e.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/9159-11421c0f2909236f.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-a8a8f1adba34c892.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-f72f73bcef9541dc.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"Y0eNlwi85qGRecLTin11y","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>

sky/dashboard/out/workspaces.html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-00c0a51d21157453.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c2ea34fda4f1f8c8.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-8f67be60165724cc.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"Y0eNlwi85qGRecLTin11y","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/data/storage.py
CHANGED
@@ -4510,9 +4510,19 @@ class R2Store(S3CompatibleStore):
             extra_cli_args=['--checksum-algorithm', 'CRC32'],  # R2 specific
             cloud_name=cloudflare.NAME,
             default_region='auto',
-            mount_cmd_factory=
+            mount_cmd_factory=cls._get_r2_mount_cmd,
         )
 
+    @classmethod
+    def _get_r2_mount_cmd(cls, bucket_name: str, mount_path: str,
+                          bucket_sub_path: Optional[str]) -> str:
+        """Factory method for R2 mount command."""
+        endpoint_url = cloudflare.create_endpoint()
+        return mounting_utils.get_r2_mount_cmd(cloudflare.R2_CREDENTIALS_PATH,
+                                               cloudflare.R2_PROFILE_NAME,
+                                               endpoint_url, bucket_name,
+                                               mount_path, bucket_sub_path)
+
     def mount_cached_command(self, mount_path: str) -> str:
         """R2-specific cached mount implementation using rclone."""
         install_cmd = mounting_utils.get_rclone_install_cmd()
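The storage.py hunk routes R2 mounting through a `mount_cmd_factory` callback instead of an inline command. Below is a minimal, self-contained sketch of that callback pattern; `StoreConfig` and `build_fuse_mount_cmd` are hypothetical stand-ins rather than SkyPilot APIs, and only the `(bucket_name, mount_path, bucket_sub_path) -> str` shape of the callback is taken from the diff.

```python
# Sketch of a mount-command factory callback (names are illustrative only).
from dataclasses import dataclass
from typing import Callable, Optional

MountCmdFactory = Callable[[str, str, Optional[str]], str]


@dataclass
class StoreConfig:
    cloud_name: str
    default_region: str
    mount_cmd_factory: MountCmdFactory


def build_fuse_mount_cmd(bucket_name: str, mount_path: str,
                         bucket_sub_path: Optional[str]) -> str:
    # A real factory (like _get_r2_mount_cmd above) would also resolve
    # credentials and an endpoint before rendering the command.
    sub = f'/{bucket_sub_path}' if bucket_sub_path else ''
    return f'mount-tool {bucket_name}{sub} {mount_path}'


config = StoreConfig(cloud_name='r2', default_region='auto',
                     mount_cmd_factory=build_fuse_mount_cmd)
print(config.mount_cmd_factory('my-bucket', '/mnt/data', None))
```

Keeping the factory as a classmethod lets each store subclass supply its own mount command while sharing the rest of the configuration.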
sky/exceptions.py
CHANGED
sky/global_user_state.py
CHANGED
@@ -645,13 +645,32 @@ def add_cluster_event(cluster_name: str,
                       new_status: Optional[status_lib.ClusterStatus],
                       reason: str,
                       event_type: ClusterEventType,
-                      nop_if_duplicate: bool = False
+                      nop_if_duplicate: bool = False,
+                      duplicate_regex: Optional[str] = None,
+                      expose_duplicate_error: bool = False,
+                      transitioned_at: Optional[int] = None) -> None:
+    """Add a cluster event.
+
+    Args:
+        cluster_name: Name of the cluster.
+        new_status: New status of the cluster.
+        reason: Reason for the event.
+        event_type: Type of the event.
+        nop_if_duplicate: If True, do not add the event if it is a duplicate.
+        duplicate_regex: If provided, do not add the event if it matches the
+            regex. Only used if nop_if_duplicate is True.
+        expose_duplicate_error: If True, raise an error if the event is a
+            duplicate. Only used if nop_if_duplicate is True.
+        transitioned_at: If provided, use this timestamp for the event.
+    """
     assert _SQLALCHEMY_ENGINE is not None
     cluster_hash = _get_hash_for_existing_cluster(cluster_name)
     if cluster_hash is None:
         logger.debug(f'Hash for cluster {cluster_name} not found. '
                      'Skipping event.')
         return
+    if transitioned_at is None:
+        transitioned_at = int(time.time())
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
                 db_utils.SQLAlchemyDialect.SQLITE.value):
@@ -669,7 +688,10 @@ def add_cluster_event(cluster_name: str,
         if nop_if_duplicate:
             last_event = get_last_cluster_event(cluster_hash,
                                                 event_type=event_type)
-            if last_event
+            if duplicate_regex is not None and last_event is not None:
+                if re.search(duplicate_regex, last_event):
+                    return
+            elif last_event == reason:
                 return
         try:
             session.execute(
@@ -679,15 +701,20 @@ def add_cluster_event(cluster_name: str,
                     starting_status=last_status,
                     ending_status=new_status.value if new_status else None,
                     reason=reason,
-                    transitioned_at=
+                    transitioned_at=transitioned_at,
                     type=event_type.value,
                 ))
             session.commit()
         except sqlalchemy.exc.IntegrityError as e:
             if 'UNIQUE constraint failed' in str(e):
                 # This can happen if the cluster event is added twice.
-                # We can ignore this error
-
+                # We can ignore this error unless the caller requests
+                # to expose the error.
+                if expose_duplicate_error:
+                    raise db_utils.UniqueConstraintViolationError(
+                        value=reason, message=str(e))
+                else:
+                    pass
             else:
                 raise e
 
@@ -704,6 +731,35 @@ def get_last_cluster_event(cluster_hash: str,
         return row.reason
 
 
+def get_cluster_events(cluster_name: Optional[str], cluster_hash: Optional[str],
+                       event_type: ClusterEventType) -> List[str]:
+    """Returns the cluster events for the cluster.
+
+    Args:
+        cluster_name: Name of the cluster. Cannot be specified if cluster_hash
+            is specified.
+        cluster_hash: Hash of the cluster. Cannot be specified if cluster_name
+            is specified.
+        event_type: Type of the event.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+
+    if cluster_name is not None and cluster_hash is not None:
+        raise ValueError('Cannot specify both cluster_name and cluster_hash')
+    if cluster_name is None and cluster_hash is None:
+        raise ValueError('Must specify either cluster_name or cluster_hash')
+    if cluster_name is not None:
+        cluster_hash = _get_hash_for_existing_cluster(cluster_name)
+        if cluster_hash is None:
+            raise ValueError(f'Hash for cluster {cluster_name} not found.')
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(cluster_event_table).filter_by(
+            cluster_hash=cluster_hash, type=event_type.value).order_by(
+                cluster_event_table.c.transitioned_at.asc()).all()
+        return [row.reason for row in rows]
+
+
 def _get_user_hash_or_current_user(user_hash: Optional[str]) -> str:
     """Returns the user hash or the current user hash, if user_hash is None.
 
@@ -1245,9 +1301,9 @@ def get_clusters_from_history(
 def get_cluster_names_start_with(starts_with: str) -> List[str]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        rows = session.query(cluster_table).filter(
+        rows = session.query(cluster_table.c.name).filter(
            cluster_table.c.name.like(f'{starts_with}%')).all()
-        return [row
+        return [row[0] for row in rows]
 
 
 @_init_db
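The add_cluster_event changes add regex-based duplicate suppression (and an opt-in UniqueConstraintViolationError for true duplicates). A standalone sketch of just the suppression decision is below, assuming `last_event` stands in for the result of `get_last_cluster_event(...)`; no database access is involved.

```python
# Sketch of the duplicate-suppression decision added in the hunk above.
import re
from typing import Optional


def should_skip_event(reason: str,
                      last_event: Optional[str],
                      nop_if_duplicate: bool = False,
                      duplicate_regex: Optional[str] = None) -> bool:
    """Return True if the new event should be dropped as a duplicate."""
    if not nop_if_duplicate:
        return False
    if duplicate_regex is not None and last_event is not None:
        # Match the regex against the previous event's reason, as in the diff.
        return re.search(duplicate_regex, last_event) is not None
    # Fall back to exact-reason comparison.
    return last_event == reason


# Example: repeated retry messages collapse into a single recorded event.
print(should_skip_event('Retrying autostop (attempt 3)',
                        'Retrying autostop (attempt 2)',
                        nop_if_duplicate=True,
                        duplicate_regex=r'Retrying autostop'))  # True
```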
sky/jobs/constants.py
CHANGED
@@ -47,7 +47,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
 # The version of the lib files that jobs/utils use. Whenever there is an API
 # change for the jobs/utils, we need to bump this version and update
 # job.utils.ManagedJobCodeGen to handle the version update.
-MANAGED_JOBS_VERSION =
+MANAGED_JOBS_VERSION = 8
 
 # The command for setting up the jobs dashboard on the controller. It firstly
 # checks if the systemd services are available, and if not (e.g., Kubernetes
sky/jobs/controller.py
CHANGED
@@ -30,7 +30,6 @@ from sky.jobs import recovery_strategy
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
-from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
sky/jobs/recovery_strategy.py
CHANGED
@@ -10,8 +10,8 @@ import traceback
 import typing
 from typing import Optional
 
-import sky
 from sky import backends
+from sky import dag as dag_lib
 from sky import exceptions
 from sky import execution
 from sky import global_user_state
@@ -61,7 +61,7 @@ class StrategyExecutor:
         """
         assert isinstance(backend, backends.CloudVmRayBackend), (
             'Only CloudVMRayBackend is supported.')
-        self.dag =
+        self.dag = dag_lib.Dag()
         self.dag.add(task)
         # For jobs submitted to a pool, the cluster name might change after each
         # recovery. Initially this is set to an empty string to indicate that no
@@ -447,7 +447,7 @@ class StrategyExecutor:
             # We retry immediately for worker pool, since no sky.launch()
             # is called and the overhead is minimal.
             gap_seconds = (backoff.current_backoff()
-                           if self.pool is None else
+                           if self.pool is None else 1)
             logger.info('Retrying to launch the cluster in '
                         f'{gap_seconds:.1f} seconds.')
             time.sleep(gap_seconds)
sky/jobs/scheduler.py
CHANGED
@@ -15,13 +15,14 @@ following section for more details).
 
 The scheduling logic limits #running jobs according to three limits:
 1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
-   once, based on the number of CPUs.
-
-
+   once, based on the number of CPUs. This the most compute-intensive part of
+   the job lifecycle, which is why we have an additional limit.
+   See sky/utils/controller_utils.py::_get_launch_parallelism.
 2. The number of jobs that can be running at any given time, based on the amount
-   of memory.
-
-
+   of memory. Since the job controller is doing very little once a job starts
+   (just checking its status periodically), the most significant resource it
+   consumes is memory.
+   See sky/utils/controller_utils.py::_get_job_parallelism.
 3. The number of jobs that can be running in a pool at any given time, based on
    the number of ready workers in the pool. (See _can_start_new_job.)
 
@@ -42,55 +43,27 @@ Nomenclature:
 
 from argparse import ArgumentParser
 import contextlib
-from functools import lru_cache
 import os
 import sys
 import time
-import typing
 from typing import Optional
 
 import filelock
 
 from sky import exceptions
 from sky import sky_logging
-from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
 from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 
-if typing.TYPE_CHECKING:
-    import psutil
-else:
-    psutil = adaptors_common.LazyImport('psutil')
-
 logger = sky_logging.init_logger('sky.jobs.controller')
 
-# The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
-# parallelism control or updating the schedule_state of any job.
-# Any code that takes this lock must conclude by calling
-# maybe_schedule_next_jobs.
-_MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
 _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
 
-# Based on testing, assume a running job uses 350MB memory.
-JOB_MEMORY_MB = 350
-# Past 2000 simultaneous jobs, we become unstable.
-# See https://github.com/skypilot-org/skypilot/issues/4649.
-MAX_JOB_LIMIT = 2000
-# Number of ongoing launches launches allowed per CPU.
-LAUNCHES_PER_CPU = 4
-
-
-@lru_cache(maxsize=1)
-def _get_lock_path() -> str:
-    # TODO(tian): Per pool lock.
-    path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    return path
-
 
 def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
                       pool: Optional[str]) -> None:
@@ -163,7 +136,8 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
     # parallelism control. If we cannot obtain the lock, exit immediately.
     # The current lock holder is expected to launch any jobs it can before
     # releasing the lock.
-    with filelock.FileLock(
+    with filelock.FileLock(controller_utils.get_resources_lock_path(),
+                           blocking=False):
         while True:
             maybe_next_job = state.get_waiting_job(pool)
             if maybe_next_job is None:
@@ -184,7 +158,8 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
             # an ALIVE_WAITING job, but we would be able to launch a WAITING
             # job.
             if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
-                if not
+                if not (controller_utils.can_provision() or
+                        actual_pool is not None):
                     # Can't schedule anything, break from scheduling loop.
                     break
             elif current_state == state.ManagedJobScheduleState.WAITING:
@@ -234,7 +209,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
 
     The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
     """
-    with filelock.FileLock(
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
         is_resume = state.scheduler_set_waiting(job_id, dag_yaml_path,
                                                 original_user_yaml_path,
                                                 env_file_path,
@@ -286,11 +261,11 @@ def scheduled_launch(job_id: int):
     except exceptions.NoClusterLaunchedError:
         # NoClusterLaunchedError is indicates that the job is in retry backoff.
         # We should transition to ALIVE_BACKOFF instead of ALIVE.
-        with filelock.FileLock(
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
            state.scheduler_set_alive_backoff(job_id)
        raise
    else:
-        with filelock.FileLock(
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
            state.scheduler_set_alive(job_id)
    finally:
        maybe_schedule_next_jobs(pool)
@@ -310,56 +285,36 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
         return
     pool = state.get_pool_from_job_id(job_id)
 
-    with filelock.FileLock(
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
         state.scheduler_set_done(job_id, idempotent)
         maybe_schedule_next_jobs(pool)
 
 
 def _set_alive_waiting(job_id: int) -> None:
     """Should use wait_until_launch_okay() to transition to this state."""
-    with filelock.FileLock(
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
         state.scheduler_set_alive_waiting(job_id)
     pool = state.get_pool_from_job_id(job_id)
     maybe_schedule_next_jobs(pool)
 
 
-def _get_job_parallelism() -> int:
-    job_memory = JOB_MEMORY_MB * 1024 * 1024
-
-    job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
-
-    return max(job_limit, 1)
-
-
-def _get_launch_parallelism() -> int:
-    cpus = os.cpu_count()
-    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
-
-
 def _can_start_new_job(pool: Optional[str]) -> bool:
-    launching_jobs = state.get_num_launching_jobs()
-    alive_jobs = state.get_num_alive_jobs()
-
     # Check basic resource limits
-
-
+    # Pool jobs don't need to provision resources, so we skip the check.
+    if not ((controller_utils.can_provision() or pool is not None) and
+            controller_utils.can_start_new_process()):
         return False
 
-    # Check if there are available
+    # Check if there are available workers in the pool
     if pool is not None:
         alive_jobs_in_pool = state.get_num_alive_jobs(pool)
-        if alive_jobs_in_pool >= serve_utils.
-            logger.debug(f'No
+        if alive_jobs_in_pool >= len(serve_utils.get_ready_replicas(pool)):
+            logger.debug(f'No READY workers available in pool {pool}')
             return False
 
     return True
 
 
-def _can_lauch_in_alive_job() -> bool:
-    launching_jobs = state.get_num_launching_jobs()
-    return launching_jobs < _get_launch_parallelism()
-
-
 if __name__ == '__main__':
     parser = ArgumentParser()
     parser.add_argument('dag_yaml',
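The scheduler hunks replace the module-local lock path and parallelism constants with controller_utils helpers, and keep acquiring the scheduling lock non-blockingly. Below is a sketch of that non-blocking gate, assuming a filelock version that accepts `blocking=` in the constructor (as the diff itself does) and using a placeholder lock path rather than `controller_utils.get_resources_lock_path()`.

```python
# Sketch of a non-blocking scheduler gate built on filelock.
import os

import filelock

# Placeholder path; the real scheduler resolves this via controller_utils.
LOCK_PATH = os.path.expanduser('~/.sky/locks/example_scheduler.lock')
os.makedirs(os.path.dirname(LOCK_PATH), exist_ok=True)


def try_schedule() -> None:
    try:
        # blocking=False fails fast if another pass holds the lock; the
        # current holder is expected to schedule everything it can before
        # releasing, so losing the race is harmless.
        with filelock.FileLock(LOCK_PATH, blocking=False):
            print('Got the lock; schedule waiting jobs here.')
    except filelock.Timeout:
        print('Lock busy; skipping this scheduling pass.')


try_schedule()
```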
sky/jobs/server/core.py
CHANGED
@@ -93,8 +93,8 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     return local_to_controller_file_mounts
 
 
-def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
-                              num_jobs:
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
+                              num_jobs: int) -> Optional[List[int]]:
     """Submit the managed job locally if in consolidation mode.
 
     In normal mode the managed job submission is done in the ray job submission.
@@ -109,12 +109,13 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag', pool: Optional[str],
     # Create local directory for the managed job.
     pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
     job_ids = []
+    pool = dag.pool
     pool_hash = None
     if pool is not None:
         pool_hash = serve_state.get_service_hash(pool)
         # Already checked in the sdk.
         assert pool_hash is not None, f'Pool {pool} not found'
-    for _ in range(num_jobs
+    for _ in range(num_jobs):
         # TODO(tian): We should have a separate name for each job when
         # submitting multiple jobs. Current blocker is that we are sharing
         # the same dag object for all jobs. Maybe we can do copy.copy() for
@@ -172,9 +173,6 @@ def launch(
         handle: Optional[backends.ResourceHandle]; handle to the controller VM.
             None if dryrun.
     """
-    if pool is not None and not managed_job_utils.is_consolidation_mode():
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError('pool is only supported in consolidation mode.')
     entrypoint = task
     # using hasattr instead of isinstance to avoid importing sky
     if hasattr(task, 'metadata'):
@@ -295,8 +293,13 @@ def launch(
         controller=controller,
         task_resources=sum([list(t.resources) for t in dag.tasks], []))
 
+    num_jobs = num_jobs if num_jobs is not None else 1
+    # We do this assignment after applying the admin policy, so that we don't
+    # need to serialize the pool name in the dag. The dag object will be
+    # preserved. See sky/admin_policy.py::MutatedUserRequest::decode.
+    dag.pool = pool
     consolidation_mode_job_ids = _maybe_submit_job_locally(
-        prefix, dag,
+        prefix, dag, num_jobs)
 
     # This is only needed for non-consolidation mode. For consolidation
     # mode, the controller uses the same catalog as API server.
@@ -373,8 +376,8 @@ def launch(
     controller_task._metadata = metadata
 
     job_identity = ''
-    if
-        job_identity = f' (
+    if job_rank is not None:
+        job_identity = f' (rank: {job_rank})'
     logger.info(f'{colorama.Fore.YELLOW}'
                 f'Launching managed job {dag.name!r}{job_identity} '
                 f'from jobs controller...{colorama.Style.RESET_ALL}')
@@ -428,14 +431,17 @@ def launch(
         backend.run_on_head(local_handle, run_script)
         return consolidation_mode_job_id, local_handle
 
-    if consolidation_mode_job_ids is None:
-        return _submit_one()
     if pool is None:
+        if consolidation_mode_job_ids is None:
+            return _submit_one()
         assert len(consolidation_mode_job_ids) == 1
         return _submit_one(consolidation_mode_job_ids[0])
+
     ids = []
     all_handle = None
-    for job_rank
+    for job_rank in range(num_jobs):
+        job_id = (consolidation_mode_job_ids[job_rank]
+                  if consolidation_mode_job_ids is not None else None)
         jid, handle = _submit_one(job_id, job_rank)
         assert jid is not None, (job_id, handle)
         ids.append(jid)
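The launch() hunk fans out one submission per job rank when a pool is used, reusing pre-allocated consolidation-mode job IDs where available. A generic sketch of that fan-out is below; `submit_one` is a hypothetical stand-in for `_submit_one`, and only the ID-reuse logic mirrors the diff.

```python
# Sketch of rank-based fan-out with optional pre-allocated job IDs.
from typing import List, Optional, Tuple


def submit_one(job_id: Optional[int], job_rank: int) -> Tuple[int, str]:
    # Pretend submission: allocate an ID only if none was pre-created.
    return (job_id if job_id is not None else 1000 + job_rank,
            f'handle-{job_rank}')


def submit_all(num_jobs: int,
               preallocated_ids: Optional[List[int]]) -> List[int]:
    ids = []
    for job_rank in range(num_jobs):
        job_id = (preallocated_ids[job_rank]
                  if preallocated_ids is not None else None)
        jid, _handle = submit_one(job_id, job_rank)
        ids.append(jid)
    return ids


print(submit_all(3, None))    # [1000, 1001, 1002]
print(submit_all(2, [7, 8]))  # [7, 8]
```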
sky/jobs/state.py
CHANGED
@@ -441,7 +441,8 @@ class ManagedJobScheduleState(enum.Enum):
 
 # === Status transition functions ===
 @_init_db
-def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str
+def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
+                 pool: Optional[str], pool_hash: Optional[str]):
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -457,7 +458,10 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
             name=name,
             schedule_state=ManagedJobScheduleState.INACTIVE.value,
             workspace=workspace,
-            entrypoint=entrypoint
+            entrypoint=entrypoint,
+            pool=pool,
+            pool_hash=pool_hash,
+        )
         session.execute(insert_stmt)
         session.commit()
sky/jobs/utils.py
CHANGED
@@ -1690,6 +1690,7 @@ class ManagedJobCodeGen:
     def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
                     workspace: str, entrypoint: str) -> str:
         dag_name = managed_job_dag.name
+        pool = managed_job_dag.pool
         # Add the managed job to queue table.
         code = textwrap.dedent(f"""\
             set_job_info_kwargs = {{'workspace': {workspace!r}}}
@@ -1697,6 +1698,13 @@ class ManagedJobCodeGen:
                 set_job_info_kwargs = {{}}
             if managed_job_version >= 5:
                 set_job_info_kwargs['entrypoint'] = {entrypoint!r}
+            if managed_job_version >= 8:
+                from sky.serve import serve_state
+                pool_hash = None
+                if {pool!r} != None:
+                    pool_hash = serve_state.get_service_hash({pool!r})
+                set_job_info_kwargs['pool'] = {pool!r}
+                set_job_info_kwargs['pool_hash'] = pool_hash
             managed_job_state.set_job_info(
                 {job_id}, {dag_name!r}, **set_job_info_kwargs)
             """)
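ManagedJobCodeGen guards the new pool arguments behind `managed_job_version >= 8`, so newer clients can still drive controllers running older library code. Below is a minimal sketch of that version-gated codegen pattern; the generated snippet and the `remote_version` variable are illustrative, not the real controller protocol.

```python
# Sketch of version-gated code generation with textwrap.dedent and !r
# interpolation, in the style of the hunk above.
import textwrap
from typing import Optional


def generate_set_pending_code(job_id: int, dag_name: str,
                              pool: Optional[str]) -> str:
    # Interpolating with !r makes strings and None round-trip as literals
    # inside the generated source.
    return textwrap.dedent(f"""\
        kwargs = {{}}
        if remote_version >= 8:
            kwargs['pool'] = {pool!r}
        print('set_job_info', {job_id}, {dag_name!r}, kwargs)
        """)


code = generate_set_pending_code(42, 'train-dag', 'gpu-pool')
# The remote side would execute this against its own library version.
exec(code, {'remote_version': 8})
```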
sky/provision/__init__.py
CHANGED
sky/provision/aws/config.py
CHANGED
@@ -19,6 +19,7 @@ import colorama
 from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import aws
+from sky.clouds import aws as aws_cloud
 from sky.provision import common
 from sky.provision.aws import utils
 from sky.utils import annotations
@@ -103,6 +104,14 @@ def bootstrap_instances(
     security_group_ids = _configure_security_group(ec2, vpc_id,
                                                    expected_sg_name,
                                                    extended_ip_rules)
+    if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
+        # Ensure the default security group is created. This is needed
+        # to enable us to use the default security group to quickly
+        # delete the cluster. If the default security group is not created,
+        # we will need to block on instance termination to delete the
+        # security group.
+        _configure_security_group(ec2, vpc_id,
+                                  aws_cloud.DEFAULT_SECURITY_GROUP_NAME, [])
     end_time = time.time()
     elapsed = end_time - start_time
     logger.info(