skypilot_nightly-1.0.0.dev20251005-py3-none-any.whl → skypilot_nightly-1.0.0.dev20251009-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: the registry flags this version of skypilot-nightly as possibly problematic.
- sky/__init__.py +2 -2
- sky/authentication.py +17 -21
- sky/backends/backend.py +1 -3
- sky/backends/cloud_vm_ray_backend.py +76 -54
- sky/backends/local_docker_backend.py +0 -5
- sky/client/cli/command.py +6 -6
- sky/client/sdk.py +24 -23
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +11 -0
- sky/dashboard/out/_next/static/chunks/{9037-d0c00018a5ba198c.js → 1871-49141c317f3a9020.js} +2 -2
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.a19328ba41517b8b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/{5339.4a881570243431a5.js → 9360.71e83b2ddc844ec2.js} +4 -24
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-72794fc3fcdd517a.js → [job]-8f058b0346db2aff.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-477555ab7c0b13d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-2f61f65487f6d8ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-553b8b5cb65e100b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-910a22500c50596f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-dd64309c3fe67ed2.js → [job]-4f7079dcab6ed653.js} +7 -2
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-509b2977a6373bf6.js → [pool]-bc979970c247d8f3.js} +7 -2
- sky/dashboard/out/_next/static/chunks/pages/jobs-a35a9dc3c5ccd657.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-98d2ed979084162a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-835d14ba94808f79.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-e8688c35c06f0ac5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7528cc0ef8c522c5.js → workspaces-69c80d677d3c2949.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +1 -0
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +1 -11
- sky/global_user_state.py +16 -5
- sky/jobs/constants.py +1 -7
- sky/jobs/controller.py +19 -3
- sky/jobs/recovery_strategy.py +3 -1
- sky/jobs/scheduler.py +30 -15
- sky/jobs/server/core.py +8 -3
- sky/jobs/utils.py +30 -2
- sky/metrics/utils.py +65 -37
- sky/provision/instance_setup.py +32 -10
- sky/provision/kubernetes/instance.py +18 -3
- sky/provision/kubernetes/utils.py +4 -1
- sky/provision/provisioner.py +10 -7
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/server/common.py +1 -0
- sky/server/config.py +2 -0
- sky/server/metrics.py +3 -1
- sky/server/requests/executor.py +103 -77
- sky/server/requests/requests.py +26 -11
- sky/server/server.py +16 -0
- sky/skylet/constants.py +9 -1
- sky/skylet/events.py +17 -0
- sky/skylet/skylet.py +3 -0
- sky/templates/kubernetes-ray.yml.j2 +6 -1
- sky/utils/context_utils.py +5 -1
- sky/utils/controller_utils.py +14 -0
- sky/utils/db/db_utils.py +2 -0
- sky/utils/db/migration_utils.py +11 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251009.dist-info}/METADATA +36 -36
- {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251009.dist-info}/RECORD +85 -84
- sky/dashboard/out/_next/static/Vg53Kzbf7u4o6fYPeOHMe/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3294.93d9336bdc032b3a.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e052384df65ef200.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-3286453d56f3c0a0.js +0 -1
- /sky/dashboard/out/_next/static/{Vg53Kzbf7u4o6fYPeOHMe → hIViZcQBkn0HE8SpaSsUU}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251009.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251009.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251009.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251009.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6a5ddd0184bfa22c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7359-c8d04e06886000b3.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-66237729cdf9749e.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-f6818c84ed8f1c86.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-d0782b9251f0fcd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/3015-8d748834fcc60b46.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-3b40c39626f99c89.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-e8688c35c06f0ac5.js" defer=""></script><script src="/dashboard/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hIViZcQBkn0HE8SpaSsUU/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"hIViZcQBkn0HE8SpaSsUU","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspaces.html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6a5ddd0184bfa22c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-69c80d677d3c2949.js" defer=""></script><script src="/dashboard/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/hIViZcQBkn0HE8SpaSsUU/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"hIViZcQBkn0HE8SpaSsUU","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/execution.py
CHANGED
@@ -112,7 +112,6 @@ def _execute(
     stages: Optional[List[Stage]] = None,
     cluster_name: Optional[str] = None,
     detach_setup: bool = False,
-    detach_run: bool = False,
     idle_minutes_to_autostop: Optional[int] = None,
     no_setup: bool = False,
     clone_disk_from: Optional[str] = None,
@@ -157,8 +156,6 @@ def _execute(
        job itself. You can safely ctrl-c to detach from logging, and it will
        not interrupt the setup process. To see the logs again after detaching,
        use `sky logs`. To cancel setup, cancel the job via `sky cancel`.
-      detach_run: If True, as soon as a job is submitted, return from this
-        function and do not stream execution logs.
      idle_minutes_to_autostop: int; if provided, the cluster will be set to
        autostop after this many minutes of idleness.
      no_setup: bool; whether to skip setup commands or not when (re-)launching.
@@ -217,7 +214,6 @@ def _execute(
         stages=stages,
         cluster_name=cluster_name,
         detach_setup=detach_setup,
-        detach_run=detach_run,
         no_setup=no_setup,
         clone_disk_from=clone_disk_from,
         skip_unnecessary_provisioning=skip_unnecessary_provisioning,
@@ -239,7 +235,6 @@ def _execute_dag(
     stages: Optional[List[Stage]],
     cluster_name: Optional[str],
     detach_setup: bool,
-    detach_run: bool,
     no_setup: bool,
     clone_disk_from: Optional[str],
     skip_unnecessary_provisioning: bool,
@@ -507,10 +502,7 @@ def _execute_dag(
     if Stage.EXEC in stages:
         try:
             global_user_state.update_last_use(handle.get_cluster_name())
-            job_id = backend.execute(handle,
-                                     task,
-                                     detach_run,
-                                     dryrun=dryrun)
+            job_id = backend.execute(handle, task, dryrun=dryrun)
         finally:
             # Enables post_execute() to be run after KeyboardInterrupt.
             backend.post_execute(handle, down)
@@ -707,7 +699,6 @@ def launch(
         stages=stages,
         cluster_name=cluster_name,
         detach_setup=detach_setup,
-        detach_run=True,
         idle_minutes_to_autostop=idle_minutes_to_autostop,
         no_setup=no_setup,
         clone_disk_from=clone_disk_from,
@@ -802,6 +793,5 @@ def exec( # pylint: disable=redefined-builtin
             Stage.EXEC,
         ],
         cluster_name=cluster_name,
-        detach_run=True,
         job_logger=job_logger,
     )
sky/global_user_state.py
CHANGED
@@ -2495,11 +2495,22 @@ def _set_cluster_yaml_from_file(cluster_yaml_path: str,
     # on the local file system and migrate it to the database.
     # TODO(syang): remove this check once we have a way to migrate the
     # cluster from file to database. Remove on v0.12.0.
-    if cluster_yaml_path is not None
-    [4 removed lines truncated in this diff rendering]
+    if cluster_yaml_path is not None:
+        # First try the exact path
+        path_to_read = None
+        if os.path.exists(cluster_yaml_path):
+            path_to_read = cluster_yaml_path
+        # Fallback: try with .debug suffix (when debug logging was enabled)
+        # Debug logging causes YAML files to be saved with .debug suffix
+        # but the path stored in the handle doesn't include it
+        debug_path = cluster_yaml_path + '.debug'
+        if os.path.exists(debug_path):
+            path_to_read = debug_path
+        if path_to_read is not None:
+            with open(path_to_read, 'r', encoding='utf-8') as f:
+                yaml_str = f.read()
+            set_cluster_yaml(cluster_name, yaml_str)
+            return yaml_str
     return None
 
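The fallback added above first checks the recorded path and then a '.debug'-suffixed copy written when debug logging is enabled. A minimal standalone sketch of that resolution order, using a hypothetical helper name that is not part of SkyPilot's API:

import os
from typing import Optional


def resolve_cluster_yaml_path(cluster_yaml_path: str) -> Optional[str]:
    # Prefer the exact path recorded in the cluster handle.
    if os.path.exists(cluster_yaml_path):
        return cluster_yaml_path
    # Fall back to the '.debug' copy that debug logging may have produced.
    debug_path = cluster_yaml_path + '.debug'
    if os.path.exists(debug_path):
        return debug_path
    return None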
sky/jobs/constants.py
CHANGED
@@ -15,16 +15,10 @@ JOB_CONTROLLER_INDICATOR_FILE = '~/.sky/is_jobs_controller'
 CONSOLIDATED_SIGNAL_PATH = os.path.expanduser('~/.sky/signals/')
 SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
 # Resources as a dict for the jobs controller.
-# Use smaller CPU instance type for jobs controller, but with more memory, i.e.
-# r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
-# and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
-# Concurrently limits are set based on profiling. 4x num vCPUs is the launch
-# parallelism limit, and memory / 350MB is the limit to concurrently running
-# jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
 # We use 50 GB disk size to reduce the cost.
 CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = {
     'cpus': '4+',
-    'memory': '
+    'memory': '4x',
     'disk_size': 50
 }
 
sky/jobs/controller.py
CHANGED
@@ -870,8 +870,16 @@ class Controller:
         # because when SkyPilot API server machine sends the yaml config to
         # the controller machine, only storage metadata is sent, not the
         # storage object itself.
-        [removed line truncated in this diff rendering]
-        storage.
+        try:
+            for storage in task.storage_mounts.values():
+                storage.construct()
+        except (exceptions.StorageSpecError, exceptions.StorageError) as e:
+            job_logger.warning(
+                f'Failed to construct storage object for teardown: {e}\n'
+                'This may happen because storage construction already '
+                'failed during launch, storage was deleted externally, '
+                'credentials expired/changed, or network connectivity '
+                'issues.')
         try:
             backend.teardown_ephemeral_storage(task)
         except Exception as e:  # pylint: disable=broad-except
@@ -1144,7 +1152,15 @@ class Controller:
                 await asyncio.sleep(30)
                 continue
 
-            if
+            # Normally, 200 jobs can run on each controller. But if we have a
+            # ton of controllers, we need to limit the number of jobs that can
+            # run on each controller, to achieve a total of 2000 jobs across all
+            # controllers.
+            max_jobs = min(scheduler.MAX_JOBS_PER_WORKER,
+                           (scheduler.MAX_TOTAL_RUNNING_JOBS //
+                            scheduler.get_number_of_controllers()))
+
+            if len(running_tasks) >= max_jobs:
                 await asyncio.sleep(60)
                 continue
 
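The max_jobs computation above caps each controller at the smaller of the per-worker limit and an even share of the cluster-wide limit. A short worked example using the constants from sky/jobs/scheduler.py (controller counts here are illustrative, not measured):

MAX_JOBS_PER_WORKER = 200      # per-controller ceiling (sky/jobs/scheduler.py)
MAX_TOTAL_RUNNING_JOBS = 2000  # cluster-wide ceiling (sky/jobs/scheduler.py)


def max_jobs_per_controller(num_controllers: int) -> int:
    # Mirrors: min(MAX_JOBS_PER_WORKER, MAX_TOTAL_RUNNING_JOBS // num_controllers)
    return min(MAX_JOBS_PER_WORKER, MAX_TOTAL_RUNNING_JOBS // num_controllers)


assert max_jobs_per_controller(4) == 200   # 2000 // 4 = 500, capped at 200
assert max_jobs_per_controller(16) == 125  # 2000 // 16 = 125, below the cap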
sky/jobs/recovery_strategy.py
CHANGED
@@ -495,7 +495,9 @@ class StrategyExecutor:
                 self._logger.info('Managed job cluster launched.')
             except (exceptions.InvalidClusterNameError,
                     exceptions.NoCloudAccessError,
-                    exceptions.ResourcesMismatchError
+                    exceptions.ResourcesMismatchError,
+                    exceptions.StorageSpecError,
+                    exceptions.StorageError) as e:
                 self._logger.error(
                     'Failure happened before provisioning. '
                     f'{common_utils.format_exception(e)}')
sky/jobs/scheduler.py
CHANGED
@@ -63,7 +63,9 @@ from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
 from sky.server import config as server_config
 from sky.skylet import constants
+from sky.utils import annotations
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 
 if typing.TYPE_CHECKING:
@@ -91,20 +93,29 @@ JOB_MEMORY_MB = 400
 LAUNCHES_PER_WORKER = 8
 # this can probably be increased to around 300-400 but keeping it lower to just
 # to be safe
-[7 removed lines truncated in this diff rendering]
+MAX_JOBS_PER_WORKER = 200
+# Maximum number of controllers that can be running. Hard to handle more than
+# 512 launches at once.
+MAX_CONTROLLERS = 512 // LAUNCHES_PER_WORKER
+# Limit the number of jobs that can be running at once on the entire jobs
+# controller cluster. It's hard to handle cancellation of more than 2000 jobs at
+# once.
+# TODO(cooperc): Once we eliminate static bottlenecks (e.g. sqlite), remove this
+# hardcoded max limit.
+MAX_TOTAL_RUNNING_JOBS = 2000
 # Maximum values for above constants. There will start to be lagging issues
 # at these numbers already.
 # JOB_MEMORY_MB = 200
 # LAUNCHES_PER_WORKER = 16
 # JOBS_PER_WORKER = 400
 
+# keep 2GB reserved after the controllers
+MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
+
+CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash')
+
 
+@annotations.lru_cache(scope='global')
 def get_number_of_controllers() -> int:
     """Returns the number of controllers that should be running.
 
@@ -123,7 +134,7 @@ def get_number_of_controllers() -> int:
     consolidation_mode = skypilot_config.get_nested(
         ('jobs', 'controller', 'consolidation_mode'), default_value=False)
 
-    total_memory_mb =
+    total_memory_mb = controller_utils.get_controller_mem_size_gb() * 1024
     if consolidation_mode:
         config = server_config.compute_server_config(deploy=True, quiet=True)
 
@@ -136,13 +147,16 @@ def get_number_of_controllers() -> int:
                 config.short_worker_config.burstable_parallelism) * \
             server_config.SHORT_WORKER_MEM_GB * 1024
 
-        return
+        return min(MAX_CONTROLLERS,
+                   max(1, int((total_memory_mb - used) // JOB_MEMORY_MB)))
     else:
-        return
-        [4 removed lines truncated in this diff rendering]
+        return min(
+            MAX_CONTROLLERS,
+            max(
+                1,
+                int((total_memory_mb - MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB) /
+                    ((LAUNCHES_PER_WORKER * server_config.LONG_WORKER_MEM_GB) *
+                     1024 + JOB_MEMORY_MB))))
 
 
 def start_controller() -> None:
@@ -280,7 +294,8 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
                            common_utils.get_user_hash(), priority)
     if state.get_ha_recovery_script(job_id) is None:
         # the run command is just the command that called scheduler
-        run = (f'{
+        run = (f'source {env_file_path} && '
+               f'{sys.executable} -m sky.jobs.scheduler {dag_yaml_path} '
               f'--job-id {job_id} --env-file {env_file_path} '
              f'--user-yaml-path {original_user_yaml_path} '
              f'--priority {priority}')
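get_number_of_controllers() above sizes the controller pool from available memory and then clamps it to MAX_CONTROLLERS. A sketch of the non-consolidation branch, with the long-worker memory size taken as a parameter because its actual value lives in sky.server.config and is not shown in this diff:

JOB_MEMORY_MB = 400
LAUNCHES_PER_WORKER = 8
MAX_CONTROLLERS = 512 // LAUNCHES_PER_WORKER  # 64
MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048


def controllers_for(total_memory_mb: float, long_worker_mem_gb: float) -> int:
    # Each controller needs memory for its launch workers plus one job's
    # bookkeeping; 2 GB stays reserved for everything else.
    per_controller_mb = ((LAUNCHES_PER_WORKER * long_worker_mem_gb) * 1024 +
                         JOB_MEMORY_MB)
    usable_mb = total_memory_mb - MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB
    return min(MAX_CONTROLLERS, max(1, int(usable_mb / per_controller_mb)))


# Assumed example: a 32 GB controller with 0.5 GB long workers -> 6 controllers.
print(controllers_for(32 * 1024, long_worker_mem_gb=0.5))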
sky/jobs/server/core.py
CHANGED
@@ -407,9 +407,12 @@ def launch(
             job_identity = ''
             if job_rank is not None:
                 job_identity = f' (rank: {job_rank})'
-            [3 removed lines truncated in this diff rendering]
+            job_controller_postfix = (' from jobs controller' if
+                                      consolidation_mode_job_id is None else '')
+            logger.info(
+                f'{colorama.Fore.YELLOW}'
+                f'Launching managed job {dag.name!r}{job_identity}'
+                f'{job_controller_postfix}...{colorama.Style.RESET_ALL}')
 
             # Launch with the api server's user hash, so that sky status does
             # not show the owner of the controller as whatever user launched
@@ -456,6 +459,8 @@
             managed_job_state.set_ha_recovery_script(
                 consolidation_mode_job_id, run_script)
             backend.run_on_head(local_handle, run_script)
+            ux_utils.starting_message(
+                f'Job submitted, ID: {consolidation_mode_job_id}')
             return consolidation_mode_job_id, local_handle
 
         if pool is None:
sky/jobs/utils.py
CHANGED
@@ -11,6 +11,7 @@ import enum
 import logging
 import os
 import pathlib
+import re
 import shlex
 import textwrap
 import time
@@ -299,8 +300,10 @@ async def get_job_status(
             job_logger.info(f'Job status: {status}')
             job_logger.info('=' * 34)
             return status
-        except (exceptions.CommandError, grpc.RpcError,
-        [removed line truncated in this diff rendering]
+        except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
+                ValueError, TypeError) as e:
+            # Note: Each of these exceptions has some additional conditions to
+            # limit how we handle it and whether or not we catch it.
             # Retry on k8s transient network errors. This is useful when using
             # coreweave which may have transient network issue sometimes.
             is_transient_error = False
@@ -319,6 +322,31 @@ async def get_job_status(
                     is_transient_error = True
             elif isinstance(e, grpc.FutureTimeoutError):
                 detailed_reason = 'Timeout'
+            # TODO(cooperc): Gracefully handle these exceptions in the backend.
+            elif isinstance(e, ValueError):
+                # If the cluster yaml is deleted in the middle of getting the
+                # SSH credentials, we could see this. See
+                # sky/global_user_state.py get_cluster_yaml_dict.
+                if re.search(r'Cluster yaml .* not found', str(e)):
+                    detailed_reason = 'Cluster yaml was deleted'
+                else:
+                    raise
+            elif isinstance(e, TypeError):
+                # We will grab the SSH credentials from the cluster yaml, but if
+                # handle.cluster_yaml is None, we will just return an empty dict
+                # for the credentials. See
+                # backend_utils.ssh_credential_from_yaml. Then, the credentials
+                # are passed as kwargs to SSHCommandRunner.__init__ - see
+                # cloud_vm_ray_backend.get_command_runners. So we can hit this
+                # TypeError if the cluster yaml is removed from the handle right
+                # when we pull it before the cluster is fully deleted.
+                error_msg_to_check = (
+                    'SSHCommandRunner.__init__() missing 2 required positional '
+                    'arguments: \'ssh_user\' and \'ssh_private_key\'')
+                if str(e) == error_msg_to_check:
+                    detailed_reason = 'SSH credentials were already cleaned up'
+                else:
+                    raise
             if is_transient_error:
                 logger.info('Failed to connect to the cluster. Retrying '
                             f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
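The ValueError/TypeError handling above only swallows two specific teardown races and re-raises everything else. A compact sketch of that classification, factored into a hypothetical helper for readability:

import re
from typing import Optional


def classify_teardown_race(e: Exception) -> Optional[str]:
    # Returns a detailed_reason when the error matches a known race with
    # cluster teardown; returns None when the caller should re-raise.
    if isinstance(e, ValueError):
        if re.search(r'Cluster yaml .* not found', str(e)):
            return 'Cluster yaml was deleted'
    elif isinstance(e, TypeError):
        expected = ('SSHCommandRunner.__init__() missing 2 required positional '
                    "arguments: 'ssh_user' and 'ssh_private_key'")
        if str(e) == expected:
            return 'SSH credentials were already cleaned up'
    return None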
sky/metrics/utils.py
CHANGED
@@ -11,7 +11,9 @@ from typing import List, Optional, Tuple
 import httpx
 import prometheus_client as prom
 
+from sky import sky_logging
 from sky.skylet import constants
+from sky.utils import common_utils
 from sky.utils import context_utils
 
 _SELECT_TIMEOUT = 1
@@ -35,6 +37,8 @@ _MEM_BUCKETS = [
     float('inf'),
 ]
 
+logger = sky_logging.init_logger(__name__)
+
 # Whether the metrics are enabled, cannot be changed at runtime.
 METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
                                  'false').lower() == 'true'
@@ -188,35 +192,42 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
     if 'KUBECONFIG' not in env:
         env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')
 
-    [2 removed lines truncated in this diff rendering]
-                                            stdout=subprocess.PIPE,
-                                            stderr=subprocess.STDOUT,
-                                            text=True,
-                                            env=env)
-    [removed line truncated in this diff rendering]
+    port_forward_process = None
+    port_forward_exit = False
     local_port = None
-    [19 removed lines truncated in this diff rendering]
+    poller = None
+    fd = None
+
+    try:
+        # start the port forward process
+        port_forward_process = subprocess.Popen(cmd,
+                                                stdout=subprocess.PIPE,
+                                                stderr=subprocess.STDOUT,
+                                                text=True,
+                                                env=env)
+
+        # Use poll() instead of select() to avoid FD_SETSIZE limit
+        poller = select.poll()
+        assert port_forward_process.stdout is not None
+        fd = port_forward_process.stdout.fileno()
+        poller.register(fd, select.POLLIN)
+
+        start_time = time.time()
+        buffer = ''
+        # wait for the port forward to start and extract the local port
+        while time.time() - start_time < start_port_forward_timeout:
+            if port_forward_process.poll() is not None:
+                # port forward process has terminated
+                if port_forward_process.returncode != 0:
+                    port_forward_exit = True
+                break
+
+            # Wait up to 1000ms for data to be available without blocking
+            # poll() takes timeout in milliseconds
+            events = poller.poll(_SELECT_TIMEOUT * 1000)
+
+            if events:
                 # Read available bytes from the FD without blocking
-                fd = port_forward_process.stdout.fileno()
                 raw = os.read(fd, _SELECT_BUFFER_SIZE)
                 chunk = raw.decode(errors='ignore')
                 buffer += chunk
@@ -225,16 +236,28 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
                     local_port = int(match.group(1))
                     break
 
-            [3 removed lines truncated in this diff rendering]
+            # sleep for 100ms to avoid busy-waiting
+            time.sleep(0.1)
+    except BaseException:  # pylint: disable=broad-exception-caught
+        if port_forward_process:
+            stop_svc_port_forward(port_forward_process,
+                                  timeout=terminate_port_forward_timeout)
+        raise
+    finally:
+        if poller is not None and fd is not None:
+            try:
+                poller.unregister(fd)
+            except (OSError, ValueError):
+                # FD may already be unregistered or invalid
+                pass
+    if port_forward_exit:
+        raise RuntimeError(f'Port forward failed for service {service} in '
+                           f'namespace {namespace} on context {context}')
     if local_port is None:
         try:
-            port_forward_process
-            [2 removed lines truncated in this diff rendering]
-            port_forward_process.kill()
-            port_forward_process.wait()
+            if port_forward_process:
+                stop_svc_port_forward(port_forward_process,
+                                      timeout=terminate_port_forward_timeout)
         finally:
             raise RuntimeError(
                 f'Failed to extract local port for service {service} in '
@@ -243,14 +266,15 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
     return port_forward_process, local_port
 
 
-def stop_svc_port_forward(port_forward_process: subprocess.Popen
+def stop_svc_port_forward(port_forward_process: subprocess.Popen,
+                          timeout: int = 5) -> None:
     """Stops a port forward to a service in a Kubernetes cluster.
     Args:
         port_forward_process: The subprocess.Popen process to terminate
     """
     try:
         port_forward_process.terminate()
-        port_forward_process.wait(timeout=
+        port_forward_process.wait(timeout=timeout)
     except subprocess.TimeoutExpired:
         port_forward_process.kill()
         port_forward_process.wait()
@@ -301,6 +325,10 @@ async def send_metrics_request_with_port_forward(
         response.raise_for_status()
         return response.text
 
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        logger.error(f'Failed to send metrics request with port forward: '
+                     f'{common_utils.format_exception(e)}')
+        raise
     finally:
         # Always clean up port forward
         if port_forward_process:
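The port-forward changes above switch from select.select() to select.poll() because select() cannot watch file descriptors at or above FD_SETSIZE (commonly 1024), which a busy API server can exceed. A minimal, generic sketch of the poll-based read loop; the service name and port below are placeholders, and the parsing relies on kubectl's usual 'Forwarding from ...' output line:

import os
import select
import subprocess

proc = subprocess.Popen(
    ['kubectl', 'port-forward', 'svc/example-svc', ':8080'],
    stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
poller = select.poll()  # no FD_SETSIZE ceiling, unlike select.select()
fd = proc.stdout.fileno()
poller.register(fd, select.POLLIN)

buffer = ''
while proc.poll() is None:
    if poller.poll(1000):  # timeout is in milliseconds
        buffer += os.read(fd, 4096).decode(errors='ignore')
        if 'Forwarding from' in buffer:
            break  # the chosen local port has been printed
poller.unregister(fd)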
sky/provision/instance_setup.py
CHANGED
@@ -10,6 +10,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 from sky import exceptions
 from sky import logs
 from sky import provision
+from sky import resources as resources_lib
 from sky import sky_logging
 from sky.provision import common
 from sky.provision import docker_utils
@@ -92,12 +93,6 @@ def _set_usage_run_id_cmd() -> str:
             f'{usage_constants.USAGE_RUN_ID_FILE}')
 
 
-def _set_skypilot_env_var_cmd() -> str:
-    """Sets the skypilot environment variables on the remote machine."""
-    env_vars = env_options.Options.all_options()
-    return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
-
-
 def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True):
     """Decorator that retries the function if it fails.
 
@@ -482,11 +477,38 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
 @common.log_function_start_end
 @_auto_retry()
 @timeline.event
-def start_skylet_on_head_node(
-[2 removed lines truncated in this diff rendering]
+def start_skylet_on_head_node(
+        cluster_name: resources_utils.ClusterName,
+        cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any],
+        launched_resources: resources_lib.Resources) -> None:
     """Start skylet on the head node."""
-    [removed line truncated in this diff rendering]
+    # Avoid circular import.
+    # pylint: disable=import-outside-toplevel
+    from sky.utils import controller_utils
+
+    def _set_skypilot_env_var_cmd() -> str:
+        """Sets the skypilot environment variables on the remote machine."""
+        env_vars = {
+            k: str(v) for (k, v) in env_options.Options.all_options().items()
+        }
+        is_controller = controller_utils.Controllers.from_name(
+            cluster_name.display_name) is not None
+        is_kubernetes = cluster_info.provider_name == 'kubernetes'
+        if is_controller and is_kubernetes:
+            # For jobs/serve controller, we pass in the CPU and memory limits
+            # when starting the skylet to handle cases where these env vars
+            # are not set on the cluster's pod spec. The skylet will read
+            # these env vars when starting (ManagedJobEvent.start()) and write
+            # it to disk.
+            resources = launched_resources.assert_launchable()
+            vcpus, mem = resources.cloud.get_vcpus_mem_from_instance_type(
+                resources.instance_type)
+            if vcpus is not None:
+                env_vars['SKYPILOT_POD_CPU_CORE_LIMIT'] = str(vcpus)
+            if mem is not None:
+                env_vars['SKYPILOT_POD_MEMORY_GB_LIMIT'] = str(mem)
+        return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
+
     runners = provision.get_command_runners(cluster_info.provider_name,
                                             cluster_info, **ssh_credentials)
     head_runner = runners[0]
sky/provision/kubernetes/instance.py
CHANGED
@@ -934,8 +934,11 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
     running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
                                                 ['Pending', 'Running'])
     head_pod_name = _get_head_pod_name(running_pods)
+    running_pod_statuses = [{
+        pod.metadata.name: pod.status.phase
+    } for pod in running_pods.values()]
     logger.debug(f'Found {len(running_pods)} existing pods: '
-                 f'{
+                 f'{running_pod_statuses}')
 
     to_start_count = config.count - len(running_pods)
     if to_start_count < 0:
@@ -1142,10 +1145,21 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
     pods = created_resources
 
     created_pods = {}
+    valid_pods = []
     for pod in pods:
+        # In case Pod is not created
+        if pod is None:
+            continue
+        valid_pods.append(pod)
         created_pods[pod.metadata.name] = pod
         if head_pod_name is None and _is_head(pod):
             head_pod_name = pod.metadata.name
+    pods = valid_pods
+
+    # The running_pods may include Pending Pods, so we add them to the pods
+    # list to wait for scheduling and running
+    if running_pods:
+        pods = pods + list(running_pods.values())
 
     provision_timeout = provider_config['timeout']
 
@@ -1369,8 +1383,9 @@ def get_cluster_info(
     assert head_spec is not None, pod
     cpu_request = head_spec.containers[0].resources.requests['cpu']
 
-    [2 removed lines truncated in this diff rendering]
+    if cpu_request is None:
+        raise RuntimeError(f'Pod {cluster_name_on_cloud}-head not found'
+                           ' or not Running, check the Pod status')
 
     ssh_user = 'sky'
     # Use pattern matching to extract SSH user, handling MOTD contamination.
sky/provision/kubernetes/utils.py
CHANGED
@@ -1688,7 +1688,10 @@ def check_credentials(context: Optional[str],
     try:
         namespace = get_kube_config_context_namespace(context)
         kubernetes.core_api(context).list_namespaced_pod(
-            namespace, _request_timeout=timeout)
+            namespace, limit=1, _request_timeout=timeout)
+        # This call is "free" because this function is a cached call,
+        # and it will not be called again in this function.
+        get_kubernetes_nodes(context=context)
     except ImportError:
         # TODO(romilb): Update these error strs to also include link to docs
         # when docs are ready.