skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
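
The most substantial hand-written change shown below is in sky/backends/backend_utils.py, which threads a new retry_if_missing flag through the cluster status refresh path (refresh_cluster_status_handle → refresh_cluster_record → _update_cluster_status → _query_cluster_status_via_cloud_api). The following is a minimal illustrative sketch of how a caller might use the new flag; only the refresh_cluster_status_handle signature (including retry_if_missing) is taken from the diff below, while the cluster name and surrounding logic are hypothetical:

    # Illustrative sketch only; refresh_cluster_status_handle and its new
    # retry_if_missing keyword come from the diff below, everything else
    # here is hypothetical.
    from sky.backends import backend_utils

    # Skip the extra cloud-API retry when a cluster with no visible
    # instances should simply be treated as terminated.
    status, handle = backend_utils.refresh_cluster_status_handle(
        'my-cluster',  # hypothetical cluster name
        retry_if_missing=False)
    if status is None:
        print('Cluster is terminated or was never launched.')
    else:
        print(f'Cluster status: {status}, handle: {handle}')
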
sky/backends/backend_utils.py
CHANGED
@@ -48,6 +48,7 @@ from sky.server.requests import requests as requests_lib
 from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.usage import usage_lib
+from sky.utils import auth_utils
 from sky.utils import cluster_utils
 from sky.utils import command_runner
 from sky.utils import common
@@ -755,7 +756,7 @@ def write_cluster_config(
         assert k not in credentials, f'{k} already in credentials'
         credentials[k] = v

-    private_key_path, _ =
+    private_key_path, _ = auth_utils.get_or_generate_keys()
     auth_config = {'ssh_private_key': private_key_path}
     region_name = resources_vars.get('region')

@@ -1124,6 +1125,8 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
         config = auth.setup_fluidstack_authentication(config)
     elif isinstance(cloud, clouds.Hyperbolic):
         config = auth.setup_hyperbolic_authentication(config)
+    elif isinstance(cloud, clouds.Shadeform):
+        config = auth.setup_shadeform_authentication(config)
     elif isinstance(cloud, clouds.PrimeIntellect):
         config = auth.setup_primeintellect_authentication(config)
     elif isinstance(cloud, clouds.Seeweb):
@@ -1855,6 +1858,13 @@ def check_owner_identity(cluster_name: str) -> None:
         summary_response=True)
     if record is None:
         return
+    _check_owner_identity_with_record(cluster_name, record)
+
+
+def _check_owner_identity_with_record(cluster_name: str,
+                                      record: Dict[str, Any]) -> None:
+    if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
+        return
     handle = record['handle']
     if not isinstance(handle, backends.CloudVmRayResourceHandle):
         return
@@ -1941,7 +1951,8 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:

 @context_utils.cancellation_guard
 def _query_cluster_status_via_cloud_api(
-    handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
+    handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
+    retry_if_missing: bool,
 ) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
     """Returns the status of the cluster as a list of tuples corresponding
     to the node status and an optional reason string for said status.
@@ -1968,8 +1979,11 @@ def _query_cluster_status_via_cloud_api(
     cloud_name = repr(handle.launched_resources.cloud)
     try:
         node_status_dict = provision_lib.query_instances(
-            cloud_name,
-
+            cloud_name,
+            cluster_name,
+            cluster_name_on_cloud,
+            provider_config,
+            retry_if_missing=retry_if_missing)
         logger.debug(f'Querying {cloud_name} cluster '
                      f'{cluster_name_in_hint} '
                      f'status:\n{pprint.pformat(node_status_dict)}')
@@ -2149,6 +2163,8 @@ def check_can_clone_disk_and_override_task(

 def _update_cluster_status(
         cluster_name: str,
+        record: Dict[str, Any],
+        retry_if_missing: bool,
         include_user_info: bool = True,
         summary_response: bool = False) -> Optional[Dict[str, Any]]:
     """Update the cluster status.
@@ -2177,12 +2193,6 @@ def _update_cluster_status(
         fetched from the cloud provider or there are leaked nodes causing
         the node number larger than expected.
     """
-    record = global_user_state.get_cluster_from_name(
-        cluster_name,
-        include_user_info=include_user_info,
-        summary_response=summary_response)
-    if record is None:
-        return None
     handle = record['handle']
     if handle.cluster_yaml is None:
         # Remove cluster from db since this cluster does not have a config file
@@ -2201,7 +2211,8 @@ def _update_cluster_status(
         return record
     cluster_name = handle.cluster_name

-    node_statuses = _query_cluster_status_via_cloud_api(
+    node_statuses = _query_cluster_status_via_cloud_api(
+        handle, retry_if_missing=retry_if_missing)

     all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
                         for status in node_statuses) and
@@ -2376,7 +2387,8 @@ def _update_cluster_status(
         # and check again. This is a best-effort leak prevention check.
         # See https://github.com/skypilot-org/skypilot/issues/4431.
         time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
-        node_statuses = _query_cluster_status_via_cloud_api(
+        node_statuses = _query_cluster_status_via_cloud_api(
+            handle, retry_if_missing=False)
         # Note: even if all the node_statuses are UP now, we will still
         # consider this cluster abnormal, and its status will be INIT.

@@ -2620,7 +2632,8 @@ def refresh_cluster_record(
         cluster_lock_already_held: bool = False,
         cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
         include_user_info: bool = True,
-        summary_response: bool = False
+        summary_response: bool = False,
+        retry_if_missing: bool = True) -> Optional[Dict[str, Any]]:
     """Refresh the cluster, and return the possibly updated record.

     The function will update the cached cluster status in the global state. For
@@ -2649,6 +2662,8 @@ def refresh_cluster_record(
             value is <0, do not timeout (wait for the lock indefinitely). By
             default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
             if correctness is required, you must set this to -1.
+        retry_if_missing: Whether to retry the call to the cloud api if the
+            cluster is not found when querying the live status on the cloud.

     Returns:
         If the cluster is terminated or does not exist, return None.
@@ -2675,10 +2690,9 @@ def refresh_cluster_record(
     # using the correct cloud credentials.
     workspace = record.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
     with skypilot_config.local_active_workspace_ctx(workspace):
-        check_owner_identity
-
-
-        return record
+        # check_owner_identity returns if the record handle is
+        # not a CloudVmRayResourceHandle
+        _check_owner_identity_with_record(cluster_name, record)

     # The loop logic allows us to notice if the status was updated in the
     # global_user_state by another process and stop trying to get the lock.
@@ -2695,7 +2709,9 @@ def refresh_cluster_record(
             return record

         if cluster_lock_already_held:
-            return _update_cluster_status(cluster_name,
+            return _update_cluster_status(cluster_name, record,
+                                          retry_if_missing,
+                                          include_user_info,
                                           summary_response)

         # Try to acquire the lock so we can fetch the status.
@@ -2711,7 +2727,8 @@ def refresh_cluster_record(
                         record, force_refresh_statuses):
                     return record
                 # Update and return the cluster status.
-                return _update_cluster_status(cluster_name,
+                return _update_cluster_status(cluster_name, record,
+                                              retry_if_missing,
                                               include_user_info,
                                               summary_response)

@@ -2749,7 +2766,8 @@ def refresh_cluster_status_handle(
     *,
     force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
     cluster_lock_already_held: bool = False,
-    cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
+    cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+    retry_if_missing: bool = True,
 ) -> Tuple[Optional[status_lib.ClusterStatus],
            Optional[backends.ResourceHandle]]:
     """Refresh the cluster, and return the possibly updated status and handle.
@@ -2764,7 +2782,8 @@ def refresh_cluster_status_handle(
         cluster_lock_already_held=cluster_lock_already_held,
         cluster_status_lock_timeout=cluster_status_lock_timeout,
         include_user_info=False,
-        summary_response=True
+        summary_response=True,
+        retry_if_missing=retry_if_missing)
     if record is None:
         return None, None
     return record['status'], record['handle']
@@ -3115,25 +3134,23 @@ def refresh_cluster_records() -> None:
     exclude_managed_clusters = True
     if env_options.Options.SHOW_DEBUG_INFO.get():
         exclude_managed_clusters = False
-    cluster_names =
-
+    cluster_names = set(
+        global_user_state.get_cluster_names(
+            exclude_managed_clusters=exclude_managed_clusters,))

     # TODO(syang): we should try not to leak
     # request info in backend_utils.py.
     # Refactor this to use some other info to
     # determine if a launch is in progress.
-    request = requests_lib.get_request_tasks(
-        req_filter=requests_lib.RequestTaskFilter(
-            status=[requests_lib.RequestStatus.RUNNING],
-            cluster_names=cluster_names,
-            include_request_names=['sky.launch']))
     cluster_names_with_launch_request = {
-        request.cluster_name for request in
+        request.cluster_name for request in requests_lib.get_request_tasks(
+            req_filter=requests_lib.RequestTaskFilter(
+                status=[requests_lib.RequestStatus.RUNNING],
+                include_request_names=['sky.launch'],
+                fields=['cluster_name']))
     }
-    cluster_names_without_launch_request =
-
-        if cluster_name not in cluster_names_with_launch_request
-    ]
+    cluster_names_without_launch_request = (cluster_names -
+                                            cluster_names_with_launch_request)

     def _refresh_cluster_record(cluster_name):
         return _refresh_cluster(cluster_name,
@@ -3142,7 +3159,7 @@ def refresh_cluster_records() -> None:
                                 include_user_info=False,
                                 summary_response=True)

-    if len(
+    if len(cluster_names_without_launch_request) > 0:
         # Do not refresh the clusters that have an active launch request.
         subprocess_utils.run_in_parallel(_refresh_cluster_record,
                                          cluster_names_without_launch_request)
@@ -3154,6 +3171,7 @@ def get_clusters(
     all_users: bool = True,
     include_credentials: bool = False,
     summary_response: bool = False,
+    include_handle: bool = True,
     # Internal only:
     # pylint: disable=invalid-name
     _include_is_managed: bool = False,
@@ -3237,12 +3255,11 @@ def get_clusters(
         """Add resource str to record"""
         for record in _get_records_with_handle(records):
             handle = record['handle']
-
-
-                handle,
-            record[
-
-                handle, simplify=False)
+            resource_str_simple, resource_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    handle, simplified_only=False))
+            record['resources_str'] = resource_str_simple
+            record['resources_str_full'] = resource_str_full
             if not summary_response:
                 record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud

@@ -3268,9 +3285,17 @@ def get_clusters(
                 expanded_private_key_path = os.path.expanduser(
                     ssh_private_key_path)
                 if not os.path.exists(expanded_private_key_path):
-
+                    success = auth_utils.create_ssh_key_files_from_db(
+                        ssh_private_key_path)
+                    if not success:
+                        # If the ssh key files are not found, we do not
+                        # update the record with credentials.
+                        logger.debug(
+                            f'SSH keys not found for cluster {record["name"]} '
+                            f'at key path {ssh_private_key_path}')
+                        continue
             else:
-                private_key_path, _ =
+                private_key_path, _ = auth_utils.get_or_generate_keys()
                 expanded_private_key_path = os.path.expanduser(private_key_path)
             if expanded_private_key_path in cached_private_keys:
                 credential['ssh_private_key_content'] = cached_private_keys[
@@ -3302,6 +3327,8 @@ def get_clusters(
             record['accelerators'] = (
                 f'{handle.launched_resources.accelerators}'
                 if handle.launched_resources.accelerators else None)
+        if not include_handle:
+            record.pop('handle', None)

     # Add handle info to the records
     _update_records_with_handle_info(records)
@@ -3330,7 +3357,10 @@ def get_clusters(
                 force_refresh_statuses=force_refresh_statuses,
                 include_user_info=True,
                 summary_response=summary_response)
-            if
+            # record may be None if the cluster is deleted during refresh,
+            # e.g. all the Pods of a cluster on Kubernetes have been
+            # deleted before refresh.
+            if record is not None and 'error' not in record:
                 _update_records_with_handle_info([record])
                 if include_credentials:
                     _update_records_with_credentials([record])
@@ -3342,45 +3372,56 @@ def get_clusters(
     # request info in backend_utils.py.
     # Refactor this to use some other info to
     # determine if a launch is in progress.
-    request = requests_lib.get_request_tasks(
-        req_filter=requests_lib.RequestTaskFilter(
-            status=[requests_lib.RequestStatus.RUNNING],
-            cluster_names=cluster_names,
-            include_request_names=['sky.launch']))
     cluster_names_with_launch_request = {
-        request.cluster_name for request in
+        request.cluster_name for request in requests_lib.get_request_tasks(
+            req_filter=requests_lib.RequestTaskFilter(
+                status=[requests_lib.RequestStatus.RUNNING],
+                include_request_names=['sky.launch'],
+                cluster_names=cluster_names,
+                fields=['cluster_name']))
     }
+    # Preserve the index of the cluster name as it appears on "records"
     cluster_names_without_launch_request = [
-
+        (i, cluster_name)
+        for i, cluster_name in enumerate(cluster_names)
         if cluster_name not in cluster_names_with_launch_request
     ]
     # for clusters that have an active launch request, we do not refresh the status
-    updated_records = [
-        record for record in records
-        if record['name'] in cluster_names_with_launch_request
-    ]
+    updated_records = []
     if len(cluster_names_without_launch_request) > 0:
         with progress:
             updated_records = subprocess_utils.run_in_parallel(
-                _refresh_cluster_record,
-
+                _refresh_cluster_record, [
+                    cluster_name
+                    for _, cluster_name in cluster_names_without_launch_request
+                ])
+        # Preserve the index of the cluster name as it appears on "records"
+        # before filtering for clusters being launched.
+        updated_records_dict: Dict[int, Optional[Dict[str, Any]]] = {
+            cluster_names_without_launch_request[i][0]: updated_records[i]
+            for i in range(len(cluster_names_without_launch_request))
+        }
     # Show information for removed clusters.
     kept_records = []
     autodown_clusters, remaining_clusters, failed_clusters = [], [], []
     for i, record in enumerate(records):
-        if
+        if i not in updated_records_dict:
+            # record was not refreshed, keep the original record
+            kept_records.append(record)
+            continue
+        updated_record = updated_records_dict[i]
+        if updated_record is None:
             if record['to_down']:
-                autodown_clusters.append(
+                autodown_clusters.append(record['name'])
             else:
-                remaining_clusters.append(
-        elif
-            failed_clusters.append(
-                (cluster_names[i], updated_records[i]['error']))
+                remaining_clusters.append(record['name'])
+        elif updated_record['status'] == 'UNKNOWN':
+            failed_clusters.append((record['name'], updated_record['error']))
             # Keep the original record if the status is unknown,
            # so that the user can still see the cluster.
             kept_records.append(record)
         else:
-            kept_records.append(
+            kept_records.append(updated_record)

     if autodown_clusters:
         plural = 's' if len(autodown_clusters) > 1 else ''