skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -238,6 +238,40 @@ def normalize_tpu_accelerator_name(accelerator: str) -> Tuple[str, int]:
     return accelerator, 1
 
 
+def _is_cloudflare_403_error(exception: Exception) -> bool:
+    """Check if an exception is a transient CloudFlare 403 error.
+
+    CloudFlare proxy 403 errors with CF-specific headers are transient and
+    should be retried, unlike real RBAC 403 errors.
+
+    Args:
+        exception: The exception to check
+
+    Returns:
+        True if this is a CloudFlare 403 error that should be retried
+    """
+    if not isinstance(exception, kubernetes.api_exception()):
+        return False
+
+    # Only check for 403 errors
+    if exception.status != 403:
+        return False
+
+    # Check for CloudFlare-specific headers
+    headers = exception.headers if hasattr(exception, 'headers') else {}
+    if not headers:
+        return False
+
+    # CloudFlare errors have CF-RAY header and/or Server: cloudflare
+    for k, v in headers.items():
+        if 'cf-ray' in k.lower():
+            return True
+        if 'server' in k.lower() and 'cloudflare' in str(v).lower():
+            return True
+
+    return False
+
+
 def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                     retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
                     resource_type: Optional[str] = None):
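As a side note on the hunk above: the CF-RAY / Server: cloudflare header check is what separates a transient proxy 403 from a genuine RBAC 403. A minimal sketch of that distinction, using a hypothetical FakeApiException stand-in for the Kubernetes ApiException the real helper checks via kubernetes.api_exception():

# Illustrative sketch only; FakeApiException stands in for the real
# Kubernetes ApiException used by the hunk above.
class FakeApiException(Exception):
    def __init__(self, status, headers):
        super().__init__(f'HTTP {status}')
        self.status = status
        self.headers = headers


def looks_like_cloudflare_403(exc) -> bool:
    """Same header heuristic as _is_cloudflare_403_error, minus the type check."""
    if getattr(exc, 'status', None) != 403:
        return False
    headers = getattr(exc, 'headers', None) or {}
    for k, v in headers.items():
        if 'cf-ray' in k.lower():
            return True
        if 'server' in k.lower() and 'cloudflare' in str(v).lower():
            return True
    return False


# A 403 coming back through a CloudFlare proxy is treated as retryable...
assert looks_like_cloudflare_403(
    FakeApiException(403, {'CF-RAY': '8f2a0000', 'Server': 'cloudflare'}))
# ...while a plain RBAC 403 from the API server is not.
assert not looks_like_cloudflare_403(
    FakeApiException(403, {'Content-Type': 'application/json'}))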
@@ -272,19 +306,25 @@ def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                         kubernetes.api_exception(),
                         kubernetes.config_exception()) as e:
                     last_exception = e
+
+                    # Check if this is a CloudFlare transient 403 error
+                    is_cloudflare_403 = _is_cloudflare_403_error(e)
+
                     # Don't retry on permanent errors like 401 (Unauthorized)
-                    # or 403 (Forbidden)
+                    # or 403 (Forbidden), unless it's a CloudFlare transient 403
                     if (isinstance(e, kubernetes.api_exception()) and
-                            e.status in (401, 403)):
+                            e.status in (401, 403) and not is_cloudflare_403):
                         # Raise KubeAPIUnreachableError exception so that the
                         # optimizer/provisioner can failover to other clouds.
                         raise exceptions.KubeAPIUnreachableError(
                             f'Kubernetes API error: {str(e)}') from e
                     if attempt < max_retries - 1:
                         sleep_time = backoff.current_backoff()
-
-
-
+                        error_type = 'CloudFlare 403' if is_cloudflare_403 else 'error'
+                        logger.debug(
+                            f'Kubernetes API call {func.__name__} '
+                            f'failed with {error_type} {str(e)}. Retrying in '
+                            f'{sleep_time:.1f}s...')
                         time.sleep(sleep_time)
                         continue
 
@@ -696,6 +736,7 @@ def detect_gpu_label_formatter(
         for label, value in node.metadata.labels.items():
             node_labels[node.metadata.name].append((label, value))
 
+    invalid_label_values: List[Tuple[str, str, str, str]] = []
     # Check if the node labels contain any of the GPU label prefixes
     for lf in LABEL_FORMATTER_REGISTRY:
         skip = False
@@ -709,11 +750,8 @@ def detect_gpu_label_formatter(
                 if valid:
                     return lf(), node_labels
                 else:
-
-
-                    f'but has invalid value {value}. '
-                    f'Reason: {reason}. '
-                    'Skipping...')
+                    invalid_label_values.append(
+                        (label, lf.__name__, value, reason))
                     skip = True
                     break
         if skip:
@@ -721,6 +759,13 @@ def detect_gpu_label_formatter(
         if skip:
             continue
 
+    for label, lf_name, value, reason in invalid_label_values:
+        logger.warning(f'GPU label {label} matched for label '
+                       f'formatter {lf_name}, '
+                       f'but has invalid value {value}. '
+                       f'Reason: {reason}. '
+                       'Skipping...')
+
     return None, node_labels
 
 
@@ -1259,30 +1304,52 @@ class V1Pod:
 
 
 @_retry_on_error(resource_type='pod')
-def
-
-
-
-
-
+def get_allocated_gpu_qty_by_node(
+    *,
+    context: Optional[str] = None,
+) -> Dict[str, int]:
+    """Gets allocated GPU quantity by each node by fetching pods in
+    all namespaces in kubernetes cluster indicated by context.
     """
     if context is None:
         context = get_current_kube_config_context_name()
+    non_included_pod_statuses = POD_STATUSES.copy()
+    status_filters = ['Running', 'Pending']
+    if status_filters is not None:
+        non_included_pod_statuses -= set(status_filters)
+    field_selector = ','.join(
+        [f'status.phase!={status}' for status in non_included_pod_statuses])
 
     # Return raw urllib3.HTTPResponse object so that we can parse the json
     # more efficiently.
     response = kubernetes.core_api(context).list_pod_for_all_namespaces(
-        _request_timeout=kubernetes.API_TIMEOUT,
+        _request_timeout=kubernetes.API_TIMEOUT,
+        _preload_content=False,
+        field_selector=field_selector)
     try:
-
-
-
-
+        allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+        for item_dict in ijson.items(response,
+                                     'items.item',
+                                     buf_size=IJSON_BUFFER_SIZE):
+            pod = V1Pod.from_dict(item_dict)
+            if should_exclude_pod_from_gpu_allocation(pod):
+                logger.debug(
+                    f'Excluding pod {pod.metadata.name} from GPU count '
+                    f'calculations on node {pod.spec.node_name}')
+                continue
+            # Iterate over all the containers in the pod and sum the
+            # GPU requests
+            pod_allocated_qty = 0
+            for container in pod.spec.containers:
+                if container.resources.requests:
+                    pod_allocated_qty += get_node_accelerator_count(
+                        context, container.resources.requests)
+            if pod_allocated_qty > 0 and pod.spec.node_name:
+                allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
+        return allocated_qty_by_node
     finally:
         response.release_conn()
 
-    return pods
-
 
 def check_instance_fits(context: Optional[str],
                         instance: str) -> Tuple[bool, Optional[str]]:
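The new get_allocated_gpu_qty_by_node avoids materializing the full pod list: the response body is not preloaded, and 'items.item' entries are streamed with ijson. A rough sketch of that streaming pattern over an in-memory JSON payload (assumes the ijson package is installed; the resource key and node names are illustrative):

import io
import json

import ijson  # streaming JSON parser used by the hunk above

# Stand-in for the raw pod-list response body (normally an un-preloaded
# HTTP response from list_pod_for_all_namespaces).
payload = json.dumps({
    'items': [
        {'spec': {'nodeName': 'node-a',
                  'containers': [{'resources': {'requests': {'nvidia.com/gpu': '2'}}}]}},
        {'spec': {'nodeName': 'node-b',
                  'containers': [{'resources': {'requests': {}}}]}},
    ]
}).encode()

allocated = {}
# 'items.item' yields one pod dict at a time instead of parsing the whole body.
for pod in ijson.items(io.BytesIO(payload), 'items.item'):
    qty = 0
    for container in pod['spec']['containers']:
        qty += int(container['resources']['requests'].get('nvidia.com/gpu', 0))
    if qty:
        allocated[pod['spec']['nodeName']] = allocated.get(pod['spec']['nodeName'], 0) + qty

print(allocated)  # {'node-a': 2}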
@@ -2179,6 +2246,15 @@ def get_kube_config_context_namespace(
     return DEFAULT_NAMESPACE
 
 
+def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
+    if not resource_str:
+        return 0.0
+    if resource_str[-1] == 'm':
+        return float(resource_str[:-1]) / 1000
+    else:
+        return float(resource_str)
+
+
 def parse_cpu_or_gpu_resource(resource_qty_str: str) -> Union[int, float]:
     resource_str = str(resource_qty_str)
     if resource_str[-1] == 'm':
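For reference, the new parse_cpu_or_gpu_resource_to_float follows the Kubernetes quantity convention where a trailing 'm' means millis. A quick sketch of the same conversion rule (the function name here is illustrative):

def to_float(resource_str: str) -> float:
    # Same rule as parse_cpu_or_gpu_resource_to_float above:
    # '' -> 0.0, '500m' -> 0.5 (millis), '2' -> 2.0.
    if not resource_str:
        return 0.0
    if resource_str.endswith('m'):
        return float(resource_str[:-1]) / 1000
    return float(resource_str)

assert to_float('') == 0.0
assert to_float('500m') == 0.5
assert to_float('2') == 2.0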
@@ -2738,7 +2814,8 @@ def merge_custom_metadata(
     config_utils.merge_k8s_configs(original_metadata, custom_metadata)
 
 
-
+@_retry_on_error(resource_type='runtimeclass')
+def check_nvidia_runtime_class(*, context: Optional[str] = None) -> bool:
     """Checks if the 'nvidia' RuntimeClass exists in the cluster"""
     # Fetch the list of available RuntimeClasses
     runtime_classes = kubernetes.node_api(context).list_runtime_class()
@@ -2965,41 +3042,24 @@ def get_kubernetes_node_info(
         label_keys = lf.get_label_keys()
 
     # Check if all nodes have no accelerators to avoid fetching pods
-
+    has_accelerator_nodes = False
     for node in nodes:
         accelerator_count = get_node_accelerator_count(context,
                                                        node.status.allocatable)
         if accelerator_count > 0:
-
+            has_accelerator_nodes = True
             break
 
-    # Get the
-    pods = None
+    # Get the allocated GPU quantity by each node
     allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
-
+    error_on_get_allocated_gpu_qty_by_node = False
+    if has_accelerator_nodes:
         try:
-
-
-            for pod in pods:
-                if pod.status.phase in ['Running', 'Pending']:
-                    # Skip pods that should not count against GPU count
-                    if should_exclude_pod_from_gpu_allocation(pod):
-                        logger.debug(f'Excluding low priority pod '
-                                     f'{pod.metadata.name} from GPU allocation '
-                                     f'calculations')
-                        continue
-                    # Iterate over all the containers in the pod and sum the
-                    # GPU requests
-                    pod_allocated_qty = 0
-                    for container in pod.spec.containers:
-                        if container.resources.requests:
-                            pod_allocated_qty += get_node_accelerator_count(
-                                context, container.resources.requests)
-                    if pod_allocated_qty > 0:
-                        allocated_qty_by_node[
-                            pod.spec.node_name] += pod_allocated_qty
+            allocated_qty_by_node = get_allocated_gpu_qty_by_node(
+                context=context)
         except kubernetes.api_exception() as e:
             if e.status == 403:
+                error_on_get_allocated_gpu_qty_by_node = True
                 pass
             else:
                 raise
@@ -3044,7 +3104,7 @@ def get_kubernetes_node_info(
                 ip_address=node_ip)
             continue
 
-        if
+        if not has_accelerator_nodes or error_on_get_allocated_gpu_qty_by_node:
            accelerators_available = -1
        else:
            allocated_qty = allocated_qty_by_node[node.metadata.name]
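In the node-info hunks above, -1 is used as a sentinel meaning "allocation unknown" (no accelerator nodes, or the pod listing failed with a 403). A small sketch of how a caller might compute per-node availability under that convention (names are illustrative, not the actual SkyPilot helpers):

from typing import Dict, Optional

def accelerators_available(capacity: int,
                           allocated_by_node: Optional[Dict[str, int]],
                           node_name: str) -> int:
    # Mirrors the logic above: if allocation info could not be fetched,
    # report -1 ("unknown") rather than guessing.
    if allocated_by_node is None:
        return -1
    return capacity - allocated_by_node.get(node_name, 0)

assert accelerators_available(8, {'node-a': 3}, 'node-a') == 5
assert accelerators_available(8, None, 'node-a') == -1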
@@ -3241,13 +3301,13 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
 
     try:
         pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
-            label_selector=
+            label_selector=provision_constants.TAG_SKYPILOT_CLUSTER_NAME,
             _request_timeout=kubernetes.API_TIMEOUT).items
     except kubernetes.max_retry_error():
         raise exceptions.ResourcesUnavailableError(
             'Timed out trying to get SkyPilot pods from Kubernetes cluster. '
             'Please check if the cluster is healthy and retry. To debug, run: '
-            'kubectl get pods --selector=skypilot-cluster --all-namespaces'
+            'kubectl get pods --selector=skypilot-cluster-name --all-namespaces'
         ) from None
     return pods
 
@@ -3384,7 +3444,8 @@ def process_skypilot_pods(
     serve_controllers: List[KubernetesSkyPilotClusterInfo] = []
 
     for pod in pods:
-        cluster_name_on_cloud = pod.metadata.labels.get(
+        cluster_name_on_cloud = pod.metadata.labels.get(
+            provision_constants.TAG_SKYPILOT_CLUSTER_NAME)
         cluster_name = cluster_name_on_cloud.rsplit(
             '-', 1
         )[0]  # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
@@ -5,6 +5,7 @@ from sky import global_user_state
 from sky import models
 from sky import sky_logging
 from sky.adaptors import kubernetes
+from sky.provision import constants
 from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import constants as k8s_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
@@ -75,7 +76,6 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
     """Deletes a volume."""
     context, namespace = _get_context_namespace(config)
     pvc_name = config.name_on_cloud
-    logger.info(f'Deleting PVC {pvc_name}')
     kubernetes_utils.delete_k8s_resource_with_retry(
         delete_func=lambda pvc_name=pvc_name: kubernetes.core_api(
             context).delete_namespaced_persistent_volume_claim(
@@ -84,6 +84,7 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
             _request_timeout=config_lib.DELETION_TIMEOUT),
         resource_type='pvc',
         resource_name=pvc_name)
+    logger.info(f'Deleted PVC {pvc_name} in namespace {namespace}')
     return config
 
 
@@ -128,7 +129,7 @@ def _get_volume_usedby(
             usedby_pods.append(pod.metadata.name)
             # Get the real cluster name
             cluster_name_on_cloud = pod.metadata.labels.get(
-
+                constants.TAG_SKYPILOT_CLUSTER_NAME)
             if cluster_name_on_cloud is None:
                 continue
             cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
@@ -205,7 +206,7 @@ def get_all_volumes_usedby(
                 used_by_pods[context][namespace][volume_name].append(
                     pod.metadata.name)
                 cluster_name_on_cloud = pod.metadata.labels.get(
-
+                    constants.TAG_SKYPILOT_CLUSTER_NAME)
                 if cluster_name_on_cloud is None:
                     continue
                 cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
@@ -242,9 +243,9 @@ def create_persistent_volume_claim(namespace: str, context: Optional[str],
     except kubernetes.api_exception() as e:
         if e.status != 404:  # Not found
             raise
-    logger.info(f'Creating PVC {pvc_name}')
     kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
         namespace=namespace, body=pvc_spec)
+    logger.info(f'Created PVC {pvc_name} in namespace {namespace}')
 
 
 def _get_pvc_spec(namespace: str,
@@ -232,9 +232,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
sky/provision/nebius/instance.py
CHANGED
@@ -254,9 +254,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(provider_config['region'],
                                   cluster_name_on_cloud, None)
sky/provision/oci/instance.py
CHANGED
@@ -36,6 +36,7 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """Query instances.
 
@@ -44,7 +45,7 @@ def query_instances(
     A None status means the instance is marked as "terminated"
     or "terminating".
     """
-    del cluster_name  #
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, cluster_name_on_cloud
     region = provider_config['region']
 
@@ -281,9 +281,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name, non_terminated_only  #unused
+    del cluster_name, non_terminated_only, retry_if_missing  #unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
sky/provision/provisioner.py
CHANGED
@@ -442,6 +442,14 @@ def _post_provision_setup(
         cluster_name.name_on_cloud,
         provider_config=provider_config)
 
+    # Update cluster info in handle so cluster instance ids are set. This
+    # allows us to expose provision logs to debug nodes that failed during post
+    # provision setup.
+    handle = global_user_state.get_handle_from_cluster_name(
+        cluster_name.display_name)
+    handle.cached_cluster_info = cluster_info
+    global_user_state.update_cluster_handle(cluster_name.display_name, handle)
+
     if cluster_info.num_instances > 1:
         # Only worker nodes have logs in the per-instance log directory. Head
         # node's log will be redirected to the main log file.
@@ -477,12 +485,13 @@ def _post_provision_setup(
     # ready by the provisioner, and we use kubectl instead of SSH to run the
     # commands and rsync on the pods. SSH will still be ready after a while
     # for the users to SSH into the pod.
-
+    is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
+    if not is_k8s_cloud:
         logger.debug(
             f'\nWaiting for SSH to be available for {cluster_name!r} ...')
         wait_for_ssh(cluster_info, ssh_credentials)
         logger.debug(f'SSH Connection ready for {cluster_name!r}')
-    vm_str = 'Instance' if
+    vm_str = 'Instance' if not is_k8s_cloud else 'Pod'
     plural = '' if len(cluster_info.instances) == 1 else 's'
     verb = 'is' if len(cluster_info.instances) == 1 else 'are'
     indent_str = (ux_utils.INDENT_SYMBOL
sky/provision/runpod/instance.py
CHANGED
@@ -222,9 +222,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
sky/provision/scp/instance.py
CHANGED
@@ -431,8 +431,9 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
sky/provision/seeweb/instance.py
CHANGED
@@ -9,7 +9,6 @@ import subprocess
 import time
 from typing import Any, Dict, List, Optional, Tuple
 
-from sky import authentication as auth
 from sky import sky_logging
 from sky.adaptors import seeweb as seeweb_adaptor
 from sky.provision import common
@@ -17,6 +16,7 @@ from sky.provision.common import ClusterInfo
 from sky.provision.common import InstanceInfo
 from sky.provision.common import ProvisionConfig
 from sky.provision.common import ProvisionRecord
+from sky.utils import auth_utils
 from sky.utils import command_runner  # Unified SSH helper
 from sky.utils import common_utils
 from sky.utils import status_lib
@@ -75,7 +75,7 @@ class SeewebNodeProvider:
         if self.config and self.config.authentication_config:
             key_path = self.config.authentication_config.get('ssh_private_key')
             if not key_path:
-                key_path, _ =
+                key_path, _ = auth_utils.get_or_generate_keys()
         return os.path.expanduser(key_path)
 
     # ------------------------------------------------------------------ #
@@ -661,7 +661,7 @@ def _ping_server_standalone(server_ip: str) -> bool:
 def _check_ssh_ready_standalone(server_ip: str) -> bool:
     """Check that SSH is available on the server (standalone version)."""
     try:
-        private_key_path, _ =
+        private_key_path, _ = auth_utils.get_or_generate_keys()
         private_key_path = os.path.expanduser(private_key_path)
         ssh_user = 'ecuser'
         result = subprocess.run([
@@ -0,0 +1,11 @@
+"""Shadeform provisioner."""
+
+from sky.provision.shadeform.config import bootstrap_instances
+from sky.provision.shadeform.instance import cleanup_ports
+from sky.provision.shadeform.instance import get_cluster_info
+from sky.provision.shadeform.instance import open_ports
+from sky.provision.shadeform.instance import query_instances
+from sky.provision.shadeform.instance import run_instances
+from sky.provision.shadeform.instance import stop_instances
+from sky.provision.shadeform.instance import terminate_instances
+from sky.provision.shadeform.instance import wait_instances
@@ -0,0 +1,12 @@
+"""Shadeform configuration bootstrapping."""
+
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del region, cluster_name  # unused
+
+    return config