skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly has been flagged as possibly problematic by the registry.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0

sky/provision/kubernetes/instance.py

@@ -5,7 +5,7 @@ import json
 import re
 import sys
 import time
-from typing import Any,
+from typing import Any, Dict, List, Optional, Tuple, Union

 from sky import exceptions
 from sky import global_user_state
@@ -33,6 +33,9 @@ from sky.utils.db import db_utils
 POLL_INTERVAL = 2
 _TIMEOUT_FOR_POD_TERMINATION = 60  # 1 minutes
 _MAX_RETRIES = 3
+_MAX_MISSING_PODS_RETRIES = 5
+_MAX_QUERY_INSTANCES_RETRIES = 5
+_QUERY_INSTANCES_RETRY_INTERVAL = .5
 _NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')

 # Pattern to extract SSH user from command output, handling MOTD contamination
@@ -81,7 +84,7 @@ def is_high_availability_cluster_by_kubectl(
             context).list_namespaced_deployment(
                 namespace,
                 label_selector=
-                f'{
+                f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
     except kubernetes.api_exception():
         return False
     # It is a high availability cluster if there is at least one deployment
@@ -425,11 +428,11 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
         # Get all pods in a single API call using the cluster name label
         # which all pods in new_nodes should share
         cluster_name_on_cloud = new_nodes[0].metadata.labels[
-
+            constants.TAG_SKYPILOT_CLUSTER_NAME]
         pods = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
             label_selector=
-            f'{
+            f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
         ).items

         # Get the set of found pod names and check if we have all expected pods
@@ -489,17 +492,17 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,


 @timeline.event
-def _wait_for_pods_to_run(namespace, context,
+def _wait_for_pods_to_run(namespace, context, cluster_name, new_pods):
     """Wait for pods and their containers to be ready.

     Pods may be pulling images or may be in the process of container
     creation.
     """
-    if not
+    if not new_pods:
         return

     # Create a set of pod names we're waiting for
-    expected_pod_names = {
+    expected_pod_names = {pod.metadata.name for pod in new_pods}

     def _check_init_containers(pod):
         # Check if any of the init containers failed
@@ -526,28 +529,62 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
                     'Failed to create init container for pod '
                     f'{pod.metadata.name}. Error details: {msg}.')

+    missing_pods_retry = 0
     while True:
         # Get all pods in a single API call
-
-
+        cluster_name_on_cloud = new_pods[0].metadata.labels[
+            constants.TAG_SKYPILOT_CLUSTER_NAME]
         all_pods = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
             label_selector=
-            f'{
+            f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+        ).items

         # Get the set of found pod names and check if we have all expected pods
         found_pod_names = {pod.metadata.name for pod in all_pods}
-
-        if
+        missing_pod_names = expected_pod_names - found_pod_names
+        if missing_pod_names:
+            # In _wait_for_pods_to_schedule, we already wait for all pods to go
+            # from pending to scheduled. So if a pod is missing here, it means
+            # something unusual must have happened, and so should be treated as
+            # an exception.
+            # It is also only in _wait_for_pods_to_schedule that
+            # provision_timeout is used.
+            # TODO(kevin): Should we take provision_timeout into account here,
+            # instead of hardcoding the number of retries?
+            if missing_pods_retry >= _MAX_MISSING_PODS_RETRIES:
+                for pod_name in missing_pod_names:
+                    reason = _get_pod_missing_reason(context, namespace,
+                                                     cluster_name, pod_name)
+                    logger.warning(f'Pod {pod_name} missing: {reason}')
+                raise config_lib.KubernetesError(
+                    f'Failed to get all pods after {missing_pods_retry} '
+                    f'retries. Some pods may have been terminated or failed '
+                    f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+                    'for more details.')
             logger.info('Retrying running pods check: '
-                        f'Missing pods: {
+                        f'Missing pods: {missing_pod_names}')
             time.sleep(0.5)
+            missing_pods_retry += 1
             continue

         all_pods_running = True
         for pod in all_pods:
             if pod.metadata.name not in expected_pod_names:
                 continue
+
+            # Check if pod is terminated/preempted/failed.
+            if (pod.metadata.deletion_timestamp is not None or
+                    pod.status.phase == 'Failed'):
+                # Get the reason and write to cluster events before
+                # the pod gets completely deleted from the API.
+                reason = _get_pod_termination_reason(pod, cluster_name)
+                logger.warning(f'Pod {pod.metadata.name} terminated: {reason}')
+                raise config_lib.KubernetesError(
+                    f'Pod {pod.metadata.name} has terminated or failed '
+                    f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+                    'for more details.')
+
             # Continue if pod and all the containers within the
             # pod are successfully created and running.
             if pod.status.phase == 'Running' and all(
@@ -583,31 +620,6 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
         time.sleep(1)


-def _run_function_with_retries(func: Callable,
-                               operation_name: str,
-                               max_retries: int = _MAX_RETRIES,
-                               retry_delay: int = 5) -> Any:
-    """Runs a function with retries on Kubernetes errors.
-    Args:
-        func: Function to retry
-        operation_name: Name of the operation for logging
-        max_retries: Maximum number of retry attempts
-        retry_delay: Delay between retries in seconds
-    Raises:
-        The last exception encountered if all retries fail.
-    """
-    for attempt in range(max_retries + 1):
-        try:
-            return func()
-        except config_lib.KubernetesError:
-            if attempt < max_retries:
-                logger.warning(f'Failed to {operation_name} - '
-                               f'retrying in {retry_delay} seconds.')
-                time.sleep(retry_delay)
-            else:
-                raise
-
-
 @timeline.event
 def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
     """Pre-initialization step for SkyPilot pods.
@@ -902,7 +914,7 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
     else:
         pod_spec['metadata']['labels'] = tags
     pod_spec['metadata']['labels'].update(
-        {
+        {constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})

     terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
                                                     ['Terminating'])
@@ -954,7 +966,7 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
     nvidia_runtime_exists = False
     try:
         nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
-            context)
+            context=context)
     except kubernetes.kubernetes.client.ApiException as e:
         logger.warning('run_instances: Error occurred while checking for '
                        f'nvidia RuntimeClass - '
@@ -984,12 +996,19 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,

     def _create_resource_thread(i: int):
         pod_spec_copy = copy.deepcopy(pod_spec)
-
-
-
-
-
-
+        # 0 is for head pod, while 1+ is for worker pods.
+        if i == 0:
+            if head_pod_name is None:
+                # First pod should be head if no head exists
+                pod_spec_copy['metadata']['labels'].update(
+                    constants.HEAD_NODE_TAGS)
+                head_selector = _head_service_selector(cluster_name_on_cloud)
+                pod_spec_copy['metadata']['labels'].update(head_selector)
+                pod_spec_copy['metadata'][
+                    'name'] = f'{cluster_name_on_cloud}-head'
+            else:
+                # If head pod already exists, we skip creating it.
+                return
         else:
             # Worker pods
             pod_spec_copy['metadata']['labels'].update(
@@ -1035,7 +1054,7 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
                     'podAffinityTerm': {
                         'labelSelector': {
                             'matchExpressions': [{
-                                'key':
+                                'key': constants.TAG_SKYPILOT_CLUSTER_NAME,
                                 'operator': 'In',
                                 'values': [cluster_name_on_cloud]
                             }]
@@ -1130,9 +1149,16 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
                    'and then up the cluster again.')
         raise exceptions.InconsistentHighAvailabilityError(message)

-
-
-
+    created_resources = []
+    if to_start_count > 0:
+        # Create pods in parallel.
+        # Use `config.count` instead of `to_start_count` to keep the index of
+        # the Pods consistent especially for the case where some Pods are down
+        # due to node failure or manual termination, etc. and then launch
+        # again to create the Pods back.
+        # The existing Pods will be skipped in _create_resource_thread.
+        created_resources = subprocess_utils.run_in_parallel(
+            _create_resource_thread, list(range(config.count)), _NUM_THREADS)

     if to_create_deployment:
         deployments = copy.deepcopy(created_resources)
@@ -1180,7 +1206,7 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
     # fail early if there is an error
     logger.debug(f'run_instances: waiting for pods to be running (pulling '
                  f'images): {[pod.metadata.name for pod in pods]}')
-    _wait_for_pods_to_run(namespace, context, pods)
+    _wait_for_pods_to_run(namespace, context, cluster_name, pods)
     logger.debug(f'run_instances: all pods are scheduled and running: '
                  f'{[pod.metadata.name for pod in pods]}')

@@ -1375,6 +1401,9 @@ def get_cluster_info(
                 external_ip=None,
                 ssh_port=port,
                 tags=pod.metadata.labels,
+                # TODO(hailong): `cluster.local` may need to be configurable
+                # Service name is same as the pod name for now.
+                internal_svc=f'{pod_name}.{namespace}.svc.cluster.local',
             )
         ]
         if _is_head(pod):
@@ -1413,6 +1442,13 @@ def get_cluster_info(
     logger.debug(
         f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}')

+    # cpu_request may be a string like `100m`, need to parse and convert
+    num_cpus = kubernetes_utils.parse_cpu_or_gpu_resource_to_float(cpu_request)
+    # 'num-cpus' for ray must be an integer, but we should not set it to 0 if
+    # cpus is <1.
+    # Keep consistent with the logic in clouds/kubernetes.py
+    str_cpus = str(max(int(num_cpus), 1))
+
     return common.ClusterInfo(
         instances=pods,
         head_instance_id=head_pod_name,
@@ -1422,16 +1458,52 @@
         # problems for other pods.
         custom_ray_options={
             'object-store-memory': 500000000,
-            'num-cpus':
+            'num-cpus': str_cpus,
         },
         provider_name='kubernetes',
         provider_config=provider_config)


 def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
-    """Get pod termination reason and write to cluster events.
-
+    """Get pod termination reason and write to cluster events.
+
+    Checks both pod conditions (for preemption/disruption) and
+    container statuses (for exit codes/errors).
+    """
     latest_timestamp = pod.status.start_time or datetime.datetime.min
+    ready_state = 'Unknown'
+    termination_reason = 'Terminated unexpectedly'
+    container_reasons = []
+
+    # Check pod status conditions for high level overview.
+    # No need to sort, as each condition.type will only appear once.
+    for condition in pod.status.conditions:
+        reason = condition.reason or 'Unknown reason'
+        message = condition.message or ''
+
+        # Get last known readiness state.
+        if condition.type == 'Ready':
+            ready_state = f'{reason} ({message})' if message else reason
+        # Kueue preemption, as defined in:
+        # https://pkg.go.dev/sigs.k8s.io/kueue/pkg/controller/jobs/pod#pkg-constants
+        elif condition.type == 'TerminationTarget':
+            termination_reason = f'Preempted by Kueue: {reason}'
+            if message:
+                termination_reason += f' ({message})'
+        # Generic disruption.
+        elif condition.type == 'DisruptionTarget':
+            termination_reason = f'Disrupted: {reason}'
+            if message:
+                termination_reason += f' ({message})'
+
+        if condition.last_transition_time is not None:
+            latest_timestamp = max(latest_timestamp,
+                                   condition.last_transition_time)
+
+    pod_reason = (f'{termination_reason}.\n'
+                  f'Last known state: {ready_state}.')
+
+    # Check container statuses for exit codes/errors
     if pod.status and pod.status.container_statuses:
         for container_status in pod.status.container_statuses:
             terminated = container_status.state.terminated
@@ -1446,18 +1518,15 @@ def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
                 if reason is None:
                     # just in-case reason is None, have default for debugging
                     reason = f'exit({exit_code})'
-
-
-                latest_timestamp = terminated.finished_at
+                container_reasons.append(reason)
+                latest_timestamp = max(latest_timestamp, terminated.finished_at)

     # TODO (kyuds): later, if needed, query `last_state` too.

-    if not reasons:
-        return ''
-
     # Normally we will have a single container per pod for skypilot
     # but doing this just in-case there are multiple containers.
-
+    if container_reasons:
+        pod_reason += f'\nContainer errors: {" | ".join(container_reasons)}'

     global_user_state.add_cluster_event(
         cluster_name,
@@ -1602,35 +1671,50 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
     return failure_reason


-def
-
-
-
-        non_terminated_only: bool = True
-) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
-    # Mapping from pod phase to skypilot status. These are the only valid pod
-    # phases.
-    # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
-    status_map = {
-        'Pending': status_lib.ClusterStatus.INIT,
-        'Running': status_lib.ClusterStatus.UP,
-        'Failed': status_lib.ClusterStatus.INIT,
-        'Unknown': None,
-        'Succeeded': None,
-    }
-
-    assert provider_config is not None
-    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
-    context = kubernetes_utils.get_context_from_config(provider_config)
-    is_ssh = context.startswith('ssh-') if context else False
-    identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
-
-    # Get all the pods with the label skypilot-cluster: <cluster_name>
+def list_namespaced_pod(context: Optional[str], namespace: str,
+                        cluster_name_on_cloud: str, is_ssh: bool, identity: str,
+                        label_selector: str) -> List[Any]:
+    # Get all the pods with the label skypilot-cluster-name: <cluster_name>
     try:
-
+        # log the query parameters we pass to the k8s api
+        logger.debug(f'Querying k8s api for pods:\n'
+                     f'context: {context}\n'
+                     f'namespace: {namespace}\n'
+                     f'label selector:`{label_selector}`.')
+
+        response = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
-            label_selector=
-            _request_timeout=kubernetes.API_TIMEOUT)
+            label_selector=label_selector,
+            _request_timeout=kubernetes.API_TIMEOUT)
+
+        # log PodList response info
+        if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+            logger.debug(f'k8s api response for `{label_selector}`:\n'
+                         f'apiVersion={response.api_version}, '
+                         f'kind={response.kind},\n'
+                         f'metadata={response.metadata}')
+
+        pods = response.items
+
+        # log detailed Pod info
+        if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+            logger.debug(f'k8s api response for `{label_selector}`: '
+                         f'len(pods)={len(pods)}')
+            for pod in pods:
+                logger.debug(f'k8s pod info for `{label_selector}`: '
+                             f'pod.apiVersion={pod.api_version}, '
+                             f'pod.kind={pod.kind}, \n'
+                             f'pod.name={pod.metadata.name}, '
+                             f'pod.namespace={pod.metadata.namespace}, \n'
+                             f'pod.labels={pod.metadata.labels}, \n'
+                             f'pod.annotations={pod.metadata.annotations}, \n'
+                             'pod.creationTimestamp='
+                             f'{pod.metadata.creation_timestamp}, '
+                             'pod.deletionTimestamp='
+                             f'{pod.metadata.deletion_timestamp}, \n'
+                             f'pod.status={pod.status}')
+        return pods
+
     except kubernetes.max_retry_error():
         with ux_utils.print_exception_no_traceback():
             if is_ssh:
@@ -1654,14 +1738,63 @@ def query_instances(
                     f'Failed to query {identity} {cluster_name_on_cloud!r} '
                     f'status: {common_utils.format_exception(e)}')

+
+def query_instances(
+    cluster_name: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+    # Mapping from pod phase to skypilot status. These are the only valid pod
+    # phases.
+    # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
+    status_map = {
+        'Pending': status_lib.ClusterStatus.INIT,
+        'Running': status_lib.ClusterStatus.UP,
+        'Failed': status_lib.ClusterStatus.INIT,
+        'Unknown': None,
+        'Succeeded': None,
+    }
+
+    assert provider_config is not None
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    context = kubernetes_utils.get_context_from_config(provider_config)
+    is_ssh = context.startswith('ssh-') if context else False
+    identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
+    label_selector = (f'{constants.TAG_SKYPILOT_CLUSTER_NAME}='
+                      f'{cluster_name_on_cloud}')
+
+    attempts = 0
+    pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
+                               is_ssh, identity, label_selector)
+    # When we see no pods returned from the k8s api, we assume the pods have
+    # been terminated by the user directly and mark the cluster as terminated
+    # in the global user state.
+    # We add retry logic here as an attempt to mitigate a leak caused by the
+    # kubernetes api returning no pods despite the pods actually existing.
+    while (retry_if_missing and not pods and
+           attempts < _MAX_QUERY_INSTANCES_RETRIES):
+        logger.debug(f'Retrying to query k8s api for {cluster_name_on_cloud} '
+                     f'{attempts}/{_MAX_QUERY_INSTANCES_RETRIES} times.'
+                     f'after {_QUERY_INSTANCES_RETRY_INTERVAL} seconds.')
+        time.sleep(_QUERY_INSTANCES_RETRY_INTERVAL)
+        attempts += 1
+        pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
+                                   is_ssh, identity, label_selector)
+        if len(pods) > 0:
+            logger.info(f'Found {len(pods)} pods for {label_selector} after'
+                        f'{attempts} retries.')
+
     # Check if the pods are running or pending
     cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
                                     Optional[str]]] = {}
     for pod in pods:
         phase = pod.status.phase
+        is_terminating = pod.metadata.deletion_timestamp is not None
         pod_status = status_map[phase]
         reason = None
-        if phase in ('Failed', 'Unknown'):
+        if phase in ('Failed', 'Unknown') or is_terminating:
             reason = _get_pod_termination_reason(pod, cluster_name)
         logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
         if non_terminated_only and pod_status is None:
sky/provision/kubernetes/network.py

@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional
 from sky import sky_logging
 from sky.adaptors import kubernetes
 from sky.provision import common
+from sky.provision import constants as provision_constants
 from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.utils import kubernetes_enums
@@ -48,12 +49,14 @@ def _open_ports_using_loadbalancer(
     service_name = _LOADBALANCER_SERVICE_NAME.format(
         cluster_name_on_cloud=cluster_name_on_cloud)
     context = kubernetes_utils.get_context_from_config(provider_config)
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+
     content = network_utils.fill_loadbalancer_template(
-        namespace=
+        namespace=namespace,
         context=context,
         service_name=service_name,
         ports=ports,
-        selector_key=
+        selector_key=provision_constants.TAG_SKYPILOT_CLUSTER_NAME,
         selector_value=cluster_name_on_cloud,
     )

@@ -103,11 +106,11 @@ def _open_ports_using_ingress(
     # To avoid this, we change ingress creation into one object containing
     # multiple rules.
     content = network_utils.fill_ingress_template(
-        namespace=
+        namespace=namespace,
         context=context,
         service_details=service_details,
         ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress',
-        selector_key=
+        selector_key=provision_constants.TAG_SKYPILOT_CLUSTER_NAME,
         selector_value=cluster_name_on_cloud,
     )

@@ -165,9 +168,10 @@ def _cleanup_ports_for_loadbalancer(
     # TODO(aylei): test coverage
     context = provider_config.get(
         'context', kubernetes_utils.get_current_kube_config_context_name())
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     network_utils.delete_namespaced_service(
         context=context,
-        namespace=
+        namespace=namespace,
         service_name=service_name,
     )

@@ -180,19 +184,19 @@ def _cleanup_ports_for_ingress(
     # Delete services for each port
     context = provider_config.get(
         'context', kubernetes_utils.get_current_kube_config_context_name())
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     for port in ports:
         service_name = f'{cluster_name_on_cloud}--skypilot-svc--{port}'
         network_utils.delete_namespaced_service(
             context=context,
-            namespace=
-            kubernetes_utils.DEFAULT_NAMESPACE),
+            namespace=namespace,
             service_name=service_name,
         )

     # Delete the single ingress used for all ports
     ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress'
     network_utils.delete_namespaced_ingress(
-        namespace=
+        namespace=namespace,
         context=kubernetes_utils.get_context_from_config(provider_config),
         ingress_name=ingress_name,
     )