skypilot-nightly 1.0.0.dev20250919__py3-none-any.whl → 1.0.0.dev20250925__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend.py +10 -0
- sky/backends/backend_utils.py +200 -78
- sky/backends/cloud_vm_ray_backend.py +37 -13
- sky/backends/local_docker_backend.py +9 -0
- sky/client/cli/command.py +104 -53
- sky/client/sdk.py +13 -5
- sky/client/sdk_async.py +4 -2
- sky/clouds/kubernetes.py +2 -1
- sky/clouds/runpod.py +20 -7
- sky/core.py +7 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-b2a3938c22b6647b.js → webpack-16ba1d7187d2e3b1.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +19 -10
- sky/execution.py +4 -2
- sky/global_user_state.py +224 -38
- sky/jobs/client/sdk.py +10 -1
- sky/jobs/controller.py +7 -7
- sky/jobs/server/core.py +3 -3
- sky/jobs/server/server.py +15 -11
- sky/jobs/utils.py +1 -1
- sky/logs/agent.py +30 -3
- sky/logs/aws.py +9 -19
- sky/provision/__init__.py +2 -1
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/cudo/instance.py +2 -2
- sky/provision/do/instance.py +2 -2
- sky/provision/docker_utils.py +41 -19
- sky/provision/fluidstack/instance.py +2 -2
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +134 -8
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -2
- sky/provision/primeintellect/instance.py +2 -2
- sky/provision/provisioner.py +1 -0
- sky/provision/runpod/instance.py +2 -2
- sky/provision/scp/instance.py +2 -2
- sky/provision/seeweb/instance.py +2 -1
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +6 -5
- sky/schemas/api/responses.py +2 -1
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +45 -19
- sky/serve/replica_managers.py +12 -5
- sky/serve/serve_utils.py +5 -11
- sky/serve/server/core.py +9 -6
- sky/serve/server/impl.py +78 -25
- sky/serve/server/server.py +4 -5
- sky/serve/service_spec.py +33 -0
- sky/server/auth/oauth2_proxy.py +2 -2
- sky/server/constants.py +1 -1
- sky/server/daemons.py +2 -3
- sky/server/requests/executor.py +56 -6
- sky/server/requests/payloads.py +31 -8
- sky/server/requests/preconditions.py +2 -3
- sky/server/rest.py +2 -0
- sky/server/server.py +28 -19
- sky/server/stream_utils.py +34 -12
- sky/setup_files/dependencies.py +12 -2
- sky/setup_files/setup.py +44 -44
- sky/skylet/constants.py +2 -3
- sky/templates/kubernetes-ray.yml.j2 +16 -15
- sky/usage/usage_lib.py +3 -0
- sky/utils/cli_utils/status_utils.py +4 -5
- sky/utils/context.py +104 -29
- sky/utils/controller_utils.py +7 -6
- sky/utils/kubernetes/create_cluster.sh +13 -28
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/kubernetes_deploy_utils.py +170 -37
- sky/utils/kubernetes_enums.py +5 -0
- sky/utils/ux_utils.py +35 -1
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +44 -8
- sky/volumes/server/server.py +33 -7
- sky/volumes/volume.py +22 -14
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/METADATA +38 -33
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/RECORD +109 -109
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
- /sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/top_level.txt +0 -0
|
@@ -4,6 +4,7 @@ import shlex
|
|
|
4
4
|
import subprocess
|
|
5
5
|
import sys
|
|
6
6
|
import tempfile
|
|
7
|
+
import textwrap
|
|
7
8
|
from typing import List, Optional
|
|
8
9
|
|
|
9
10
|
import colorama
|
|
@@ -24,6 +25,9 @@ logger = sky_logging.init_logger(__name__)
|
|
|
24
25
|
|
|
25
26
|
# Default path for Kubernetes configuration file
|
|
26
27
|
DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
|
|
28
|
+
DEFAULT_LOCAL_CLUSTER_NAME = 'skypilot'
|
|
29
|
+
LOCAL_CLUSTER_PORT_RANGE = 101
|
|
30
|
+
LOCAL_CLUSTER_INTERNAL_PORT_START = 30000
|
|
27
31
|
|
|
28
32
|
|
|
29
33
|
def check_ssh_cluster_dependencies(
|
|
@@ -252,7 +256,68 @@ def deploy_remote_cluster(ip_list: List[str],
|
|
|
252
256
|
is_local=True))
|
|
253
257
|
|
|
254
258
|
|
|
255
|
-
def
|
|
259
|
+
def generate_kind_config(port_start: int,
|
|
260
|
+
num_nodes: int = 1,
|
|
261
|
+
gpus: bool = False) -> str:
|
|
262
|
+
"""Generate a kind cluster config with ports mapped from host to container
|
|
263
|
+
|
|
264
|
+
Port range will be [port_start, port_start + LOCAL_CLUSTER_PORT_RANGE)
|
|
265
|
+
Internally, this will map to ports 30000 - 30100
|
|
266
|
+
|
|
267
|
+
Args:
|
|
268
|
+
path: Path to generate the config file at
|
|
269
|
+
port_start: Port range start for mappings
|
|
270
|
+
num_nodes: Number of nodes in the cluster
|
|
271
|
+
gpus: If true, initialize kind cluster with GPU support
|
|
272
|
+
|
|
273
|
+
Returns:
|
|
274
|
+
The kind cluster config
|
|
275
|
+
"""
|
|
276
|
+
internal_start = LOCAL_CLUSTER_INTERNAL_PORT_START
|
|
277
|
+
internal_end = internal_start + LOCAL_CLUSTER_PORT_RANGE - 1
|
|
278
|
+
|
|
279
|
+
config = textwrap.dedent(f"""
|
|
280
|
+
apiVersion: kind.x-k8s.io/v1alpha4
|
|
281
|
+
kind: Cluster
|
|
282
|
+
kubeadmConfigPatches:
|
|
283
|
+
- |
|
|
284
|
+
kind: ClusterConfiguration
|
|
285
|
+
apiServer:
|
|
286
|
+
extraArgs:
|
|
287
|
+
"service-node-port-range": {internal_start}-{internal_end}
|
|
288
|
+
nodes:
|
|
289
|
+
- role: control-plane
|
|
290
|
+
kubeadmConfigPatches:
|
|
291
|
+
- |
|
|
292
|
+
kind: InitConfiguration
|
|
293
|
+
nodeRegistration:
|
|
294
|
+
kubeletExtraArgs:
|
|
295
|
+
node-labels: "ingress-ready=true"
|
|
296
|
+
""")
|
|
297
|
+
if gpus:
|
|
298
|
+
config += textwrap.indent(
|
|
299
|
+
textwrap.dedent("""
|
|
300
|
+
extraMounts:
|
|
301
|
+
- hostPath: /dev/null
|
|
302
|
+
containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
|
|
303
|
+
config += textwrap.indent(textwrap.dedent("""
|
|
304
|
+
extraPortMappings:"""), ' ' * 2)
|
|
305
|
+
for offset in range(LOCAL_CLUSTER_PORT_RANGE):
|
|
306
|
+
config += textwrap.indent(
|
|
307
|
+
textwrap.dedent(f"""
|
|
308
|
+
- containerPort: {internal_start + offset}
|
|
309
|
+
hostPort: {port_start + offset}
|
|
310
|
+
listenAddress: "0.0.0.0"
|
|
311
|
+
protocol: tcp
|
|
312
|
+
"""), ' ' * 2)
|
|
313
|
+
if num_nodes > 1:
|
|
314
|
+
config += '- role: worker\n' * (num_nodes - 1)
|
|
315
|
+
return config
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def deploy_local_cluster(name: Optional[str], gpus: bool):
|
|
319
|
+
name = name or DEFAULT_LOCAL_CLUSTER_NAME
|
|
320
|
+
context_name = f'kind-{name}'
|
|
256
321
|
cluster_created = False
|
|
257
322
|
|
|
258
323
|
# Check if GPUs are available on the host
|
|
@@ -262,41 +327,57 @@ def deploy_local_cluster(gpus: bool):
|
|
|
262
327
|
# Check if ~/.kube/config exists:
|
|
263
328
|
if os.path.exists(os.path.expanduser('~/.kube/config')):
|
|
264
329
|
curr_context = kubernetes_utils.get_current_kube_config_context_name()
|
|
265
|
-
|
|
266
|
-
if curr_context is not None and curr_context != skypilot_context:
|
|
330
|
+
if curr_context is not None and curr_context != context_name:
|
|
267
331
|
logger.info(
|
|
268
332
|
f'Current context in kube config: {curr_context}'
|
|
269
|
-
'\nWill automatically switch to
|
|
270
|
-
'cluster is created.')
|
|
271
|
-
message_str = 'Creating local cluster{}...'
|
|
272
|
-
message_str = message_str.format(
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
333
|
+
f'\nWill automatically switch to {context_name} after the '
|
|
334
|
+
'local cluster is created.')
|
|
335
|
+
message_str = 'Creating local cluster {}{}...'
|
|
336
|
+
message_str = message_str.format(
|
|
337
|
+
name,
|
|
338
|
+
' with GPU support (this may take up to 15 minutes)' if gpus else '')
|
|
339
|
+
|
|
340
|
+
with tempfile.NamedTemporaryFile(mode='w+', suffix='.yaml',
|
|
341
|
+
delete=True) as f:
|
|
342
|
+
# Choose random port range to use on the host machine.
|
|
343
|
+
# Port range is [port_start, port_start + LOCAL_CLUSTER_PORT_RANGE), i.e. 101 ports.
|
|
344
|
+
# port_start = random.randint(300, 399) * 100
|
|
345
|
+
# TODO (kyuds): hard coding to pass smoketests. Need to figure out
|
|
346
|
+
# how to deal with this later.
|
|
347
|
+
port_start = LOCAL_CLUSTER_INTERNAL_PORT_START
|
|
348
|
+
port_end = port_start + LOCAL_CLUSTER_PORT_RANGE - 1
|
|
349
|
+
logger.debug(f'Using port range {port_start}-{port_end}')
|
|
350
|
+
f.write(generate_kind_config(port_start, gpus=gpus))
|
|
351
|
+
f.flush()
|
|
352
|
+
|
|
353
|
+
path_to_package = os.path.dirname(__file__)
|
|
354
|
+
up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
|
|
355
|
+
|
|
356
|
+
# Get directory of script and run it from there
|
|
357
|
+
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
|
358
|
+
run_command = f'{up_script_path} {name} {f.name}'
|
|
359
|
+
if gpus:
|
|
360
|
+
run_command += ' --gpus'
|
|
361
|
+
run_command = shlex.split(run_command)
|
|
281
362
|
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
363
|
+
# Setup logging paths
|
|
364
|
+
run_timestamp = sky_logging.get_run_timestamp()
|
|
365
|
+
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
|
366
|
+
'local_up.log')
|
|
367
|
+
logger.info(message_str)
|
|
287
368
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
369
|
+
with rich_utils.safe_status(
|
|
370
|
+
ux_utils.spinner_message(message_str,
|
|
371
|
+
log_path=log_path,
|
|
372
|
+
is_local=True)):
|
|
373
|
+
returncode, _, stderr = log_lib.run_with_log(
|
|
374
|
+
cmd=run_command,
|
|
375
|
+
log_path=log_path,
|
|
376
|
+
require_outputs=True,
|
|
377
|
+
stream_logs=False,
|
|
378
|
+
line_processor=log_utils.SkyLocalUpLineProcessor(
|
|
379
|
+
log_path=log_path, is_local=True),
|
|
380
|
+
cwd=cwd)
|
|
300
381
|
|
|
301
382
|
# Kind always writes to stderr even if it succeeds.
|
|
302
383
|
# If the failure happens after the cluster is created, we need
|
|
@@ -309,11 +390,11 @@ def deploy_local_cluster(gpus: bool):
|
|
|
309
390
|
elif returncode == 100:
|
|
310
391
|
logger.info(
|
|
311
392
|
ux_utils.finishing_message(
|
|
312
|
-
'Local cluster already exists.\n',
|
|
393
|
+
f'Local cluster {name} already exists.\n',
|
|
313
394
|
log_path=log_path,
|
|
314
395
|
is_local=True,
|
|
315
396
|
follow_up_message=
|
|
316
|
-
'If you want to delete it instead, run: sky local down'))
|
|
397
|
+
'If you want to delete it instead, run: `sky local down --name {name}`')) # pylint: disable=line-too-long
|
|
317
398
|
else:
|
|
318
399
|
with ux_utils.print_exception_no_traceback():
|
|
319
400
|
log_hint = ux_utils.log_path_hint(log_path, is_local=True)
|
|
@@ -339,7 +420,7 @@ def deploy_local_cluster(gpus: bool):
|
|
|
339
420
|
if gpus:
|
|
340
421
|
# Get GPU model by querying the node labels
|
|
341
422
|
label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
|
|
342
|
-
gpu_type_cmd = f'kubectl get node
|
|
423
|
+
gpu_type_cmd = f'kubectl get node {name}-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
|
|
343
424
|
try:
|
|
344
425
|
# Run the command and capture the output
|
|
345
426
|
gpu_count_output = subprocess.check_output(gpu_type_cmd,
|
|
@@ -375,8 +456,9 @@ def deploy_local_cluster(gpus: bool):
|
|
|
375
456
|
'This may cause issues with running tasks.')
|
|
376
457
|
logger.info(
|
|
377
458
|
ux_utils.finishing_message(
|
|
378
|
-
message=(
|
|
379
|
-
|
|
459
|
+
message=(
|
|
460
|
+
f'Local Kubernetes cluster {name} created successfully '
|
|
461
|
+
f'with {num_cpus} CPUs{gpu_message}.'),
|
|
380
462
|
log_path=log_path,
|
|
381
463
|
is_local=True,
|
|
382
464
|
follow_up_message=(
|
|
@@ -384,3 +466,54 @@ def deploy_local_cluster(gpus: bool):
|
|
|
384
466
|
'Hint: To change the number of CPUs, change your docker '
|
|
385
467
|
'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
|
|
386
468
|
f'{gpu_hint}')))
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def teardown_local_cluster(name: Optional[str] = None):
    """Remove a local cluster by running the bundled ``delete_cluster.sh``.

    After a successful removal, re-runs the Kubernetes compute capability
    check and logs a completion message.

    Args:
        name: Name of the local cluster to remove. When None or empty,
            DEFAULT_LOCAL_CLUSTER_NAME is used.

    Raises:
        RuntimeError: If the script exits with a code other than 0 or 100.
    """
    name = name or DEFAULT_LOCAL_CLUSTER_NAME
    cluster_removed = False

    path_to_package = os.path.dirname(__file__)
    down_script_path = os.path.join(path_to_package, 'delete_cluster.sh')

    # Run the script from the directory it lives in.
    cwd = os.path.dirname(os.path.abspath(down_script_path))
    # Build argv directly instead of formatting a string and re-splitting it:
    # a script path or cluster name containing whitespace or quotes would
    # otherwise be tokenized into the wrong arguments.
    run_command = [down_script_path, name]

    # Setup logging paths
    run_timestamp = sky_logging.get_run_timestamp()
    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
                            'local_down.log')

    with rich_utils.safe_status(
            ux_utils.spinner_message(f'Removing local cluster {name}',
                                     log_path=log_path,
                                     is_local=True)):

        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
                                                          log_path=log_path,
                                                          require_outputs=True,
                                                          stream_logs=False,
                                                          cwd=cwd)
        # Drop this line from stderr so it does not show up in the error
        # report below.
        stderr = stderr.replace('No kind clusters found.\n', '')

        if returncode == 0:
            cluster_removed = True
        elif returncode == 100:
            # Exit code 100 is reported to the user as "cluster does not
            # exist" and is not treated as a failure.
            logger.info(
                ux_utils.error_message(f'Local cluster {name} does not exist.'))
        else:
            with ux_utils.print_exception_no_traceback():
                raise RuntimeError(f'Failed to down local cluster {name}. '
                                   f'Stdout: {stdout}'
                                   f'\nError: {stderr}')
    if cluster_removed:
        # Run sky check
        with rich_utils.safe_status(
                ux_utils.spinner_message('Running sky check...')):
            sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
                                       clouds=['kubernetes'],
                                       quiet=True)
        logger.info(
            ux_utils.finishing_message(f'Local cluster {name} removed.',
                                       log_path=log_path,
                                       is_local=True))
|
sky/utils/kubernetes_enums.py
CHANGED
|
@@ -44,3 +44,8 @@ class KubernetesAutoscalerType(enum.Enum):
|
|
|
44
44
|
KARPENTER = 'karpenter'
|
|
45
45
|
COREWEAVE = 'coreweave'
|
|
46
46
|
GENERIC = 'generic'
|
|
47
|
+
|
|
48
|
+
def emits_autoscale_event(self) -> bool:
    """Tell whether this autoscaler emits the TriggeredScaleUp event reason.

    Every member other than KARPENTER is reported as emitting it.
    """
    return self != type(self).KARPENTER
|
sky/utils/ux_utils.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
"""Utility functions for UX."""
|
|
2
2
|
import contextlib
|
|
3
3
|
import enum
|
|
4
|
+
import fnmatch
|
|
4
5
|
import os
|
|
5
6
|
import sys
|
|
6
7
|
import traceback
|
|
7
8
|
import typing
|
|
8
|
-
from typing import Callable, Optional, Union
|
|
9
|
+
from typing import Callable, Iterable, List, Optional, Union
|
|
9
10
|
|
|
10
11
|
import colorama
|
|
11
12
|
|
|
@@ -288,3 +289,36 @@ def command_hint_messages(hint_type: CommandHintType,
|
|
|
288
289
|
f'{BOLD}sky jobs queue{RESET_BOLD}')
|
|
289
290
|
else:
|
|
290
291
|
raise ValueError(f'Invalid hint type: {hint_type}')
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def is_glob_pattern(pattern: str) -> bool:
    """Checks if a string contains common glob pattern wildcards.

    Args:
        pattern: The string to inspect.

    Returns:
        True if ``pattern`` contains any of ``*``, ``?``, ``[`` or ``]``;
        False otherwise (including for the empty string).
    """
    glob_chars = {'*', '?', '[', ']'}
    # A '**' globstar needs no dedicated check: any string containing '**'
    # also contains '*'.
    return not glob_chars.isdisjoint(pattern)
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def get_non_matched_query(query_clusters: Iterable[str],
                          cluster_names: Iterable[str]) -> List[str]:
    """Gets the queries that match none of the given cluster names.

    Queries that are glob patterns (per ``is_glob_pattern``) are matched with
    ``fnmatch``; all other queries must equal a cluster name exactly.

    Args:
        query_clusters: Names or glob patterns requested by the user.
        cluster_names: Names of the clusters that exist.

    Returns:
        The unmatched queries: exact-name misses first (in query order),
        followed by glob patterns that matched nothing (in query order).
    """
    # Materialize once: ``cluster_names`` is only promised to be an Iterable,
    # and it is traversed once per query below. A one-shot iterator would be
    # exhausted after the first traversal.
    known_names = list(cluster_names)
    known_name_set = set(known_names)

    exact_queries: List[str] = []
    glob_queries: List[str] = []
    for query in query_clusters:
        if is_glob_pattern(query):
            glob_queries.append(query)
        else:
            exact_queries.append(query)

    not_found_clusters = [
        query for query in exact_queries if query not in known_name_set
    ]
    not_found_clusters.extend(
        query for query in glob_queries
        if not fnmatch.filter(known_names, query))
    return not_found_clusters
|
sky/utils/yaml_utils.py
CHANGED
|
@@ -44,6 +44,15 @@ def read_yaml(path: Optional[str]) -> Dict[str, Any]:
|
|
|
44
44
|
return config
|
|
45
45
|
|
|
46
46
|
|
|
47
|
+
def read_yaml_str(yaml_str: str) -> Dict[str, Any]:
    """Parses a YAML string with ``safe_load``.

    Returns the parsed value, or an empty dict when the parsed value is
    falsy (e.g. the input is empty).
    """
    return safe_load(io.StringIO(yaml_str)) or {}
|
|
54
|
+
|
|
55
|
+
|
|
47
56
|
def read_yaml_all_str(yaml_str: str) -> List[Dict[str, Any]]:
|
|
48
57
|
stream = io.StringIO(yaml_str)
|
|
49
58
|
config = safe_load_all(stream)
|
sky/volumes/client/sdk.py
CHANGED
|
@@ -3,13 +3,16 @@ import json
|
|
|
3
3
|
import typing
|
|
4
4
|
from typing import Any, Dict, List
|
|
5
5
|
|
|
6
|
+
from sky import exceptions
|
|
6
7
|
from sky import sky_logging
|
|
7
8
|
from sky.adaptors import common as adaptors_common
|
|
8
9
|
from sky.server import common as server_common
|
|
10
|
+
from sky.server import versions
|
|
9
11
|
from sky.server.requests import payloads
|
|
10
12
|
from sky.usage import usage_lib
|
|
11
13
|
from sky.utils import annotations
|
|
12
14
|
from sky.utils import context
|
|
15
|
+
from sky.utils import ux_utils
|
|
13
16
|
from sky.volumes import volume as volume_lib
|
|
14
17
|
|
|
15
18
|
if typing.TYPE_CHECKING:
|
|
@@ -71,12 +74,44 @@ def apply(volume: volume_lib.Volume) -> server_common.RequestId[None]:
|
|
|
71
74
|
config=volume.config,
|
|
72
75
|
labels=volume.labels,
|
|
73
76
|
)
|
|
74
|
-
response =
|
|
75
|
-
|
|
76
|
-
cookies=server_common.get_api_cookie_jar())
|
|
77
|
+
response = server_common.make_authenticated_request(
|
|
78
|
+
'POST', '/volumes/apply', json=json.loads(body.model_dump_json()))
|
|
77
79
|
return server_common.get_request_id(response)
|
|
78
80
|
|
|
79
81
|
|
|
82
|
+
@context.contextual
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
@versions.minimal_api_version(20)
def validate(volume: volume_lib.Volume) -> None:
    """Asks the API server to validate a volume.

    The volume's fields are posted to ``/volumes/validate``; no validation
    happens on the client.

    Args:
        volume: The volume to validate.

    Raises:
        The exception deserialized from the response's ``detail`` field when
        the server answers with HTTP 400.
    """
    payload = payloads.VolumeValidateBody(
        name=volume.name,
        volume_type=volume.type,
        infra=volume.infra,
        resource_name=volume.resource_name,
        size=volume.size,
        config=volume.config,
        labels=volume.labels,
    )
    request_json = json.loads(payload.model_dump_json())
    response = server_common.make_authenticated_request('POST',
                                                        '/volumes/validate',
                                                        json=request_json)
    # Only a 400 response is turned into an exception here.
    if response.status_code != 400:
        return
    with ux_utils.print_exception_no_traceback():
        serialized_error = response.json().get('detail')
        raise exceptions.deserialize_exception(serialized_error)
|
|
113
|
+
|
|
114
|
+
|
|
80
115
|
@context.contextual
|
|
81
116
|
@usage_lib.entrypoint
|
|
82
117
|
@server_common.check_server_healthy_or_start
|
|
@@ -87,8 +122,10 @@ def ls() -> server_common.RequestId[List[Dict[str, Any]]]:
|
|
|
87
122
|
Returns:
|
|
88
123
|
The request ID of the list request.
|
|
89
124
|
"""
|
|
90
|
-
response =
|
|
91
|
-
|
|
125
|
+
response = server_common.make_authenticated_request(
|
|
126
|
+
'GET',
|
|
127
|
+
'/volumes',
|
|
128
|
+
)
|
|
92
129
|
return server_common.get_request_id(response)
|
|
93
130
|
|
|
94
131
|
|
|
@@ -106,7 +143,6 @@ def delete(names: List[str]) -> server_common.RequestId[None]:
|
|
|
106
143
|
The request ID of the delete request.
|
|
107
144
|
"""
|
|
108
145
|
body = payloads.VolumeDeleteBody(names=names)
|
|
109
|
-
response =
|
|
110
|
-
|
|
111
|
-
cookies=server_common.get_api_cookie_jar())
|
|
146
|
+
response = server_common.make_authenticated_request(
|
|
147
|
+
'POST', '/volumes/delete', json=json.loads(body.model_dump_json()))
|
|
112
148
|
return server_common.get_request_id(response)
|
sky/volumes/server/server.py
CHANGED
|
@@ -3,12 +3,13 @@
|
|
|
3
3
|
import fastapi
|
|
4
4
|
|
|
5
5
|
from sky import clouds
|
|
6
|
+
from sky import exceptions
|
|
6
7
|
from sky import sky_logging
|
|
7
8
|
from sky.server.requests import executor
|
|
8
9
|
from sky.server.requests import payloads
|
|
9
10
|
from sky.server.requests import requests as requests_lib
|
|
10
11
|
from sky.utils import registry
|
|
11
|
-
from sky.utils import volume
|
|
12
|
+
from sky.utils import volume as volume_utils
|
|
12
13
|
from sky.volumes.server import core
|
|
13
14
|
|
|
14
15
|
logger = sky_logging.init_logger(__name__)
|
|
@@ -46,6 +47,31 @@ async def volume_delete(request: fastapi.Request,
|
|
|
46
47
|
)
|
|
47
48
|
|
|
48
49
|
|
|
50
|
+
@router.post('/validate')
async def volume_validate(
        _: fastapi.Request,
        volume_validate_body: payloads.VolumeValidateBody) -> None:
    """Validates a volume described by the request body.

    Builds a Volume from the body's fields and runs its ``validate()``.

    Raises:
        fastapi.HTTPException: With status 400 and the serialized exception
            as ``detail`` if building or validating the volume raises.
    """
    # pylint: disable=import-outside-toplevel
    from sky.volumes import volume as volume_lib

    body = volume_validate_body
    try:
        candidate = volume_lib.Volume.from_yaml_config({
            'name': body.name,
            'type': body.volume_type,
            'infra': body.infra,
            'size': body.size,
            'labels': body.labels,
            'config': body.config,
            'resource_name': body.resource_name,
        })
        candidate.validate()
    except Exception as e:
        # Any failure is reported to the client as a 400 carrying the
        # serialized exception.
        serialized = exceptions.serialize_exception(e)
        raise fastapi.HTTPException(status_code=400, detail=serialized)
|
|
73
|
+
|
|
74
|
+
|
|
49
75
|
@router.post('/apply')
|
|
50
76
|
async def volume_apply(request: fastapi.Request,
|
|
51
77
|
volume_apply_body: payloads.VolumeApplyBody) -> None:
|
|
@@ -55,7 +81,7 @@ async def volume_apply(request: fastapi.Request,
|
|
|
55
81
|
volume_config = volume_apply_body.config
|
|
56
82
|
|
|
57
83
|
supported_volume_types = [
|
|
58
|
-
volume_type.value for volume_type in
|
|
84
|
+
volume_type.value for volume_type in volume_utils.VolumeType
|
|
59
85
|
]
|
|
60
86
|
if volume_type not in supported_volume_types:
|
|
61
87
|
raise fastapi.HTTPException(
|
|
@@ -64,24 +90,24 @@ async def volume_apply(request: fastapi.Request,
|
|
|
64
90
|
if cloud is None:
|
|
65
91
|
raise fastapi.HTTPException(status_code=400,
|
|
66
92
|
detail=f'Invalid cloud: {volume_cloud}')
|
|
67
|
-
if volume_type ==
|
|
93
|
+
if volume_type == volume_utils.VolumeType.PVC.value:
|
|
68
94
|
if not cloud.is_same_cloud(clouds.Kubernetes()):
|
|
69
95
|
raise fastapi.HTTPException(
|
|
70
96
|
status_code=400,
|
|
71
97
|
detail='PVC storage is only supported on Kubernetes')
|
|
72
98
|
supported_access_modes = [
|
|
73
|
-
access_mode.value for access_mode in
|
|
99
|
+
access_mode.value for access_mode in volume_utils.VolumeAccessMode
|
|
74
100
|
]
|
|
75
101
|
if volume_config is None:
|
|
76
102
|
volume_config = {}
|
|
77
103
|
access_mode = volume_config.get('access_mode')
|
|
78
104
|
if access_mode is None:
|
|
79
|
-
volume_config[
|
|
80
|
-
|
|
105
|
+
volume_config['access_mode'] = (
|
|
106
|
+
volume_utils.VolumeAccessMode.READ_WRITE_ONCE.value)
|
|
81
107
|
elif access_mode not in supported_access_modes:
|
|
82
108
|
raise fastapi.HTTPException(
|
|
83
109
|
status_code=400, detail=f'Invalid access mode: {access_mode}')
|
|
84
|
-
elif volume_type ==
|
|
110
|
+
elif volume_type == volume_utils.VolumeType.RUNPOD_NETWORK_VOLUME.value:
|
|
85
111
|
if not cloud.is_same_cloud(clouds.RunPod()):
|
|
86
112
|
raise fastapi.HTTPException(
|
|
87
113
|
status_code=400,
|
sky/volumes/volume.py
CHANGED
|
@@ -115,9 +115,6 @@ class Volume:
|
|
|
115
115
|
self.region = infra_info.region
|
|
116
116
|
self.zone = infra_info.zone
|
|
117
117
|
|
|
118
|
-
# Validate the volume config
|
|
119
|
-
self._validate_config()
|
|
120
|
-
|
|
121
118
|
def _adjust_config(self) -> None:
|
|
122
119
|
"""Adjust the volume config (e.g., parse size)."""
|
|
123
120
|
if self.size is None:
|
|
@@ -132,8 +129,28 @@ class Volume:
|
|
|
132
129
|
except ValueError as e:
|
|
133
130
|
raise ValueError(f'Invalid size {self.size}: {e}') from e
|
|
134
131
|
|
|
135
|
-
def
|
|
136
|
-
"""
|
|
132
|
+
def validate(self, skip_cloud_compatibility: bool = False) -> None:
    """Runs the volume's validation steps in a fixed order.

    Order: name, size, cloud compatibility (unless skipped), then the
    type-specific ``_validate_config_extra`` hook.

    Args:
        skip_cloud_compatibility: If True, the cloud compatibility step is
            not run.
    """
    steps = [self.validate_name, self.validate_size]
    if not skip_cloud_compatibility:
        steps.append(self.validate_cloud_compatibility)
    # Extra, type-specific validations always run last.
    steps.append(self._validate_config_extra)
    for run_step in steps:
        run_step()
|
|
140
|
+
|
|
141
|
+
def validate_name(self) -> None:
    """Validates that the volume name is set.

    Raises:
        ValueError: If ``self.name`` is None.
    """
    # Raise explicitly instead of using `assert`: assert statements are
    # removed when Python runs with -O, which would silently disable this
    # check.
    if self.name is None:
        raise ValueError('Volume name must be set')
|
|
144
|
+
|
|
145
|
+
def validate_size(self) -> None:
    """Validates that a size is given when no ``resource_name`` is set.

    Raises:
        ValueError: If both ``self.resource_name`` and ``self.size`` are
            falsy.
    """
    if self.resource_name or self.size:
        return
    raise ValueError('Size is required for new volumes. '
                     'Please specify the size in the YAML file or '
                     'use the --size flag.')
|
|
151
|
+
|
|
152
|
+
def validate_cloud_compatibility(self) -> None:
|
|
153
|
+
"""Validates that the specified cloud is compatible with volume type."""
|
|
137
154
|
cloud_obj_from_type = VOLUME_TYPE_TO_CLOUD.get(
|
|
138
155
|
volume_lib.VolumeType(self.type))
|
|
139
156
|
if self.cloud:
|
|
@@ -150,25 +167,16 @@ class Volume:
|
|
|
150
167
|
self.region, self.zone = cloud_obj.validate_region_zone(
|
|
151
168
|
self.region, self.zone)
|
|
152
169
|
|
|
153
|
-
# Name must be set by factory before validation.
|
|
154
|
-
assert self.name is not None
|
|
155
170
|
valid, err_msg = cloud_obj.is_volume_name_valid(self.name)
|
|
156
171
|
if not valid:
|
|
157
172
|
raise ValueError(f'Invalid volume name: {err_msg}')
|
|
158
173
|
|
|
159
|
-
if not self.resource_name and not self.size:
|
|
160
|
-
raise ValueError('Size is required for new volumes. '
|
|
161
|
-
'Please specify the size in the YAML file or '
|
|
162
|
-
'use the --size flag.')
|
|
163
174
|
if self.labels:
|
|
164
175
|
for key, value in self.labels.items():
|
|
165
176
|
valid, err_msg = cloud_obj.is_label_valid(key, value)
|
|
166
177
|
if not valid:
|
|
167
178
|
raise ValueError(f'{err_msg}')
|
|
168
179
|
|
|
169
|
-
# Extra, type-specific validations
|
|
170
|
-
self._validate_config_extra()
|
|
171
|
-
|
|
172
180
|
# Hook methods for subclasses
|
|
173
181
|
def _validate_config_extra(self) -> None:
|
|
174
182
|
"""Additional type-specific validation.
|