skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250925__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend.py +10 -0
- sky/backends/backend_utils.py +194 -69
- sky/backends/cloud_vm_ray_backend.py +37 -13
- sky/backends/local_docker_backend.py +9 -0
- sky/client/cli/command.py +104 -53
- sky/client/sdk.py +13 -5
- sky/client/sdk_async.py +4 -2
- sky/clouds/kubernetes.py +2 -1
- sky/clouds/runpod.py +20 -7
- sky/core.py +7 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → bn-NHt5qTzeTN2PefXuDA}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-16ba1d7187d2e3b1.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +19 -10
- sky/execution.py +4 -2
- sky/global_user_state.py +217 -36
- sky/jobs/client/sdk.py +10 -1
- sky/jobs/controller.py +7 -7
- sky/jobs/server/core.py +3 -3
- sky/jobs/server/server.py +15 -11
- sky/jobs/utils.py +1 -1
- sky/logs/agent.py +30 -3
- sky/logs/aws.py +9 -19
- sky/provision/__init__.py +2 -1
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/cudo/instance.py +2 -2
- sky/provision/do/instance.py +2 -2
- sky/provision/docker_utils.py +41 -19
- sky/provision/fluidstack/instance.py +2 -2
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +134 -8
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -2
- sky/provision/primeintellect/instance.py +2 -2
- sky/provision/provisioner.py +1 -0
- sky/provision/runpod/instance.py +2 -2
- sky/provision/scp/instance.py +2 -2
- sky/provision/seeweb/instance.py +2 -1
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +6 -5
- sky/schemas/api/responses.py +2 -1
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +45 -19
- sky/serve/replica_managers.py +12 -5
- sky/serve/serve_utils.py +5 -7
- sky/serve/server/core.py +9 -6
- sky/serve/server/impl.py +78 -25
- sky/serve/server/server.py +4 -5
- sky/serve/service_spec.py +33 -0
- sky/server/constants.py +1 -1
- sky/server/daemons.py +2 -3
- sky/server/requests/executor.py +56 -6
- sky/server/requests/payloads.py +31 -8
- sky/server/requests/preconditions.py +2 -3
- sky/server/rest.py +2 -0
- sky/server/server.py +28 -19
- sky/server/stream_utils.py +34 -12
- sky/setup_files/dependencies.py +4 -1
- sky/setup_files/setup.py +44 -44
- sky/templates/kubernetes-ray.yml.j2 +16 -15
- sky/usage/usage_lib.py +3 -0
- sky/utils/cli_utils/status_utils.py +4 -5
- sky/utils/context.py +104 -29
- sky/utils/controller_utils.py +7 -6
- sky/utils/kubernetes/create_cluster.sh +13 -28
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/kubernetes_deploy_utils.py +170 -37
- sky/utils/kubernetes_enums.py +5 -0
- sky/utils/ux_utils.py +35 -1
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +44 -8
- sky/volumes/server/server.py +33 -7
- sky/volumes/volume.py +22 -14
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/METADATA +40 -35
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/RECORD +107 -107
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
- /sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → bn-NHt5qTzeTN2PefXuDA}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/top_level.txt +0 -0
sky/logs/aws.py
CHANGED
|
@@ -5,7 +5,6 @@ from typing import Any, Dict, Optional
|
|
|
5
5
|
import pydantic
|
|
6
6
|
|
|
7
7
|
from sky.logs.agent import FluentbitAgent
|
|
8
|
-
from sky.skylet import constants
|
|
9
8
|
from sky.utils import resources_utils
|
|
10
9
|
from sky.utils import yaml_utils
|
|
11
10
|
|
|
@@ -176,6 +175,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
|
|
|
176
175
|
Returns:
|
|
177
176
|
The Fluent Bit configuration as a YAML string.
|
|
178
177
|
"""
|
|
178
|
+
cfg_dict = yaml_utils.read_yaml_str(
|
|
179
|
+
super().fluentbit_config(cluster_name))
|
|
179
180
|
display_name = cluster_name.display_name
|
|
180
181
|
unique_name = cluster_name.name_on_cloud
|
|
181
182
|
# Build tags for the log stream
|
|
@@ -197,24 +198,13 @@ class CloudwatchLoggingAgent(FluentbitAgent):
|
|
|
197
198
|
'value': value
|
|
198
199
|
})
|
|
199
200
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
# job creates a new log file and we must be responsive
|
|
208
|
-
# for this: the VM might be autodown within a minute
|
|
209
|
-
# right after the job completion.
|
|
210
|
-
'refresh_interval': 1,
|
|
211
|
-
'processors': {
|
|
212
|
-
'logs': log_processors,
|
|
213
|
-
}
|
|
214
|
-
}],
|
|
215
|
-
'outputs': [self.fluentbit_output_config(cluster_name)],
|
|
216
|
-
}
|
|
217
|
-
}
|
|
201
|
+
# Add log processors to config
|
|
202
|
+
processors_config = cfg_dict['pipeline']['inputs'][0].get(
|
|
203
|
+
'processors', {})
|
|
204
|
+
processors_logs_config = processors_config.get('logs', [])
|
|
205
|
+
processors_logs_config.extend(log_processors)
|
|
206
|
+
processors_config['logs'] = processors_logs_config
|
|
207
|
+
cfg_dict['pipeline']['inputs'][0]['processors'] = processors_config
|
|
218
208
|
|
|
219
209
|
return yaml_utils.dump_yaml_str(cfg_dict)
|
|
220
210
|
|
sky/provision/__init__.py
CHANGED
|
@@ -168,7 +168,8 @@ def map_all_volumes_usedby(
|
|
|
168
168
|
|
|
169
169
|
|
|
170
170
|
@_route_to_cloud_impl
|
|
171
|
-
def run_instances(provider_name: str, region: str,
|
|
171
|
+
def run_instances(provider_name: str, region: str, cluster_name: str,
|
|
172
|
+
cluster_name_on_cloud: str,
|
|
172
173
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
173
174
|
"""Start instances with bootstrapped configuration."""
|
|
174
175
|
raise NotImplementedError
|
sky/provision/aws/instance.py
CHANGED
|
@@ -311,9 +311,10 @@ def _get_head_instance_id(instances: List) -> Optional[str]:
|
|
|
311
311
|
return head_instance_id
|
|
312
312
|
|
|
313
313
|
|
|
314
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
314
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
315
315
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
316
316
|
"""See sky/provision/__init__.py"""
|
|
317
|
+
del cluster_name # unused
|
|
317
318
|
ec2 = _default_ec2_resource(region)
|
|
318
319
|
# NOTE: We set max_attempts=0 for fast failing when the resource is not
|
|
319
320
|
# available (although the doc says it will only retry for network
|
sky/provision/azure/instance.py
CHANGED
|
@@ -362,9 +362,10 @@ def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
|
|
|
362
362
|
return instances
|
|
363
363
|
|
|
364
364
|
|
|
365
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
365
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
366
366
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
367
367
|
"""See sky/provision/__init__.py"""
|
|
368
|
+
del cluster_name # unused
|
|
368
369
|
# TODO(zhwu): This function is too long. We should refactor it.
|
|
369
370
|
provider_config = config.provider_config
|
|
370
371
|
resource_group = provider_config['resource_group']
|
sky/provision/cudo/instance.py
CHANGED
|
@@ -40,10 +40,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
40
40
|
return head_instance_id
|
|
41
41
|
|
|
42
42
|
|
|
43
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
43
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
44
44
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
45
45
|
"""Runs instances for the given cluster."""
|
|
46
|
-
|
|
46
|
+
del cluster_name # unused
|
|
47
47
|
pending_status = ['pend', 'init', 'prol', 'boot']
|
|
48
48
|
|
|
49
49
|
while True:
|
sky/provision/do/instance.py
CHANGED
|
@@ -26,10 +26,10 @@ def _get_head_instance(
|
|
|
26
26
|
return None
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
29
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
30
30
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
31
31
|
"""Runs instances for the given cluster."""
|
|
32
|
-
|
|
32
|
+
del cluster_name # unused
|
|
33
33
|
pending_status = ['new']
|
|
34
34
|
newly_started_instances = utils.filter_instances(cluster_name_on_cloud,
|
|
35
35
|
pending_status + ['off'])
|
sky/provision/docker_utils.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import dataclasses
|
|
4
4
|
import shlex
|
|
5
5
|
import time
|
|
6
|
-
from typing import Any, Dict, List
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
7
|
|
|
8
8
|
from sky import sky_logging
|
|
9
9
|
from sky.skylet import constants
|
|
@@ -192,12 +192,16 @@ class DockerInitializer:
|
|
|
192
192
|
self.docker_cmd = 'podman' if use_podman else 'docker'
|
|
193
193
|
self.log_path = log_path
|
|
194
194
|
|
|
195
|
-
def _run(
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
195
|
+
def _run(
|
|
196
|
+
self,
|
|
197
|
+
cmd,
|
|
198
|
+
run_env='host',
|
|
199
|
+
wait_for_docker_daemon: bool = False,
|
|
200
|
+
separate_stderr: bool = False,
|
|
201
|
+
log_err_when_fail: bool = True,
|
|
202
|
+
flock_name: Optional[str] = None,
|
|
203
|
+
flock_args: Optional[str] = None,
|
|
204
|
+
) -> str:
|
|
201
205
|
|
|
202
206
|
if run_env == 'docker':
|
|
203
207
|
cmd = self._docker_expand_user(cmd, any_char=True)
|
|
@@ -206,8 +210,13 @@ class DockerInitializer:
|
|
|
206
210
|
# an error: `the input device is not a TTY`, and it works without
|
|
207
211
|
# `-it` flag.
|
|
208
212
|
# TODO(zhwu): ray use the `-it` flag, we need to check why.
|
|
209
|
-
cmd = (f'{self.docker_cmd} exec {self.container_name}
|
|
210
|
-
f' {shlex.quote(cmd)} ')
|
|
213
|
+
cmd = (f'{self.docker_cmd} exec -u 0 {self.container_name}'
|
|
214
|
+
f' /bin/bash -c {shlex.quote(cmd)} ')
|
|
215
|
+
|
|
216
|
+
if flock_name is not None:
|
|
217
|
+
flock_args = flock_args or ''
|
|
218
|
+
cmd = (f'flock {flock_args} /tmp/{flock_name} '
|
|
219
|
+
f'-c {shlex.quote(cmd)}')
|
|
211
220
|
|
|
212
221
|
logger.debug(f'+ {cmd}')
|
|
213
222
|
start = time.time()
|
|
@@ -259,7 +268,10 @@ class DockerInitializer:
|
|
|
259
268
|
if self._check_container_exited():
|
|
260
269
|
self.initialized = True
|
|
261
270
|
self._run(f'{self.docker_cmd} start {self.container_name}')
|
|
262
|
-
self._run('sudo service ssh start',
|
|
271
|
+
self._run('sudo service ssh start',
|
|
272
|
+
run_env='docker',
|
|
273
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
274
|
+
flock_args='-s -w 1')
|
|
263
275
|
return self._run('whoami', run_env='docker')
|
|
264
276
|
|
|
265
277
|
# SkyPilot: Docker login if user specified a private docker registry.
|
|
@@ -358,7 +370,9 @@ class DockerInitializer:
|
|
|
358
370
|
self._auto_configure_shm(user_docker_run_options)),
|
|
359
371
|
self.docker_cmd,
|
|
360
372
|
)
|
|
361
|
-
self._run(f'{remove_container_cmd}
|
|
373
|
+
self._run(f'{remove_container_cmd} && {start_command}',
|
|
374
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
375
|
+
flock_args='-x -w 10')
|
|
362
376
|
|
|
363
377
|
# SkyPilot: Setup Commands.
|
|
364
378
|
# TODO(zhwu): the following setups should be aligned with the kubernetes
|
|
@@ -376,14 +390,18 @@ class DockerInitializer:
|
|
|
376
390
|
'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;',
|
|
377
391
|
run_env='docker')
|
|
378
392
|
# Install dependencies.
|
|
379
|
-
|
|
380
|
-
'
|
|
393
|
+
cmd = (
|
|
394
|
+
'bash -lc \''
|
|
395
|
+
'exec 200>/var/tmp/sky_apt.lock; '
|
|
396
|
+
'flock -x -w 120 200 || exit 1; '
|
|
397
|
+
'export DEBIAN_FRONTEND=noninteractive; '
|
|
398
|
+
'apt-get -yq update && '
|
|
381
399
|
# Our mount script will install gcsfuse without fuse package.
|
|
382
400
|
# We need to install fuse package first to enable storage mount.
|
|
383
401
|
# The dpkg option is to suppress the prompt for fuse installation.
|
|
384
|
-
'
|
|
385
|
-
'rsync curl wget patch openssh-server python3-pip fuse
|
|
386
|
-
|
|
402
|
+
'apt-get -o DPkg::Options::=--force-confnew install -y '
|
|
403
|
+
'rsync curl wget patch openssh-server python3-pip fuse\'')
|
|
404
|
+
self._run(cmd, run_env='docker')
|
|
387
405
|
|
|
388
406
|
# Copy local authorized_keys to docker container.
|
|
389
407
|
# Stop and disable jupyter service. This is to avoid port conflict on
|
|
@@ -459,9 +477,13 @@ class DockerInitializer:
|
|
|
459
477
|
user_pos = string.find('~')
|
|
460
478
|
if user_pos > -1:
|
|
461
479
|
if self.home_dir is None:
|
|
462
|
-
cmd = (f'{self.docker_cmd} exec {self.container_name}
|
|
463
|
-
'printenv HOME')
|
|
464
|
-
self.home_dir = self._run(
|
|
480
|
+
cmd = (f'{self.docker_cmd} exec {self.container_name}'
|
|
481
|
+
' printenv HOME')
|
|
482
|
+
self.home_dir = self._run(
|
|
483
|
+
cmd,
|
|
484
|
+
separate_stderr=True,
|
|
485
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
486
|
+
flock_args='-s -w 1')
|
|
465
487
|
# Check for unexpected newline in home directory, which can be
|
|
466
488
|
# a common issue when the output is mixed with stderr.
|
|
467
489
|
assert '\n' not in self.home_dir, (
|
|
@@ -78,10 +78,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
78
78
|
return head_instance_id
|
|
79
79
|
|
|
80
80
|
|
|
81
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
81
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
82
82
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
83
83
|
"""Runs instances for the given cluster."""
|
|
84
|
-
|
|
84
|
+
del cluster_name # unused
|
|
85
85
|
pending_status = ['pending', 'provisioning']
|
|
86
86
|
while True:
|
|
87
87
|
instances = _filter_instances(cluster_name_on_cloud, pending_status)
|
sky/provision/gcp/instance.py
CHANGED
|
@@ -360,9 +360,10 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
360
360
|
created_instance_ids=created_instance_ids)
|
|
361
361
|
|
|
362
362
|
|
|
363
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
363
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
364
364
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
365
365
|
"""See sky/provision/__init__.py"""
|
|
366
|
+
del cluster_name # unused
|
|
366
367
|
try:
|
|
367
368
|
return _run_instances(region, cluster_name_on_cloud, config)
|
|
368
369
|
except gcp.http_error_exception() as e:
|
|
@@ -64,8 +64,9 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
64
64
|
return next(iter(instances.keys()))
|
|
65
65
|
|
|
66
66
|
|
|
67
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
67
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
68
68
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
69
|
+
del cluster_name # unused
|
|
69
70
|
logger.info(f'Starting run_instances with region={region}, '
|
|
70
71
|
f'cluster={cluster_name_on_cloud}')
|
|
71
72
|
logger.debug(f'Config: {config}')
|
sky/provision/instance_setup.py
CHANGED
|
@@ -84,7 +84,7 @@ def _set_usage_run_id_cmd() -> str:
|
|
|
84
84
|
latest one when the function is called.
|
|
85
85
|
"""
|
|
86
86
|
return (
|
|
87
|
-
f'cat {usage_constants.USAGE_RUN_ID_FILE} || '
|
|
87
|
+
f'cat {usage_constants.USAGE_RUN_ID_FILE} 2> /dev/null || '
|
|
88
88
|
# The run id is retrieved locally for the current run, so that the
|
|
89
89
|
# remote cluster will be set with the same run id as the initial
|
|
90
90
|
# launch operation.
|
|
@@ -24,6 +24,7 @@ from sky.utils import command_runner
|
|
|
24
24
|
from sky.utils import common_utils
|
|
25
25
|
from sky.utils import config_utils
|
|
26
26
|
from sky.utils import kubernetes_enums
|
|
27
|
+
from sky.utils import rich_utils
|
|
27
28
|
from sky.utils import status_lib
|
|
28
29
|
from sky.utils import subprocess_utils
|
|
29
30
|
from sky.utils import timeline
|
|
@@ -302,8 +303,89 @@ def _raise_command_running_error(message: str, command: str, pod_name: str,
|
|
|
302
303
|
f'code {rc}: {command!r}\nOutput: {stdout}.')
|
|
303
304
|
|
|
304
305
|
|
|
306
|
+
def _detect_cluster_event_reason_occurred(namespace, context, search_start,
|
|
307
|
+
reason) -> bool:
|
|
308
|
+
|
|
309
|
+
def _convert_to_utc(timestamp):
|
|
310
|
+
if timestamp.tzinfo is None:
|
|
311
|
+
return timestamp.replace(tzinfo=datetime.timezone.utc)
|
|
312
|
+
return timestamp.astimezone(datetime.timezone.utc)
|
|
313
|
+
|
|
314
|
+
def _get_event_timestamp(event):
|
|
315
|
+
if event.last_timestamp:
|
|
316
|
+
return event.last_timestamp
|
|
317
|
+
elif event.metadata.creation_timestamp:
|
|
318
|
+
return event.metadata.creation_timestamp
|
|
319
|
+
return None
|
|
320
|
+
|
|
321
|
+
events = kubernetes.core_api(context).list_namespaced_event(
|
|
322
|
+
namespace=namespace, field_selector=f'reason={reason}')
|
|
323
|
+
for event in events.items:
|
|
324
|
+
ts = _get_event_timestamp(event)
|
|
325
|
+
if ts and _convert_to_utc(ts) > search_start:
|
|
326
|
+
return True
|
|
327
|
+
return False
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def _cluster_had_autoscale_event(namespace, context, search_start) -> bool:
|
|
331
|
+
"""Detects whether the cluster had an autoscaling event after a
|
|
332
|
+
specified datetime. This only works when using cluster-autoscaler.
|
|
333
|
+
|
|
334
|
+
Args:
|
|
335
|
+
namespace: kubernetes namespace
|
|
336
|
+
context: kubernetes context
|
|
337
|
+
search_start (datetime.datetime): filter for events that occurred
|
|
338
|
+
after search_start
|
|
339
|
+
|
|
340
|
+
Returns:
|
|
341
|
+
A boolean whether the cluster has an autoscaling event or not.
|
|
342
|
+
"""
|
|
343
|
+
assert namespace is not None
|
|
344
|
+
|
|
345
|
+
try:
|
|
346
|
+
return _detect_cluster_event_reason_occurred(namespace, context,
|
|
347
|
+
search_start,
|
|
348
|
+
'TriggeredScaleUp')
|
|
349
|
+
except Exception as e: # pylint: disable=broad-except
|
|
350
|
+
logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
|
|
351
|
+
return False
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def _cluster_maybe_autoscaling(namespace, context, search_start) -> bool:
|
|
355
|
+
"""Detects whether a kubernetes cluster may have an autoscaling event.
|
|
356
|
+
|
|
357
|
+
This is not a definitive detection. FailedScheduling is an
|
|
358
|
+
event that can occur when not enough resources are present in the cluster,
|
|
359
|
+
which is a trigger for cluster autoscaling. However, FailedScheduling may
|
|
360
|
+
have occurred due to other reasons (cluster itself is abnormal).
|
|
361
|
+
|
|
362
|
+
Hence, this should only be used for autoscalers that don't emit the
|
|
363
|
+
TriggeredScaleUp event, e.g.: Karpenter.
|
|
364
|
+
|
|
365
|
+
Args:
|
|
366
|
+
namespace: kubernetes namespace
|
|
367
|
+
context: kubernetes context
|
|
368
|
+
search_start (datetime.datetime): filter for events that occurred
|
|
369
|
+
after search_start
|
|
370
|
+
|
|
371
|
+
Returns:
|
|
372
|
+
A boolean whether the cluster has an autoscaling event or not.
|
|
373
|
+
"""
|
|
374
|
+
assert namespace is not None
|
|
375
|
+
|
|
376
|
+
try:
|
|
377
|
+
return _detect_cluster_event_reason_occurred(namespace, context,
|
|
378
|
+
search_start,
|
|
379
|
+
'FailedScheduling')
|
|
380
|
+
except Exception as e: # pylint: disable=broad-except
|
|
381
|
+
logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
|
|
382
|
+
return False
|
|
383
|
+
|
|
384
|
+
|
|
305
385
|
@timeline.event
|
|
306
|
-
def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int
|
|
386
|
+
def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
|
|
387
|
+
cluster_name: str,
|
|
388
|
+
create_pods_start: datetime.datetime):
|
|
307
389
|
"""Wait for all pods to be scheduled.
|
|
308
390
|
|
|
309
391
|
Wait for all pods including jump pod to be scheduled, and if it
|
|
@@ -312,6 +394,9 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
312
394
|
allocated and we can exit.
|
|
313
395
|
|
|
314
396
|
If timeout is set to a negative value, this method will wait indefinitely.
|
|
397
|
+
|
|
398
|
+
Will update the spinner message to indicate autoscaling if autoscaling
|
|
399
|
+
is happening.
|
|
315
400
|
"""
|
|
316
401
|
# Create a set of pod names we're waiting for
|
|
317
402
|
if not new_nodes:
|
|
@@ -319,6 +404,18 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
319
404
|
expected_pod_names = {node.metadata.name for node in new_nodes}
|
|
320
405
|
start_time = time.time()
|
|
321
406
|
|
|
407
|
+
# Variables for autoscaler detection
|
|
408
|
+
autoscaler_type = skypilot_config.get_effective_region_config(
|
|
409
|
+
cloud='kubernetes',
|
|
410
|
+
region=context,
|
|
411
|
+
keys=('autoscaler',),
|
|
412
|
+
default_value=None)
|
|
413
|
+
autoscaler_is_set = autoscaler_type is not None
|
|
414
|
+
use_heuristic_detection = (autoscaler_is_set and
|
|
415
|
+
not kubernetes_enums.KubernetesAutoscalerType(
|
|
416
|
+
autoscaler_type).emits_autoscale_event())
|
|
417
|
+
is_autoscaling = False
|
|
418
|
+
|
|
322
419
|
def _evaluate_timeout() -> bool:
|
|
323
420
|
# If timeout is negative, retry indefinitely.
|
|
324
421
|
if timeout < 0:
|
|
@@ -328,12 +425,13 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
328
425
|
while _evaluate_timeout():
|
|
329
426
|
# Get all pods in a single API call using the cluster name label
|
|
330
427
|
# which all pods in new_nodes should share
|
|
331
|
-
|
|
428
|
+
cluster_name_on_cloud = new_nodes[0].metadata.labels[
|
|
332
429
|
k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
|
|
333
430
|
pods = kubernetes.core_api(context).list_namespaced_pod(
|
|
334
431
|
namespace,
|
|
335
432
|
label_selector=
|
|
336
|
-
f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={
|
|
433
|
+
f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
|
|
434
|
+
).items
|
|
337
435
|
|
|
338
436
|
# Get the set of found pod names and check if we have all expected pods
|
|
339
437
|
found_pod_names = {pod.metadata.name for pod in pods}
|
|
@@ -357,6 +455,26 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
357
455
|
|
|
358
456
|
if all_scheduled:
|
|
359
457
|
return
|
|
458
|
+
|
|
459
|
+
# Check if cluster is autoscaling and update spinner message.
|
|
460
|
+
# Minor optimization to not query k8s api after autoscaling
|
|
461
|
+
# event was detected. This is useful because there isn't any
|
|
462
|
+
# autoscaling complete event.
|
|
463
|
+
if autoscaler_is_set and not is_autoscaling:
|
|
464
|
+
if use_heuristic_detection:
|
|
465
|
+
is_autoscaling = _cluster_maybe_autoscaling(
|
|
466
|
+
namespace, context, create_pods_start)
|
|
467
|
+
msg = 'Kubernetes cluster may be scaling up'
|
|
468
|
+
else:
|
|
469
|
+
is_autoscaling = _cluster_had_autoscale_event(
|
|
470
|
+
namespace, context, create_pods_start)
|
|
471
|
+
msg = 'Kubernetes cluster is autoscaling'
|
|
472
|
+
|
|
473
|
+
if is_autoscaling:
|
|
474
|
+
rich_utils.force_update_status(
|
|
475
|
+
ux_utils.spinner_message(f'Launching ({msg})',
|
|
476
|
+
cluster_name=cluster_name))
|
|
477
|
+
|
|
360
478
|
time.sleep(1)
|
|
361
479
|
|
|
362
480
|
# Handle pod scheduling errors
|
|
@@ -761,13 +879,14 @@ def _wait_for_deployment_pod(context,
|
|
|
761
879
|
|
|
762
880
|
|
|
763
881
|
@timeline.event
|
|
764
|
-
def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
882
|
+
def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
765
883
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
766
884
|
"""Create pods based on the config."""
|
|
767
885
|
provider_config = config.provider_config
|
|
768
886
|
namespace = kubernetes_utils.get_namespace_from_config(provider_config)
|
|
769
887
|
context = kubernetes_utils.get_context_from_config(provider_config)
|
|
770
888
|
pod_spec = copy.deepcopy(config.node_config)
|
|
889
|
+
create_pods_start = datetime.datetime.now(datetime.timezone.utc)
|
|
771
890
|
|
|
772
891
|
to_create_deployment = 'deployment_spec' in pod_spec
|
|
773
892
|
if to_create_deployment:
|
|
@@ -1047,7 +1166,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
1047
1166
|
|
|
1048
1167
|
# Wait until the pods are scheduled and surface cause for error
|
|
1049
1168
|
# if there is one
|
|
1050
|
-
_wait_for_pods_to_schedule(namespace, context, pods, provision_timeout
|
|
1169
|
+
_wait_for_pods_to_schedule(namespace, context, pods, provision_timeout,
|
|
1170
|
+
cluster_name, create_pods_start)
|
|
1171
|
+
# Reset spinner message here because it might have hinted autoscaling
|
|
1172
|
+
# while waiting for pods to schedule.
|
|
1173
|
+
rich_utils.force_update_status(
|
|
1174
|
+
ux_utils.spinner_message('Launching', cluster_name=cluster_name))
|
|
1051
1175
|
# Wait until the pods and their containers are up and running, and
|
|
1052
1176
|
# fail early if there is an error
|
|
1053
1177
|
logger.debug(f'run_instances: waiting for pods to be running (pulling '
|
|
@@ -1068,11 +1192,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
1068
1192
|
)
|
|
1069
1193
|
|
|
1070
1194
|
|
|
1071
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
1195
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
1072
1196
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
1073
1197
|
"""Runs instances for the given cluster."""
|
|
1074
1198
|
try:
|
|
1075
|
-
return _create_pods(region, cluster_name_on_cloud, config)
|
|
1199
|
+
return _create_pods(region, cluster_name, cluster_name_on_cloud, config)
|
|
1076
1200
|
except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
|
|
1077
1201
|
e_msg = common_utils.format_exception(e).replace('\n', ' ')
|
|
1078
1202
|
logger.warning('run_instances: Error occurred when creating pods: '
|
|
@@ -1238,6 +1362,7 @@ def get_cluster_info(
|
|
|
1238
1362
|
|
|
1239
1363
|
running_pods = kubernetes_utils.filter_pods(
|
|
1240
1364
|
namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
|
|
1365
|
+
logger.debug(f'Running pods: {list(running_pods.keys())}')
|
|
1241
1366
|
|
|
1242
1367
|
pods: Dict[str, List[common.InstanceInfo]] = {}
|
|
1243
1368
|
head_pod_name = None
|
|
@@ -1276,7 +1401,8 @@ def get_cluster_info(
|
|
|
1276
1401
|
assert head_spec is not None, pod
|
|
1277
1402
|
cpu_request = head_spec.containers[0].resources.requests['cpu']
|
|
1278
1403
|
|
|
1279
|
-
assert cpu_request is not None, 'cpu_request should not be None'
|
|
1404
|
+
assert cpu_request is not None, ('cpu_request should not be None, check '
|
|
1405
|
+
'the Pod status')
|
|
1280
1406
|
|
|
1281
1407
|
ssh_user = 'sky'
|
|
1282
1408
|
# Use pattern matching to extract SSH user, handling MOTD contamination.
|
|
@@ -68,9 +68,10 @@ def _get_private_ip(instance_info: Dict[str, Any], single_node: bool) -> str:
|
|
|
68
68
|
return private_ip
|
|
69
69
|
|
|
70
70
|
|
|
71
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
71
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
72
72
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
73
73
|
"""Runs instances for the given cluster"""
|
|
74
|
+
del cluster_name # unused
|
|
74
75
|
lambda_client = _get_lambda_client()
|
|
75
76
|
pending_status = ['booting']
|
|
76
77
|
while True:
|
sky/provision/nebius/instance.py
CHANGED
|
@@ -65,9 +65,10 @@ def _wait_until_no_pending(region: str, cluster_name_on_cloud: str) -> None:
|
|
|
65
65
|
f' to be ready.')
|
|
66
66
|
|
|
67
67
|
|
|
68
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
68
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
69
69
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
70
70
|
"""Runs instances for the given cluster."""
|
|
71
|
+
del cluster_name # unused
|
|
71
72
|
_wait_until_no_pending(region, cluster_name_on_cloud)
|
|
72
73
|
running_instances = _filter_instances(region, cluster_name_on_cloud,
|
|
73
74
|
['RUNNING'])
|
sky/provision/oci/instance.py
CHANGED
|
@@ -65,9 +65,10 @@ def query_instances(
|
|
|
65
65
|
|
|
66
66
|
|
|
67
67
|
@query_utils.debug_enabled(logger)
|
|
68
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
68
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
69
69
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
70
70
|
"""Start instances with bootstrapped configuration."""
|
|
71
|
+
del cluster_name # unused
|
|
71
72
|
tags = dict(sorted(copy.deepcopy(config.tags).items()))
|
|
72
73
|
|
|
73
74
|
start_time = round(time.time() * 1000)
|
|
@@ -48,10 +48,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
48
48
|
return head_instance_id
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
51
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
52
52
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
53
53
|
"""Runs instances for the given cluster."""
|
|
54
|
-
|
|
54
|
+
del cluster_name # unused
|
|
55
55
|
pending_status = [
|
|
56
56
|
'starting', 'restarting', 'upgrading', 'provisioning', 'stopping'
|
|
57
57
|
]
|
|
@@ -65,10 +65,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
65
65
|
# Helper is available as utils.parse_ssh_connection.
|
|
66
66
|
|
|
67
67
|
|
|
68
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
68
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
69
69
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
70
70
|
"""Runs instances for the given cluster."""
|
|
71
|
-
|
|
71
|
+
del cluster_name # unused
|
|
72
72
|
pending_status = [
|
|
73
73
|
'PROVISIONING',
|
|
74
74
|
'PENDING',
|
sky/provision/provisioner.py
CHANGED
sky/provision/runpod/instance.py
CHANGED
|
@@ -44,10 +44,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
44
44
|
return head_instance_id
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
47
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
48
48
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
49
49
|
"""Runs instances for the given cluster."""
|
|
50
|
-
|
|
50
|
+
del cluster_name # unused
|
|
51
51
|
pending_status = ['CREATED', 'RESTARTING']
|
|
52
52
|
|
|
53
53
|
while True:
|
sky/provision/scp/instance.py
CHANGED
|
@@ -13,9 +13,9 @@ from sky.utils import status_lib
|
|
|
13
13
|
logger = logging.getLogger(__name__)
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
16
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
17
17
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
18
|
-
|
|
18
|
+
del cluster_name # unused
|
|
19
19
|
zone_id = config.node_config['zone_id']
|
|
20
20
|
running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
|
|
21
21
|
head_instance_id = _get_head_instance_id(running_instances)
|
sky/provision/seeweb/instance.py
CHANGED
|
@@ -502,9 +502,10 @@ class SeewebNodeProvider:
|
|
|
502
502
|
# =============================================================================
|
|
503
503
|
|
|
504
504
|
|
|
505
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
505
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
506
506
|
config: ProvisionConfig) -> ProvisionRecord:
|
|
507
507
|
"""Run instances for Seeweb cluster."""
|
|
508
|
+
del cluster_name # unused
|
|
508
509
|
provider = SeewebNodeProvider(config, cluster_name_on_cloud)
|
|
509
510
|
provider.run_instances(config.node_config, config.count)
|
|
510
511
|
|
sky/provision/vast/instance.py
CHANGED
|
@@ -44,9 +44,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
44
44
|
return None
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
47
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
48
48
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
49
49
|
"""Runs instances for the given cluster."""
|
|
50
|
+
del cluster_name # unused
|
|
50
51
|
pending_status = ['CREATED', 'RESTARTING']
|
|
51
52
|
|
|
52
53
|
created_instance_ids = []
|