skypilot-nightly 1.0.0.dev20250210__py3-none-any.whl → 1.0.0.dev20250212__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +14 -18
- sky/execution.py +2 -2
- sky/jobs/constants.py +11 -3
- sky/optimizer.py +2 -4
- sky/provision/instance_setup.py +38 -5
- sky/provision/kubernetes/utils.py +36 -34
- sky/skylet/events.py +9 -0
- sky/skylet/skylet.py +2 -0
- sky/templates/kubernetes-ray.yml.j2 +3 -2
- sky/usage/constants.py +2 -1
- sky/usage/usage_lib.py +53 -11
- sky/utils/env_options.py +6 -0
- {skypilot_nightly-1.0.0.dev20250210.dist-info → skypilot_nightly-1.0.0.dev20250212.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250210.dist-info → skypilot_nightly-1.0.0.dev20250212.dist-info}/RECORD +19 -19
- {skypilot_nightly-1.0.0.dev20250210.dist-info → skypilot_nightly-1.0.0.dev20250212.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250210.dist-info → skypilot_nightly-1.0.0.dev20250212.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250210.dist-info → skypilot_nightly-1.0.0.dev20250212.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250210.dist-info → skypilot_nightly-1.0.0.dev20250212.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '1fe3fab0e7a3242f32039d55b456603350dc4196'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20250212'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/cli.py
CHANGED
@@ -623,7 +623,8 @@ def _launch_with_confirm(
|
|
623
623
|
click.confirm(prompt, default=True, abort=True, show_default=True)
|
624
624
|
|
625
625
|
if not confirm_shown:
|
626
|
-
click.secho(
|
626
|
+
click.secho('Running on cluster: ', fg='cyan', nl=False)
|
627
|
+
click.secho(cluster)
|
627
628
|
|
628
629
|
sky.launch(
|
629
630
|
dag,
|
@@ -722,7 +723,6 @@ def _pop_and_ignore_fields_in_override_params(
|
|
722
723
|
def _make_task_or_dag_from_entrypoint_with_overrides(
|
723
724
|
entrypoint: Tuple[str, ...],
|
724
725
|
*,
|
725
|
-
entrypoint_name: str = 'Task',
|
726
726
|
name: Optional[str] = None,
|
727
727
|
workdir: Optional[str] = None,
|
728
728
|
cloud: Optional[str] = None,
|
@@ -754,19 +754,15 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
754
754
|
entrypoint: Optional[str]
|
755
755
|
if is_yaml:
|
756
756
|
# Treat entrypoint as a yaml.
|
757
|
-
click.secho(
|
758
|
-
|
759
|
-
nl=False)
|
760
|
-
click.secho(entrypoint, bold=True)
|
757
|
+
click.secho('YAML to run: ', fg='cyan', nl=False)
|
758
|
+
click.secho(entrypoint)
|
761
759
|
else:
|
762
760
|
if not entrypoint:
|
763
761
|
entrypoint = None
|
764
762
|
else:
|
765
763
|
# Treat entrypoint as a bash command.
|
766
|
-
click.secho(
|
767
|
-
|
768
|
-
nl=False)
|
769
|
-
click.secho(entrypoint, bold=True)
|
764
|
+
click.secho('Command to run: ', fg='cyan', nl=False)
|
765
|
+
click.secho(entrypoint)
|
770
766
|
|
771
767
|
override_params = _parse_override_params(cloud=cloud,
|
772
768
|
region=region,
|
@@ -1333,7 +1329,8 @@ def exec(
|
|
1333
1329
|
'supports a single task only.')
|
1334
1330
|
task = task_or_dag
|
1335
1331
|
|
1336
|
-
click.secho(
|
1332
|
+
click.secho('Submitting job to cluster: ', fg='cyan', nl=False)
|
1333
|
+
click.secho(cluster)
|
1337
1334
|
sky.exec(task, backend=backend, cluster_name=cluster, detach_run=detach_run)
|
1338
1335
|
|
1339
1336
|
|
@@ -1982,7 +1979,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin
|
|
1982
1979
|
def queue(clusters: List[str], skip_finished: bool, all_users: bool):
|
1983
1980
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
1984
1981
|
"""Show the job queue for cluster(s)."""
|
1985
|
-
click.secho('Fetching and parsing job queue...', fg='
|
1982
|
+
click.secho('Fetching and parsing job queue...', fg='cyan')
|
1986
1983
|
if clusters:
|
1987
1984
|
clusters = _get_glob_clusters(clusters)
|
1988
1985
|
else:
|
@@ -3785,7 +3782,7 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool):
|
|
3785
3782
|
watch -n60 sky jobs queue
|
3786
3783
|
|
3787
3784
|
"""
|
3788
|
-
click.secho('Fetching managed
|
3785
|
+
click.secho('Fetching managed jobs...', fg='cyan')
|
3789
3786
|
with rich_utils.safe_status(
|
3790
3787
|
ux_utils.spinner_message('Checking managed jobs')):
|
3791
3788
|
_, msg = _get_managed_jobs(refresh=refresh,
|
@@ -3938,7 +3935,7 @@ def jobs_dashboard(port: Optional[int]):
|
|
3938
3935
|
# see if the controller is UP first, which is slow; (2) not have to run SSH
|
3939
3936
|
# port forwarding first (we'd just launch a local dashboard which would make
|
3940
3937
|
# REST API calls to the controller dashboard server).
|
3941
|
-
click.secho('Checking if jobs controller is up...', fg='
|
3938
|
+
click.secho('Checking if jobs controller is up...', fg='cyan')
|
3942
3939
|
hint = ('Dashboard is not available if jobs controller is not up. Run a '
|
3943
3940
|
'managed job first.')
|
3944
3941
|
backend_utils.is_controller_accessible(
|
@@ -4032,7 +4029,6 @@ def _generate_task_with_service(
|
|
4032
4029
|
disk_size=disk_size,
|
4033
4030
|
disk_tier=disk_tier,
|
4034
4031
|
ports=ports,
|
4035
|
-
entrypoint_name='Service',
|
4036
4032
|
)
|
4037
4033
|
if isinstance(task, sky.Dag):
|
4038
4034
|
raise click.UsageError(
|
@@ -4197,7 +4193,7 @@ def serve_up(
|
|
4197
4193
|
ports=ports,
|
4198
4194
|
not_supported_cmd='sky serve up',
|
4199
4195
|
)
|
4200
|
-
click.secho('Service
|
4196
|
+
click.secho('Service spec:', fg='cyan')
|
4201
4197
|
click.echo(task.service)
|
4202
4198
|
|
4203
4199
|
click.secho('Each replica will use the following resources (estimated):',
|
@@ -4315,7 +4311,7 @@ def serve_update(
|
|
4315
4311
|
ports=ports,
|
4316
4312
|
not_supported_cmd='sky serve update',
|
4317
4313
|
)
|
4318
|
-
click.secho('Service
|
4314
|
+
click.secho('Service spec:', fg='cyan')
|
4319
4315
|
click.echo(task.service)
|
4320
4316
|
|
4321
4317
|
click.secho('New replica will use the following resources (estimated):',
|
@@ -4767,7 +4763,7 @@ def benchmark_launch(
|
|
4767
4763
|
'Please provide a YAML file.')
|
4768
4764
|
assert config is not None, (is_yaml, config)
|
4769
4765
|
|
4770
|
-
click.secho('Benchmarking a task from YAML
|
4766
|
+
click.secho('Benchmarking a task from YAML: ', fg='cyan', nl=False)
|
4771
4767
|
click.secho(entrypoint, bold=True)
|
4772
4768
|
|
4773
4769
|
candidates = _get_candidate_configs(entrypoint)
|
sky/execution.py
CHANGED
@@ -259,8 +259,8 @@ def _execute(
|
|
259
259
|
bold = colorama.Style.BRIGHT
|
260
260
|
reset = colorama.Style.RESET_ALL
|
261
261
|
logger.info(
|
262
|
-
f'{yellow}Launching
|
263
|
-
f'automatically recover from preemptions.
|
262
|
+
f'{yellow}Launching a spot job that does not '
|
263
|
+
f'automatically recover from preemptions. To '
|
264
264
|
'get automatic recovery, use managed job instead: '
|
265
265
|
f'{reset}{bold}sky jobs launch{reset} {yellow}or{reset} '
|
266
266
|
f'{bold}sky.jobs.launch(){reset}.')
|
sky/jobs/constants.py
CHANGED
@@ -16,10 +16,18 @@ JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
|
|
16
16
|
# We use 50 GB disk size to reduce the cost.
|
17
17
|
CONTROLLER_RESOURCES = {'cpus': '4+', 'memory': '8x', 'disk_size': 50}
|
18
18
|
|
19
|
+
# TODO(zhwu): This is no longer accurate, after #4592, which increases the
|
20
|
+
# length of user hash appended to the cluster name from 4 to 8 chars. This makes
|
21
|
+
# the cluster name on GCP being wrapped twice. However, we cannot directly
|
22
|
+
# update this constant, because the job cluster cleanup and many other logic
|
23
|
+
# in managed jobs depends on this constant, i.e., updating this constant will
|
24
|
+
# break backward compatibility and existing jobs.
|
25
|
+
#
|
19
26
|
# Max length of the cluster name for GCP is 35, the user hash to be attached is
|
20
|
-
# 4+1 chars, and we assume the maximum length of the job id is
|
21
|
-
# length of the cluster name prefix is 25
|
22
|
-
# long and truncated twice during the
|
27
|
+
# 4(now 8)+1 chars, and we assume the maximum length of the job id is
|
28
|
+
# 4(now 8)+1, so the max length of the cluster name prefix is 25(should be 21
|
29
|
+
# now) to avoid the cluster name being too long and truncated twice during the
|
30
|
+
# cluster creation.
|
23
31
|
JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
|
24
32
|
|
25
33
|
# The version of the lib files that jobs/utils use. Whenever there is an API
|
sky/optimizer.py
CHANGED
@@ -884,10 +884,8 @@ class Optimizer:
|
|
884
884
|
# Add a new line for better readability, when there are multiple
|
885
885
|
# tasks.
|
886
886
|
logger.info('')
|
887
|
-
logger.info(
|
888
|
-
|
889
|
-
f'({task.num_nodes} node{plural}):'
|
890
|
-
f'{colorama.Style.RESET_ALL}')
|
887
|
+
logger.info(f'Considered resources {task_str}'
|
888
|
+
f'({task.num_nodes} node{plural}):')
|
891
889
|
|
892
890
|
# Only print 1 row per cloud.
|
893
891
|
# The following code is to generate the table
|
sky/provision/instance_setup.py
CHANGED
@@ -15,9 +15,12 @@ from sky.provision import docker_utils
|
|
15
15
|
from sky.provision import logging as provision_logging
|
16
16
|
from sky.provision import metadata_utils
|
17
17
|
from sky.skylet import constants
|
18
|
+
from sky.usage import constants as usage_constants
|
19
|
+
from sky.usage import usage_lib
|
18
20
|
from sky.utils import accelerator_registry
|
19
21
|
from sky.utils import command_runner
|
20
22
|
from sky.utils import common_utils
|
23
|
+
from sky.utils import env_options
|
21
24
|
from sky.utils import subprocess_utils
|
22
25
|
from sky.utils import timeline
|
23
26
|
from sky.utils import ux_utils
|
@@ -67,6 +70,30 @@ MAYBE_SKYLET_RESTART_CMD = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
|
|
67
70
|
'sky.skylet.attempt_skylet;')
|
68
71
|
|
69
72
|
|
73
|
+
def _set_usage_run_id_cmd() -> str:
|
74
|
+
"""Gets the command to set the usage run id.
|
75
|
+
|
76
|
+
The command saves the current usage run id to the file, so that the skylet
|
77
|
+
can use it to report the heartbeat.
|
78
|
+
|
79
|
+
We use a function instead of a constant so that the usage run id is the
|
80
|
+
latest one when the function is called.
|
81
|
+
"""
|
82
|
+
return (
|
83
|
+
f'cat {usage_constants.USAGE_RUN_ID_FILE} || '
|
84
|
+
# The run id is retrieved locally for the current run, so that the
|
85
|
+
# remote cluster will be set with the same run id as the initial
|
86
|
+
# launch operation.
|
87
|
+
f'echo "{usage_lib.messages.usage.run_id}" > '
|
88
|
+
f'{usage_constants.USAGE_RUN_ID_FILE}')
|
89
|
+
|
90
|
+
|
91
|
+
def _set_skypilot_env_var_cmd() -> str:
|
92
|
+
"""Sets the skypilot environment variables on the remote machine."""
|
93
|
+
env_vars = env_options.Options.all_options()
|
94
|
+
return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
|
95
|
+
|
96
|
+
|
70
97
|
def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True):
|
71
98
|
"""Decorator that retries the function if it fails.
|
72
99
|
|
@@ -450,11 +477,17 @@ def start_skylet_on_head_node(cluster_name: str,
|
|
450
477
|
logger.info(f'Running command on head node: {MAYBE_SKYLET_RESTART_CMD}')
|
451
478
|
# We need to source bashrc for skylet to make sure the autostop event can
|
452
479
|
# access the path to the cloud CLIs.
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
480
|
+
set_usage_run_id_cmd = _set_usage_run_id_cmd()
|
481
|
+
# Set the skypilot environment variables, including the usage type, debug
|
482
|
+
# info, and other options.
|
483
|
+
set_skypilot_env_var_cmd = _set_skypilot_env_var_cmd()
|
484
|
+
returncode, stdout, stderr = head_runner.run(
|
485
|
+
f'{set_usage_run_id_cmd}; {set_skypilot_env_var_cmd}; '
|
486
|
+
f'{MAYBE_SKYLET_RESTART_CMD}',
|
487
|
+
stream_logs=False,
|
488
|
+
require_outputs=True,
|
489
|
+
log_path=log_path_abs,
|
490
|
+
source_bashrc=True)
|
458
491
|
if returncode:
|
459
492
|
raise RuntimeError('Failed to start skylet on the head node '
|
460
493
|
f'(exit code {returncode}). Error: '
|
sky/provision/kubernetes/utils.py
CHANGED
@@ -2178,52 +2178,54 @@ def get_kubernetes_node_info(
|
|
2178
2178
|
|
2179
2179
|
lf, _ = detect_gpu_label_formatter(context)
|
2180
2180
|
if not lf:
|
2181
|
-
|
2181
|
+
label_keys = []
|
2182
2182
|
else:
|
2183
2183
|
label_keys = lf.get_label_keys()
|
2184
2184
|
|
2185
2185
|
node_info_dict: Dict[str, KubernetesNodeInfo] = {}
|
2186
2186
|
|
2187
|
-
for
|
2188
|
-
|
2189
|
-
|
2187
|
+
for node in nodes:
|
2188
|
+
accelerator_name = None
|
2189
|
+
# Determine the accelerator name from the node labels and pick the
|
2190
|
+
# first one found. We assume that the node has only one accelerator type
|
2191
|
+
# (e.g., either GPU or TPU).
|
2192
|
+
for label_key in label_keys:
|
2190
2193
|
if lf is not None and label_key in node.metadata.labels:
|
2191
2194
|
accelerator_name = lf.get_accelerator_from_label_value(
|
2192
2195
|
node.metadata.labels.get(label_key))
|
2193
|
-
|
2194
|
-
accelerator_name = None
|
2196
|
+
break
|
2195
2197
|
|
2196
|
-
|
2197
|
-
|
2198
|
+
allocated_qty = 0
|
2199
|
+
accelerator_count = get_node_accelerator_count(node.status.allocatable)
|
2198
2200
|
|
2199
|
-
|
2200
|
-
|
2201
|
+
if pods is None:
|
2202
|
+
accelerators_available = -1
|
2201
2203
|
|
2202
|
-
|
2203
|
-
|
2204
|
-
|
2205
|
-
|
2206
|
-
|
2207
|
-
|
2208
|
-
|
2209
|
-
|
2210
|
-
|
2211
|
-
|
2212
|
-
|
2213
|
-
|
2214
|
-
|
2215
|
-
|
2216
|
-
|
2217
|
-
|
2218
|
-
|
2219
|
-
|
2220
|
-
|
2204
|
+
else:
|
2205
|
+
for pod in pods:
|
2206
|
+
# Get all the pods running on the node
|
2207
|
+
if (pod.spec.node_name == node.metadata.name and
|
2208
|
+
pod.status.phase in ['Running', 'Pending']):
|
2209
|
+
# Iterate over all the containers in the pod and sum the
|
2210
|
+
# GPU requests
|
2211
|
+
for container in pod.spec.containers:
|
2212
|
+
if container.resources.requests:
|
2213
|
+
allocated_qty += get_node_accelerator_count(
|
2214
|
+
container.resources.requests)
|
2215
|
+
|
2216
|
+
accelerators_available = accelerator_count - allocated_qty
|
2217
|
+
|
2218
|
+
# Exclude multi-host TPUs from being processed.
|
2219
|
+
# TODO(Doyoung): Remove the logic when adding support for
|
2220
|
+
# multi-host TPUs.
|
2221
|
+
if is_multi_host_tpu(node.metadata.labels):
|
2222
|
+
continue
|
2221
2223
|
|
2222
|
-
|
2223
|
-
|
2224
|
-
|
2225
|
-
|
2226
|
-
|
2224
|
+
node_info_dict[node.metadata.name] = KubernetesNodeInfo(
|
2225
|
+
name=node.metadata.name,
|
2226
|
+
accelerator_type=accelerator_name,
|
2227
|
+
total={'accelerator_count': int(accelerator_count)},
|
2228
|
+
free={'accelerators_available': int(accelerators_available)})
|
2227
2229
|
|
2228
2230
|
return node_info_dict
|
2229
2231
|
|
sky/skylet/events.py
CHANGED
@@ -20,6 +20,7 @@ from sky.serve import serve_utils
|
|
20
20
|
from sky.skylet import autostop_lib
|
21
21
|
from sky.skylet import constants
|
22
22
|
from sky.skylet import job_lib
|
23
|
+
from sky.usage import usage_lib
|
23
24
|
from sky.utils import cluster_yaml_utils
|
24
25
|
from sky.utils import common_utils
|
25
26
|
from sky.utils import ux_utils
|
@@ -90,6 +91,14 @@ class ServiceUpdateEvent(SkyletEvent):
|
|
90
91
|
serve_utils.update_service_status()
|
91
92
|
|
92
93
|
|
94
|
+
class UsageHeartbeatReportEvent(SkyletEvent):
|
95
|
+
"""Skylet event for reporting usage."""
|
96
|
+
EVENT_INTERVAL_SECONDS = 600
|
97
|
+
|
98
|
+
def _run(self):
|
99
|
+
usage_lib.send_heartbeat(interval_seconds=self.EVENT_INTERVAL_SECONDS)
|
100
|
+
|
101
|
+
|
93
102
|
class AutostopEvent(SkyletEvent):
|
94
103
|
"""Skylet event for autostop.
|
95
104
|
|
sky/skylet/skylet.py
CHANGED
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -373,15 +373,16 @@ available_node_types:
|
|
373
373
|
done;
|
374
374
|
if [ ! -z "$INSTALL_FIRST" ]; then
|
375
375
|
echo "Installing core packages: $INSTALL_FIRST";
|
376
|
-
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $INSTALL_FIRST;
|
376
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $INSTALL_FIRST;
|
377
377
|
fi;
|
378
378
|
# SSH and other packages are not necessary, so we disable set -e
|
379
379
|
set +e
|
380
380
|
|
381
381
|
if [ ! -z "$MISSING_PACKAGES" ]; then
|
382
382
|
echo "Installing missing packages: $MISSING_PACKAGES";
|
383
|
-
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES;
|
383
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $MISSING_PACKAGES;
|
384
384
|
fi;
|
385
|
+
|
385
386
|
$(prefix_cmd) mkdir -p /var/run/sshd;
|
386
387
|
$(prefix_cmd) sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config;
|
387
388
|
$(prefix_cmd) sed "s@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g" -i /etc/pam.d/sshd;
|
sky/usage/constants.py
CHANGED
@@ -3,7 +3,6 @@
|
|
3
3
|
LOG_URL = 'http://usage.skypilot.co:9090/loki/api/v1/push' # pylint: disable=line-too-long
|
4
4
|
|
5
5
|
USAGE_MESSAGE_SCHEMA_VERSION = 1
|
6
|
-
|
7
6
|
PRIVACY_POLICY_PATH = '~/.sky/privacy_policy'
|
8
7
|
|
9
8
|
USAGE_POLICY_MESSAGE = (
|
@@ -15,3 +14,5 @@ USAGE_POLICY_MESSAGE = (
|
|
15
14
|
|
16
15
|
USAGE_MESSAGE_REDACT_KEYS = ['setup', 'run', 'envs']
|
17
16
|
USAGE_MESSAGE_REDACT_TYPES = {str, dict}
|
17
|
+
|
18
|
+
USAGE_RUN_ID_FILE = '~/.sky/usage_run_id'
|
sky/usage/usage_lib.py
CHANGED
@@ -44,6 +44,7 @@ def _get_current_timestamp_ns() -> int:
|
|
44
44
|
class MessageType(enum.Enum):
|
45
45
|
"""Types for messages to be sent to Loki."""
|
46
46
|
USAGE = 'usage'
|
47
|
+
HEARTBEAT = 'heartbeat'
|
47
48
|
# TODO(zhwu): Add more types, e.g., cluster_lifecycle.
|
48
49
|
|
49
50
|
|
@@ -67,8 +68,9 @@ class MessageToReport:
|
|
67
68
|
properties = self.__dict__.copy()
|
68
69
|
return {k: v for k, v in properties.items() if not k.startswith('_')}
|
69
70
|
|
70
|
-
def __repr__(self):
|
71
|
-
|
71
|
+
def __repr__(self) -> str:
|
72
|
+
d = self.get_properties()
|
73
|
+
return json.dumps(d)
|
72
74
|
|
73
75
|
|
74
76
|
class UsageMessageToReport(MessageToReport):
|
@@ -160,10 +162,6 @@ class UsageMessageToReport(MessageToReport):
|
|
160
162
|
self.exception: Optional[str] = None # entrypoint_context
|
161
163
|
self.stacktrace: Optional[str] = None # entrypoint_context
|
162
164
|
|
163
|
-
def __repr__(self) -> str:
|
164
|
-
d = self.get_properties()
|
165
|
-
return json.dumps(d)
|
166
|
-
|
167
165
|
def update_entrypoint(self, msg: str):
|
168
166
|
self.entrypoint = msg
|
169
167
|
|
@@ -275,16 +273,43 @@ class UsageMessageToReport(MessageToReport):
|
|
275
273
|
name_or_fn)
|
276
274
|
|
277
275
|
|
276
|
+
class HeartbeatMessageToReport(MessageToReport):
|
277
|
+
"""Message to be reported to Grafana Loki for heartbeat on a cluster."""
|
278
|
+
|
279
|
+
def __init__(self, interval_seconds: int = 600):
|
280
|
+
super().__init__(constants.USAGE_MESSAGE_SCHEMA_VERSION)
|
281
|
+
# This interval_seconds is mainly for recording the heartbeat interval
|
282
|
+
# in the heartbeat message, so that the collector can use it.
|
283
|
+
self.interval_seconds = interval_seconds
|
284
|
+
|
285
|
+
def get_properties(self) -> Dict[str, Any]:
|
286
|
+
properties = super().get_properties()
|
287
|
+
# The run id is set by the skylet, which will always be the same for
|
288
|
+
# the entire lifetime of the run.
|
289
|
+
with open(os.path.expanduser(constants.USAGE_RUN_ID_FILE),
|
290
|
+
'r',
|
291
|
+
encoding='utf-8') as f:
|
292
|
+
properties['run_id'] = f.read().strip()
|
293
|
+
return properties
|
294
|
+
|
295
|
+
|
278
296
|
class MessageCollection:
|
279
297
|
"""A collection of messages."""
|
280
298
|
|
281
299
|
def __init__(self):
|
282
|
-
self._messages = {
|
300
|
+
self._messages = {
|
301
|
+
MessageType.USAGE: UsageMessageToReport(),
|
302
|
+
MessageType.HEARTBEAT: HeartbeatMessageToReport()
|
303
|
+
}
|
283
304
|
|
284
305
|
@property
|
285
|
-
def usage(self):
|
306
|
+
def usage(self) -> UsageMessageToReport:
|
286
307
|
return self._messages[MessageType.USAGE]
|
287
308
|
|
309
|
+
@property
|
310
|
+
def heartbeat(self) -> HeartbeatMessageToReport:
|
311
|
+
return self._messages[MessageType.HEARTBEAT]
|
312
|
+
|
288
313
|
def reset(self, message_type: MessageType):
|
289
314
|
self._messages[message_type] = self._messages[message_type].__class__()
|
290
315
|
|
@@ -308,13 +333,25 @@ def _send_to_loki(message_type: MessageType):
|
|
308
333
|
|
309
334
|
message = messages[message_type]
|
310
335
|
|
336
|
+
# In case the message has no start time, set it to the current time.
|
337
|
+
message.start()
|
311
338
|
message.send_time = _get_current_timestamp_ns()
|
312
|
-
|
339
|
+
# Use send time instead of start time to avoid the message being dropped
|
340
|
+
# by Loki, due to the timestamp being too old. We still have the start time
|
341
|
+
# in the message for dashboard.
|
342
|
+
log_timestamp = message.send_time
|
313
343
|
|
314
344
|
environment = 'prod'
|
315
345
|
if env_options.Options.IS_DEVELOPER.get():
|
316
346
|
environment = 'dev'
|
317
|
-
prom_labels = {
|
347
|
+
prom_labels = {
|
348
|
+
'type': message_type.value,
|
349
|
+
'environment': environment,
|
350
|
+
'schema_version': message.schema_version,
|
351
|
+
}
|
352
|
+
if message_type == MessageType.USAGE:
|
353
|
+
prom_labels['new_cluster'] = (message.original_cluster_status != 'UP'
|
354
|
+
and message.final_cluster_status == 'UP')
|
318
355
|
|
319
356
|
headers = {'Content-type': 'application/json'}
|
320
357
|
payload = {
|
@@ -392,7 +429,7 @@ def prepare_json_from_yaml_config(
|
|
392
429
|
def _send_local_messages():
|
393
430
|
"""Send all messages not been uploaded to Loki."""
|
394
431
|
for msg_type, message in messages.items():
|
395
|
-
if not message.message_sent:
|
432
|
+
if not message.message_sent and msg_type != MessageType.HEARTBEAT:
|
396
433
|
# Avoid the fallback entrypoint to send the message again
|
397
434
|
# in normal case.
|
398
435
|
try:
|
@@ -402,6 +439,11 @@ def _send_local_messages():
|
|
402
439
|
f'exception caught: {type(e)}({e})')
|
403
440
|
|
404
441
|
|
442
|
+
def send_heartbeat(interval_seconds: int = 600):
|
443
|
+
messages.heartbeat.interval_seconds = interval_seconds
|
444
|
+
_send_to_loki(MessageType.HEARTBEAT)
|
445
|
+
|
446
|
+
|
405
447
|
@contextlib.contextmanager
|
406
448
|
def entrypoint_context(name: str, fallback: bool = False):
|
407
449
|
"""Context manager for entrypoint.
|
sky/utils/env_options.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
"""Global environment options for sky."""
|
2
2
|
import enum
|
3
3
|
import os
|
4
|
+
from typing import Dict
|
4
5
|
|
5
6
|
|
6
7
|
class Options(enum.Enum):
|
@@ -35,3 +36,8 @@ class Options(enum.Enum):
|
|
35
36
|
def env_key(self) -> str:
|
36
37
|
"""The environment variable key name."""
|
37
38
|
return self.value[0]
|
39
|
+
|
40
|
+
@classmethod
|
41
|
+
def all_options(cls) -> Dict[str, bool]:
|
42
|
+
"""Returns all options as a dictionary."""
|
43
|
+
return {option.env_key: option.get() for option in list(Options)}
|
{skypilot_nightly-1.0.0.dev20250210.dist-info → skypilot_nightly-1.0.0.dev20250212.dist-info}/RECORD
RENAMED
@@ -1,15 +1,15 @@
|
|
1
|
-
sky/__init__.py,sha256=
|
1
|
+
sky/__init__.py,sha256=GRvhpT8lUvIyIPLyC2cqv9RiI6hZ0_iMY56YNDpjPbs,5560
|
2
2
|
sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
|
3
3
|
sky/authentication.py,sha256=MNc9uHnvQ1EsEl8SsrYcYCGbxcnDbR6gaRCXVNd5RZE,22338
|
4
4
|
sky/check.py,sha256=xzLlxUkBCrzpOho8lw65EvKLPl_b9lA2nteF5MSYbDQ,10885
|
5
|
-
sky/cli.py,sha256=
|
5
|
+
sky/cli.py,sha256=_Q-XlsLN73e8BJilClajL7VOG8vINVJ_xRjENOpJdDA,213928
|
6
6
|
sky/cloud_stores.py,sha256=PcLT57_8SZy7o6paAluElfBynaLkbaOq3l-8dNg1AVM,23672
|
7
7
|
sky/core.py,sha256=fE1rn4Ku94S0XmWTO5-6t6eT6aaJImNczRqEnTe8v7Q,38742
|
8
8
|
sky/dag.py,sha256=f3sJlkH4bE6Uuz3ozNtsMhcBpRx7KmC9Sa4seDKt4hU,3104
|
9
9
|
sky/exceptions.py,sha256=SEhRubPlk-crkflPC5P_Z085iLrSd3UScYwc790QwYw,9378
|
10
|
-
sky/execution.py,sha256=
|
10
|
+
sky/execution.py,sha256=vNUE9Z8hCSQeil7h3kdote2r6nkbrGXSqqmK6ru594Q,28453
|
11
11
|
sky/global_user_state.py,sha256=cTwltMCDIIBaapuGgARxFwpDJDCiKKyVW-PP_qtWuCA,30241
|
12
|
-
sky/optimizer.py,sha256=
|
12
|
+
sky/optimizer.py,sha256=H5cpKELOQmnFpox0QXMB4P7jGhJxzXog4Ht_TYJaGuA,59758
|
13
13
|
sky/resources.py,sha256=D3jteQxKOUydoNm7VDl90p02dwP3RpbO3gqNcl4dpOI,70327
|
14
14
|
sky/sky_logging.py,sha256=7Zk9mL1TDxFkGsy3INMBKYlqsbognVGSMzAsHZdZlhw,5891
|
15
15
|
sky/skypilot_config.py,sha256=FN93hSG-heQCHBnemlIK2TwrJngKbpx4vMXNUzPIzV8,9087
|
@@ -101,7 +101,7 @@ sky/data/mounting_utils.py,sha256=tJHBPEDP1Wg_r3oSGBwFhMDLnPCMPSFRz26O0QkDd0Y,14
|
|
101
101
|
sky/data/storage.py,sha256=CWVKnHhdzXw1biPbRqYizkyVexL_OCELuJCqtd4hit4,204094
|
102
102
|
sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
|
103
103
|
sky/jobs/__init__.py,sha256=ObZcz3lL1ip8JcmR6gbfZ4RMMfXJJdsnuU2zLQUb8jY,1546
|
104
|
-
sky/jobs/constants.py,sha256=
|
104
|
+
sky/jobs/constants.py,sha256=9kIdpwWNI9zWKQO39LTg9spUMGl5Iqx4ByIjRlic7Hw,1893
|
105
105
|
sky/jobs/controller.py,sha256=cX8kGplwa-0Te_ihUfzzOr-TRs_Fw6UdFPm6mrtSE0c,28548
|
106
106
|
sky/jobs/core.py,sha256=b9aJB90AxUdhoasSxsWBoD-mQY1MmC05FbPbtyFMzHI,19154
|
107
107
|
sky/jobs/recovery_strategy.py,sha256=49H1ca5N4bIJ3W4iqurxzSvJE0dIihPt2XnstboxUm4,26370
|
@@ -115,7 +115,7 @@ sky/provision/__init__.py,sha256=hb_z69_7-FH1I8aDpFKNj2x_a8spzceWcovklutNgP8,637
|
|
115
115
|
sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
|
116
116
|
sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
|
117
117
|
sky/provision/docker_utils.py,sha256=ENm0LkyrYWic3Ikyacho8X5uDMvGsbkZQsb6kNH1DuI,19629
|
118
|
-
sky/provision/instance_setup.py,sha256=
|
118
|
+
sky/provision/instance_setup.py,sha256=YBFOwZQLBzpUjYoVQcX0KItej1rCBRWM23Dw9lg_q24,24386
|
119
119
|
sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
|
120
120
|
sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
|
121
121
|
sky/provision/provisioner.py,sha256=ZOgFOO0NB4QZVPwd4qikRqi615Bq67n0Vcl3cTDVxNE,29153
|
@@ -153,7 +153,7 @@ sky/provision/kubernetes/config.py,sha256=bXwOGdSAnXCkDreew0KsSUqSv3ZrptNeevqat7
|
|
153
153
|
sky/provision/kubernetes/instance.py,sha256=AQikdRgNklpeMgiEd4w2Hh7kGssVABsy0aCh9xsKi5Y,50313
|
154
154
|
sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
|
155
155
|
sky/provision/kubernetes/network_utils.py,sha256=52BZY_5ynCH6IXlivKObYyAHDgQCJyAJIjmM7J4MpFo,11393
|
156
|
-
sky/provision/kubernetes/utils.py,sha256=
|
156
|
+
sky/provision/kubernetes/utils.py,sha256=swOe6ozgSoucDtoJCExs0HLLWYuoi5HkIGMMSp7fEzc,109962
|
157
157
|
sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
|
158
158
|
sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
|
159
159
|
sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
|
@@ -216,11 +216,11 @@ sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,
|
|
216
216
|
sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,4478
|
217
217
|
sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
|
218
218
|
sky/skylet/constants.py,sha256=EUSW4yH59eqBDLMIdmQWIYd3nAJBFoUeo5v9MGiginI,16057
|
219
|
-
sky/skylet/events.py,sha256=
|
219
|
+
sky/skylet/events.py,sha256=__7bt6Z8q2W1vwTQv4yug-oAXDwSf8zBeRxb8HFM36U,12792
|
220
220
|
sky/skylet/job_lib.py,sha256=Rk-C069cusJIRXsks8xqCb016JSt7GlpU7LrpX0qFJk,42785
|
221
221
|
sky/skylet/log_lib.py,sha256=oFEBd85vDYFrIyyZKekH30yc4rRYILC0F0o-COQ64oE,20445
|
222
222
|
sky/skylet/log_lib.pyi,sha256=rRk4eUX0RHGs1QL9CXsJq6RE7FqqxZlfuPJOLXTvg7I,4453
|
223
|
-
sky/skylet/skylet.py,sha256=
|
223
|
+
sky/skylet/skylet.py,sha256=mWmqCvxSlfdVU_L8NL6P52jmCt3smd8K0HdyNBfMPeI,1234
|
224
224
|
sky/skylet/subprocess_daemon.py,sha256=gcL-_Hea7-SrBUyZfAbo40RBFbaeuBmPCW0dm4YYkPo,3537
|
225
225
|
sky/skylet/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
226
226
|
sky/skylet/providers/command_runner.py,sha256=DdBKP0QX325_N3zAVYwnmXmfbfXNqkzWQZpF9DSR7Go,16259
|
@@ -250,7 +250,7 @@ sky/templates/jobs-controller.yaml.j2,sha256=FfagMkhXZdUWR6HtJHJ3JEZzJy4eov5CQZH
|
|
250
250
|
sky/templates/kubernetes-ingress.yml.j2,sha256=73iDklVDWBMbItg0IexCa6_ClXPJOxw7PWz3leku4nE,1340
|
251
251
|
sky/templates/kubernetes-loadbalancer.yml.j2,sha256=IxrNYM366N01bbkJEbZ_UPYxUP8wyVEbRNFHRsBuLsw,626
|
252
252
|
sky/templates/kubernetes-port-forward-proxy-command.sh,sha256=iw7mypHszg6Ggq9MbyiYMFOkSlXaQZulaxqC5IWYGCc,3381
|
253
|
-
sky/templates/kubernetes-ray.yml.j2,sha256=
|
253
|
+
sky/templates/kubernetes-ray.yml.j2,sha256=x3Eq1ejG577E6eAZtJvpTlzXRCW5beMhqApV3J8BEZY,29019
|
254
254
|
sky/templates/kubernetes-ssh-jump.yml.j2,sha256=k5W5sOIMppU7dDkJMwPlqsUcb92y7L5_TVG3hkgMy8M,2747
|
255
255
|
sky/templates/lambda-ray.yml.j2,sha256=HyvO_tX2vxwSsc4IFVSqGuIbjLMk0bevP9bcxb8ZQII,4498
|
256
256
|
sky/templates/local-ray.yml.j2,sha256=FNHeyHF6nW9nU9QLIZceUWfvrFTTcO51KqhTnYCEFaA,1185
|
@@ -262,8 +262,8 @@ sky/templates/sky-serve-controller.yaml.j2,sha256=W4i1-OGRU2WDvauLC4EDXcYrNxj7mz
|
|
262
262
|
sky/templates/vast-ray.yml.j2,sha256=KaZLBJfI6FzAVRVq0NNM0_SN0RQUrDIehnJJ_LnvwnY,2990
|
263
263
|
sky/templates/vsphere-ray.yml.j2,sha256=cOQ-qdpxGA2FHajMMhTJI-SmlYzdPterX4Gsiq-nkb0,3587
|
264
264
|
sky/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
265
|
-
sky/usage/constants.py,sha256=
|
266
|
-
sky/usage/usage_lib.py,sha256=
|
265
|
+
sky/usage/constants.py,sha256=k7PQ-QP1p3tDgnzvy7QoxJjuTXWDUyVkbtPcIEvDsYM,632
|
266
|
+
sky/usage/usage_lib.py,sha256=jpRt-24WVxYyd-XJz3_lSHboUKmWy8x8lRvvO-JO68g,20026
|
267
267
|
sky/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
268
268
|
sky/utils/accelerator_registry.py,sha256=cpPS9_MahQPt0ev4qPT-qyGpe12YD78UNj_gAvt720Q,4052
|
269
269
|
sky/utils/admin_policy_utils.py,sha256=_Vt_jTTYCXmMdryj0vrrumFPewa93qHnzUqBDXjAhRU,5981
|
@@ -275,7 +275,7 @@ sky/utils/control_master_utils.py,sha256=90hnxiAUP20gbJ9e3MERh7rb04ZO_I3LsljNjR2
|
|
275
275
|
sky/utils/controller_utils.py,sha256=SUrhK46ouBH2rm7azfFLIWr-T9-voYAdiXl2z5fG4Qw,45948
|
276
276
|
sky/utils/dag_utils.py,sha256=l_0O3RUfe9OdQ9mtbhdlHpJVD4VAF_HQ3A75dgsYIjM,6099
|
277
277
|
sky/utils/db_utils.py,sha256=K2-OHPg0FeHCarevMdWe0IWzm6wWumViEeYeJuGoFUE,3747
|
278
|
-
sky/utils/env_options.py,sha256=
|
278
|
+
sky/utils/env_options.py,sha256=aaD6GoYK0LaZIqjOEZ-R7eccQuiRriW3EuLWtOI5En8,1578
|
279
279
|
sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM,1384
|
280
280
|
sky/utils/log_utils.py,sha256=AjkgSrk0GVOUbnnCEC2f4lsf2HOIXkZETCxR0BJw2-U,14152
|
281
281
|
sky/utils/resources_utils.py,sha256=06Kx6AfbBdwBYGmIYFEY_qm6OBc2a5esZMPvIX7gCvc,7787
|
@@ -298,9 +298,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
|
|
298
298
|
sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
|
299
299
|
sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
|
300
300
|
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
|
301
|
-
skypilot_nightly-1.0.0.
|
302
|
-
skypilot_nightly-1.0.0.
|
303
|
-
skypilot_nightly-1.0.0.
|
304
|
-
skypilot_nightly-1.0.0.
|
305
|
-
skypilot_nightly-1.0.0.
|
306
|
-
skypilot_nightly-1.0.0.
|
301
|
+
skypilot_nightly-1.0.0.dev20250212.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
302
|
+
skypilot_nightly-1.0.0.dev20250212.dist-info/METADATA,sha256=rkJIHWHxQtacqsQPb5SZ7XHCGiXMvMBzXNPupXqi4sU,21397
|
303
|
+
skypilot_nightly-1.0.0.dev20250212.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
304
|
+
skypilot_nightly-1.0.0.dev20250212.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
305
|
+
skypilot_nightly-1.0.0.dev20250212.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
306
|
+
skypilot_nightly-1.0.0.dev20250212.dist-info/RECORD,,
|
File without changes
|
{skypilot_nightly-1.0.0.dev20250210.dist-info → skypilot_nightly-1.0.0.dev20250212.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|
File without changes
|