skypilot-nightly 1.0.0.dev20250210__py3-none-any.whl → 1.0.0.dev20250212__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '54fe787d1fb31687cc78eb307db59d54c2d79076'
+_SKYPILOT_COMMIT_SHA = '1fe3fab0e7a3242f32039d55b456603350dc4196'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250210'
+__version__ = '1.0.0.dev20250212'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/cli.py CHANGED
@@ -623,7 +623,8 @@ def _launch_with_confirm(
         click.confirm(prompt, default=True, abort=True, show_default=True)
 
     if not confirm_shown:
-        click.secho(f'Running task on cluster {cluster}...', fg='yellow')
+        click.secho('Running on cluster: ', fg='cyan', nl=False)
+        click.secho(cluster)
 
     sky.launch(
         dag,
@@ -722,7 +723,6 @@ def _pop_and_ignore_fields_in_override_params(
 def _make_task_or_dag_from_entrypoint_with_overrides(
     entrypoint: Tuple[str, ...],
     *,
-    entrypoint_name: str = 'Task',
     name: Optional[str] = None,
     workdir: Optional[str] = None,
     cloud: Optional[str] = None,
@@ -754,19 +754,15 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
     entrypoint: Optional[str]
     if is_yaml:
         # Treat entrypoint as a yaml.
-        click.secho(f'{entrypoint_name} from YAML spec: ',
-                    fg='yellow',
-                    nl=False)
-        click.secho(entrypoint, bold=True)
+        click.secho('YAML to run: ', fg='cyan', nl=False)
+        click.secho(entrypoint)
     else:
         if not entrypoint:
             entrypoint = None
         else:
             # Treat entrypoint as a bash command.
-            click.secho(f'{entrypoint_name} from command: ',
-                        fg='yellow',
-                        nl=False)
-            click.secho(entrypoint, bold=True)
+            click.secho('Command to run: ', fg='cyan', nl=False)
+            click.secho(entrypoint)
 
     override_params = _parse_override_params(cloud=cloud,
                                               region=region,
@@ -1333,7 +1329,8 @@ def exec(
                              'supports a single task only.')
         task = task_or_dag
 
-    click.secho(f'Executing task on cluster {cluster}...', fg='yellow')
+    click.secho('Submitting job to cluster: ', fg='cyan', nl=False)
+    click.secho(cluster)
     sky.exec(task, backend=backend, cluster_name=cluster, detach_run=detach_run)
 
 
@@ -1982,7 +1979,7 @@ def cost_report(all: bool):  # pylint: disable=redefined-builtin
 def queue(clusters: List[str], skip_finished: bool, all_users: bool):
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Show the job queue for cluster(s)."""
-    click.secho('Fetching and parsing job queue...', fg='yellow')
+    click.secho('Fetching and parsing job queue...', fg='cyan')
     if clusters:
         clusters = _get_glob_clusters(clusters)
     else:
@@ -3785,7 +3782,7 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool):
       watch -n60 sky jobs queue
 
     """
-    click.secho('Fetching managed job statuses...', fg='yellow')
+    click.secho('Fetching managed jobs...', fg='cyan')
    with rich_utils.safe_status(
            ux_utils.spinner_message('Checking managed jobs')):
        _, msg = _get_managed_jobs(refresh=refresh,
@@ -3938,7 +3935,7 @@ def jobs_dashboard(port: Optional[int]):
     # see if the controller is UP first, which is slow; (2) not have to run SSH
     # port forwarding first (we'd just launch a local dashboard which would make
     # REST API calls to the controller dashboard server).
-    click.secho('Checking if jobs controller is up...', fg='yellow')
+    click.secho('Checking if jobs controller is up...', fg='cyan')
     hint = ('Dashboard is not available if jobs controller is not up. Run a '
             'managed job first.')
     backend_utils.is_controller_accessible(
@@ -4032,7 +4029,6 @@ def _generate_task_with_service(
         disk_size=disk_size,
         disk_tier=disk_tier,
         ports=ports,
-        entrypoint_name='Service',
     )
     if isinstance(task, sky.Dag):
         raise click.UsageError(
@@ -4197,7 +4193,7 @@ def serve_up(
         ports=ports,
         not_supported_cmd='sky serve up',
     )
-    click.secho('Service Spec:', fg='cyan')
+    click.secho('Service spec:', fg='cyan')
     click.echo(task.service)
 
     click.secho('Each replica will use the following resources (estimated):',
@@ -4315,7 +4311,7 @@ def serve_update(
         ports=ports,
         not_supported_cmd='sky serve update',
     )
-    click.secho('Service Spec:', fg='cyan')
+    click.secho('Service spec:', fg='cyan')
     click.echo(task.service)
 
     click.secho('New replica will use the following resources (estimated):',
@@ -4767,7 +4763,7 @@ def benchmark_launch(
                          'Please provide a YAML file.')
     assert config is not None, (is_yaml, config)
 
-    click.secho('Benchmarking a task from YAML spec: ', fg='yellow', nl=False)
+    click.secho('Benchmarking a task from YAML: ', fg='cyan', nl=False)
     click.secho(entrypoint, bold=True)
 
     candidates = _get_candidate_configs(entrypoint)
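
The CLI changes above replace single yellow status lines with a cyan label printed via nl=False, followed by the plain value on the same line. A minimal standalone sketch of that output pattern (the cluster name is a placeholder, not taken from the package):

    import click

    # Prints a cyan label and the plain value on one line, e.g.:
    #   Running on cluster: my-cluster
    cluster = 'my-cluster'  # placeholder value for illustration
    click.secho('Running on cluster: ', fg='cyan', nl=False)
    click.secho(cluster)
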
sky/execution.py CHANGED
@@ -259,8 +259,8 @@ def _execute(
         bold = colorama.Style.BRIGHT
         reset = colorama.Style.RESET_ALL
         logger.info(
-            f'{yellow}Launching an unmanaged spot task, which does not '
-            f'automatically recover from preemptions.{reset}\n{yellow}To '
+            f'{yellow}Launching a spot job that does not '
+            f'automatically recover from preemptions. To '
             'get automatic recovery, use managed job instead: '
             f'{reset}{bold}sky jobs launch{reset} {yellow}or{reset} '
             f'{bold}sky.jobs.launch(){reset}.')
sky/jobs/constants.py CHANGED
@@ -16,10 +16,18 @@ JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
 # We use 50 GB disk size to reduce the cost.
 CONTROLLER_RESOURCES = {'cpus': '4+', 'memory': '8x', 'disk_size': 50}
 
+# TODO(zhwu): This is no longer accurate, after #4592, which increases the
+# length of user hash appended to the cluster name from 4 to 8 chars. This makes
+# the cluster name on GCP being wrapped twice. However, we cannot directly
+# update this constant, because the job cluster cleanup and many other logic
+# in managed jobs depends on this constant, i.e., updating this constant will
+# break backward compatibility and existing jobs.
+#
 # Max length of the cluster name for GCP is 35, the user hash to be attached is
-# 4+1 chars, and we assume the maximum length of the job id is 4+1, so the max
-# length of the cluster name prefix is 25 to avoid the cluster name being too
-# long and truncated twice during the cluster creation.
+# 4(now 8)+1 chars, and we assume the maximum length of the job id is
+# 4(now 8)+1, so the max length of the cluster name prefix is 25(should be 21
+# now) to avoid the cluster name being too long and truncated twice during the
+# cluster creation.
 JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
 
 # The version of the lib files that jobs/utils use. Whenever there is an API
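
As a worked check of the arithmetic in the original comment above (reading "4+1" as the hash or job id plus one separator character), the 35-character GCP limit leaves 25 characters for the cluster name prefix. This is an illustration only, not code from the package:

    # Derivation of JOBS_CLUSTER_NAME_PREFIX_LENGTH as originally stated.
    GCP_MAX_CLUSTER_NAME_LEN = 35
    USER_HASH_LEN = 4 + 1  # 4-char user hash plus separator (8 chars after #4592)
    JOB_ID_LEN = 4 + 1     # assumed max job id length plus separator
    print(GCP_MAX_CLUSTER_NAME_LEN - USER_HASH_LEN - JOB_ID_LEN)  # 25
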
sky/optimizer.py CHANGED
@@ -884,10 +884,8 @@ class Optimizer:
             # Add a new line for better readability, when there are multiple
             # tasks.
             logger.info('')
-            logger.info(
-                f'{colorama.Style.BRIGHT}Considered resources {task_str}'
-                f'({task.num_nodes} node{plural}):'
-                f'{colorama.Style.RESET_ALL}')
+            logger.info(f'Considered resources {task_str}'
+                        f'({task.num_nodes} node{plural}):')
 
             # Only print 1 row per cloud.
             # The following code is to generate the table
sky/provision/instance_setup.py CHANGED
@@ -15,9 +15,12 @@ from sky.provision import docker_utils
 from sky.provision import logging as provision_logging
 from sky.provision import metadata_utils
 from sky.skylet import constants
+from sky.usage import constants as usage_constants
+from sky.usage import usage_lib
 from sky.utils import accelerator_registry
 from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import env_options
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
@@ -67,6 +70,30 @@ MAYBE_SKYLET_RESTART_CMD = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
                             'sky.skylet.attempt_skylet;')
 
 
+def _set_usage_run_id_cmd() -> str:
+    """Gets the command to set the usage run id.
+
+    The command saves the current usage run id to the file, so that the skylet
+    can use it to report the heartbeat.
+
+    We use a function instead of a constant so that the usage run id is the
+    latest one when the function is called.
+    """
+    return (
+        f'cat {usage_constants.USAGE_RUN_ID_FILE} || '
+        # The run id is retrieved locally for the current run, so that the
+        # remote cluster will be set with the same run id as the initial
+        # launch operation.
+        f'echo "{usage_lib.messages.usage.run_id}" > '
+        f'{usage_constants.USAGE_RUN_ID_FILE}')
+
+
+def _set_skypilot_env_var_cmd() -> str:
+    """Sets the skypilot environment variables on the remote machine."""
+    env_vars = env_options.Options.all_options()
+    return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
+
+
 def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True):
     """Decorator that retries the function if it fails.
 
@@ -450,11 +477,17 @@ def start_skylet_on_head_node(cluster_name: str,
     logger.info(f'Running command on head node: {MAYBE_SKYLET_RESTART_CMD}')
     # We need to source bashrc for skylet to make sure the autostop event can
     # access the path to the cloud CLIs.
-    returncode, stdout, stderr = head_runner.run(MAYBE_SKYLET_RESTART_CMD,
-                                                 stream_logs=False,
-                                                 require_outputs=True,
-                                                 log_path=log_path_abs,
-                                                 source_bashrc=True)
+    set_usage_run_id_cmd = _set_usage_run_id_cmd()
+    # Set the skypilot environment variables, including the usage type, debug
+    # info, and other options.
+    set_skypilot_env_var_cmd = _set_skypilot_env_var_cmd()
+    returncode, stdout, stderr = head_runner.run(
+        f'{set_usage_run_id_cmd}; {set_skypilot_env_var_cmd}; '
+        f'{MAYBE_SKYLET_RESTART_CMD}',
+        stream_logs=False,
+        require_outputs=True,
+        log_path=log_path_abs,
+        source_bashrc=True)
     if returncode:
         raise RuntimeError('Failed to start skylet on the head node '
                            f'(exit code {returncode}). Error: '
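
For context on the two helpers added above, here is a minimal sketch of the shell snippets they compose in front of the skylet restart command. The option names and run id below are placeholders; only the '~/.sky/usage_run_id' path comes from this diff (USAGE_RUN_ID_FILE in sky/usage/constants.py):

    # Illustration only: mimics _set_usage_run_id_cmd and _set_skypilot_env_var_cmd.
    run_id_file = '~/.sky/usage_run_id'  # USAGE_RUN_ID_FILE
    run_id = 'run-1234'                  # placeholder for usage_lib.messages.usage.run_id
    set_usage_run_id_cmd = (f'cat {run_id_file} || '
                            f'echo "{run_id}" > {run_id_file}')

    env_vars = {'SKYPILOT_DEBUG': False}  # hypothetical option key/value
    set_env_cmd = '; '.join(f'export {k}={v}' for k, v in env_vars.items())

    print(f'{set_usage_run_id_cmd}; {set_env_cmd}; <skylet restart cmd>')
    # cat ~/.sky/usage_run_id || echo "run-1234" > ~/.sky/usage_run_id; export SKYPILOT_DEBUG=False; <skylet restart cmd>
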
sky/provision/kubernetes/utils.py CHANGED
@@ -2178,52 +2178,54 @@ def get_kubernetes_node_info(
 
     lf, _ = detect_gpu_label_formatter(context)
     if not lf:
-        label_key = None
+        label_keys = []
     else:
         label_keys = lf.get_label_keys()
 
     node_info_dict: Dict[str, KubernetesNodeInfo] = {}
 
-    for label_key in label_keys:
-        for node in nodes:
-            allocated_qty = 0
+    for node in nodes:
+        accelerator_name = None
+        # Determine the accelerator name from the node labels and pick the
+        # first one found. We assume that the node has only one accelerator type
+        # (e.g., either GPU or TPU).
+        for label_key in label_keys:
             if lf is not None and label_key in node.metadata.labels:
                 accelerator_name = lf.get_accelerator_from_label_value(
                     node.metadata.labels.get(label_key))
-            else:
-                accelerator_name = None
+                break
 
-            accelerator_count = get_node_accelerator_count(
-                node.status.allocatable)
+        allocated_qty = 0
+        accelerator_count = get_node_accelerator_count(node.status.allocatable)
 
-            if pods is None:
-                accelerators_available = -1
+        if pods is None:
+            accelerators_available = -1
 
-            else:
-                for pod in pods:
-                    # Get all the pods running on the node
-                    if (pod.spec.node_name == node.metadata.name and
-                            pod.status.phase in ['Running', 'Pending']):
-                        # Iterate over all the containers in the pod and sum the
-                        # GPU requests
-                        for container in pod.spec.containers:
-                            if container.resources.requests:
-                                allocated_qty += get_node_accelerator_count(
-                                    container.resources.requests)
-
-                accelerators_available = accelerator_count - allocated_qty
-
-            # Exclude multi-host TPUs from being processed.
-            # TODO(Doyoung): Remove the logic when adding support for
-            # multi-host TPUs.
-            if is_multi_host_tpu(node.metadata.labels):
-                continue
+        else:
+            for pod in pods:
+                # Get all the pods running on the node
+                if (pod.spec.node_name == node.metadata.name and
+                        pod.status.phase in ['Running', 'Pending']):
+                    # Iterate over all the containers in the pod and sum the
+                    # GPU requests
+                    for container in pod.spec.containers:
+                        if container.resources.requests:
+                            allocated_qty += get_node_accelerator_count(
+                                container.resources.requests)
+
+            accelerators_available = accelerator_count - allocated_qty
+
+        # Exclude multi-host TPUs from being processed.
+        # TODO(Doyoung): Remove the logic when adding support for
+        # multi-host TPUs.
+        if is_multi_host_tpu(node.metadata.labels):
+            continue
 
-            node_info_dict[node.metadata.name] = KubernetesNodeInfo(
-                name=node.metadata.name,
-                accelerator_type=accelerator_name,
-                total={'accelerator_count': int(accelerator_count)},
-                free={'accelerators_available': int(accelerators_available)})
+        node_info_dict[node.metadata.name] = KubernetesNodeInfo(
+            name=node.metadata.name,
+            accelerator_type=accelerator_name,
+            total={'accelerator_count': int(accelerator_count)},
+            free={'accelerators_available': int(accelerators_available)})
 
     return node_info_dict
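
The refactor above walks each node once and scans the known accelerator label keys, taking the first key present on the node and breaking out of the loop. A standalone sketch of that lookup pattern, with invented label keys and node labels (not the real label-formatter API):

    # 'First matching label wins' lookup, as used in get_kubernetes_node_info above.
    label_keys = ['example.com/gpu', 'example.com/tpu']          # invented keys
    node_labels = {'example.com/gpu': 'A100', 'arch': 'amd64'}   # invented node labels

    accelerator_name = None
    for label_key in label_keys:
        if label_key in node_labels:
            accelerator_name = node_labels[label_key]  # first hit wins
            break
    print(accelerator_name)  # A100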
 
sky/skylet/events.py CHANGED
@@ -20,6 +20,7 @@ from sky.serve import serve_utils
 from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.skylet import job_lib
+from sky.usage import usage_lib
 from sky.utils import cluster_yaml_utils
 from sky.utils import common_utils
 from sky.utils import ux_utils
@@ -90,6 +91,14 @@ class ServiceUpdateEvent(SkyletEvent):
         serve_utils.update_service_status()
 
 
+class UsageHeartbeatReportEvent(SkyletEvent):
+    """Skylet event for reporting usage."""
+    EVENT_INTERVAL_SECONDS = 600
+
+    def _run(self):
+        usage_lib.send_heartbeat(interval_seconds=self.EVENT_INTERVAL_SECONDS)
+
+
 class AutostopEvent(SkyletEvent):
     """Skylet event for autostop.
 
sky/skylet/skylet.py CHANGED
@@ -25,6 +25,8 @@ EVENTS = [
     # unhealthy, this event will correctly update the controller
     # status to CONTROLLER_FAILED.
     events.ServiceUpdateEvent(),
+    # Report usage heartbeat every 10 minutes.
+    events.UsageHeartbeatReportEvent(),
 ]
 
 while True:
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -373,15 +373,16 @@ available_node_types:
               done;
               if [ ! -z "$INSTALL_FIRST" ]; then
                 echo "Installing core packages: $INSTALL_FIRST";
-                DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $INSTALL_FIRST;
+                DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $INSTALL_FIRST;
               fi;
               # SSH and other packages are not necessary, so we disable set -e
               set +e
 
               if [ ! -z "$MISSING_PACKAGES" ]; then
                 echo "Installing missing packages: $MISSING_PACKAGES";
-                DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES;
+                DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $MISSING_PACKAGES;
               fi;
+
               $(prefix_cmd) mkdir -p /var/run/sshd;
               $(prefix_cmd) sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config;
               $(prefix_cmd) sed "s@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g" -i /etc/pam.d/sshd;
sky/usage/constants.py CHANGED
@@ -3,7 +3,6 @@
 LOG_URL = 'http://usage.skypilot.co:9090/loki/api/v1/push'  # pylint: disable=line-too-long
 
 USAGE_MESSAGE_SCHEMA_VERSION = 1
-
 PRIVACY_POLICY_PATH = '~/.sky/privacy_policy'
 
 USAGE_POLICY_MESSAGE = (
@@ -15,3 +14,5 @@ USAGE_POLICY_MESSAGE = (
 
 USAGE_MESSAGE_REDACT_KEYS = ['setup', 'run', 'envs']
 USAGE_MESSAGE_REDACT_TYPES = {str, dict}
+
+USAGE_RUN_ID_FILE = '~/.sky/usage_run_id'
sky/usage/usage_lib.py CHANGED
@@ -44,6 +44,7 @@ def _get_current_timestamp_ns() -> int:
 class MessageType(enum.Enum):
     """Types for messages to be sent to Loki."""
     USAGE = 'usage'
+    HEARTBEAT = 'heartbeat'
     # TODO(zhwu): Add more types, e.g., cluster_lifecycle.
 
 
@@ -67,8 +68,9 @@ class MessageToReport:
         properties = self.__dict__.copy()
         return {k: v for k, v in properties.items() if not k.startswith('_')}
 
-    def __repr__(self):
-        raise NotImplementedError
+    def __repr__(self) -> str:
+        d = self.get_properties()
+        return json.dumps(d)
 
 
 class UsageMessageToReport(MessageToReport):
@@ -160,10 +162,6 @@ class UsageMessageToReport(MessageToReport):
         self.exception: Optional[str] = None  # entrypoint_context
         self.stacktrace: Optional[str] = None  # entrypoint_context
 
-    def __repr__(self) -> str:
-        d = self.get_properties()
-        return json.dumps(d)
-
     def update_entrypoint(self, msg: str):
         self.entrypoint = msg
 
@@ -275,16 +273,43 @@ class UsageMessageToReport(MessageToReport):
                                               name_or_fn)
 
 
+class HeartbeatMessageToReport(MessageToReport):
+    """Message to be reported to Grafana Loki for heartbeat on a cluster."""
+
+    def __init__(self, interval_seconds: int = 600):
+        super().__init__(constants.USAGE_MESSAGE_SCHEMA_VERSION)
+        # This interval_seconds is mainly for recording the heartbeat interval
+        # in the heartbeat message, so that the collector can use it.
+        self.interval_seconds = interval_seconds
+
+    def get_properties(self) -> Dict[str, Any]:
+        properties = super().get_properties()
+        # The run id is set by the skylet, which will always be the same for
+        # the entire lifetime of the run.
+        with open(os.path.expanduser(constants.USAGE_RUN_ID_FILE),
+                  'r',
+                  encoding='utf-8') as f:
+            properties['run_id'] = f.read().strip()
+        return properties
+
+
 class MessageCollection:
     """A collection of messages."""
 
     def __init__(self):
-        self._messages = {MessageType.USAGE: UsageMessageToReport()}
+        self._messages = {
+            MessageType.USAGE: UsageMessageToReport(),
+            MessageType.HEARTBEAT: HeartbeatMessageToReport()
+        }
 
     @property
-    def usage(self):
+    def usage(self) -> UsageMessageToReport:
         return self._messages[MessageType.USAGE]
 
+    @property
+    def heartbeat(self) -> HeartbeatMessageToReport:
+        return self._messages[MessageType.HEARTBEAT]
+
     def reset(self, message_type: MessageType):
         self._messages[message_type] = self._messages[message_type].__class__()
 
@@ -308,13 +333,25 @@ def _send_to_loki(message_type: MessageType):
 
     message = messages[message_type]
 
+    # In case the message has no start time, set it to the current time.
+    message.start()
     message.send_time = _get_current_timestamp_ns()
-    log_timestamp = message.start_time
+    # Use send time instead of start time to avoid the message being dropped
+    # by Loki, due to the timestamp being too old. We still have the start time
+    # in the message for dashboard.
+    log_timestamp = message.send_time
 
     environment = 'prod'
     if env_options.Options.IS_DEVELOPER.get():
         environment = 'dev'
-    prom_labels = {'type': message_type.value, 'environment': environment}
+    prom_labels = {
+        'type': message_type.value,
+        'environment': environment,
+        'schema_version': message.schema_version,
+    }
+    if message_type == MessageType.USAGE:
+        prom_labels['new_cluster'] = (message.original_cluster_status != 'UP'
+                                      and message.final_cluster_status == 'UP')
 
     headers = {'Content-type': 'application/json'}
     payload = {
@@ -392,7 +429,7 @@ def prepare_json_from_yaml_config(
 def _send_local_messages():
     """Send all messages not been uploaded to Loki."""
     for msg_type, message in messages.items():
-        if not message.message_sent:
+        if not message.message_sent and msg_type != MessageType.HEARTBEAT:
             # Avoid the fallback entrypoint to send the message again
             # in normal case.
             try:
@@ -402,6 +439,11 @@ def _send_local_messages():
                              f'exception caught: {type(e)}({e})')
 
 
+def send_heartbeat(interval_seconds: int = 600):
+    messages.heartbeat.interval_seconds = interval_seconds
+    _send_to_loki(MessageType.HEARTBEAT)
+
+
 @contextlib.contextmanager
 def entrypoint_context(name: str, fallback: bool = False):
     """Context manager for entrypoint.
sky/utils/env_options.py CHANGED
@@ -1,6 +1,7 @@
 """Global environment options for sky."""
 import enum
 import os
+from typing import Dict
 
 
 class Options(enum.Enum):
@@ -35,3 +36,8 @@ class Options(enum.Enum):
     def env_key(self) -> str:
         """The environment variable key name."""
         return self.value[0]
+
+    @classmethod
+    def all_options(cls) -> Dict[str, bool]:
+        """Returns all options as a dictionary."""
+        return {option.env_key: option.get() for option in list(Options)}
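
Taken together, the changes above form the new heartbeat path: the launch step writes the usage run id to ~/.sky/usage_run_id on the head node and exports the env options, the skylet's UsageHeartbeatReportEvent fires every 600 seconds, and send_heartbeat() attaches the run id read back from that file. A minimal sketch of the run-id round trip, using a temporary file in place of the real ~/.sky/usage_run_id:

    import os
    import tempfile

    run_id_file = os.path.join(tempfile.mkdtemp(), 'usage_run_id')  # stand-in path
    run_id = 'run-1234'  # placeholder for the locally generated usage run id

    # Launch side: persist the run id once (mirrors _set_usage_run_id_cmd).
    if not os.path.exists(run_id_file):
        with open(run_id_file, 'w', encoding='utf-8') as f:
            f.write(run_id)

    # Heartbeat side: read it back when building the message properties
    # (mirrors HeartbeatMessageToReport.get_properties).
    with open(run_id_file, 'r', encoding='utf-8') as f:
        properties = {'run_id': f.read().strip(), 'interval_seconds': 600}
    print(properties)  # {'run_id': 'run-1234', 'interval_seconds': 600}
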
skypilot_nightly-1.0.0.dev20250212.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: skypilot-nightly
-Version: 1.0.0.dev20250210
+Version: 1.0.0.dev20250212
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
skypilot_nightly-1.0.0.dev20250212.dist-info/RECORD CHANGED
@@ -1,15 +1,15 @@
-sky/__init__.py,sha256=Ouqj97DeXk38Du569MHZa8UrRMi6Yuvwjy_FmuN5sVk,5560
+sky/__init__.py,sha256=GRvhpT8lUvIyIPLyC2cqv9RiI6hZ0_iMY56YNDpjPbs,5560
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=MNc9uHnvQ1EsEl8SsrYcYCGbxcnDbR6gaRCXVNd5RZE,22338
 sky/check.py,sha256=xzLlxUkBCrzpOho8lw65EvKLPl_b9lA2nteF5MSYbDQ,10885
-sky/cli.py,sha256=B-YWYiKnfSGdSOXtAY8SRGOGhneUeNPBjXFZ0FuLZ8w,214131
+sky/cli.py,sha256=_Q-XlsLN73e8BJilClajL7VOG8vINVJ_xRjENOpJdDA,213928
 sky/cloud_stores.py,sha256=PcLT57_8SZy7o6paAluElfBynaLkbaOq3l-8dNg1AVM,23672
 sky/core.py,sha256=fE1rn4Ku94S0XmWTO5-6t6eT6aaJImNczRqEnTe8v7Q,38742
 sky/dag.py,sha256=f3sJlkH4bE6Uuz3ozNtsMhcBpRx7KmC9Sa4seDKt4hU,3104
 sky/exceptions.py,sha256=SEhRubPlk-crkflPC5P_Z085iLrSd3UScYwc790QwYw,9378
-sky/execution.py,sha256=dpbk1kGRkGHT0FCJKGvjqeV3qIGEN2K20NDZbVrcAvI,28483
+sky/execution.py,sha256=vNUE9Z8hCSQeil7h3kdote2r6nkbrGXSqqmK6ru594Q,28453
 sky/global_user_state.py,sha256=cTwltMCDIIBaapuGgARxFwpDJDCiKKyVW-PP_qtWuCA,30241
-sky/optimizer.py,sha256=d5BPAEZVrS3a2oBclSwo8MWkHQKQ3u4tcyawOANN0_0,59836
+sky/optimizer.py,sha256=H5cpKELOQmnFpox0QXMB4P7jGhJxzXog4Ht_TYJaGuA,59758
 sky/resources.py,sha256=D3jteQxKOUydoNm7VDl90p02dwP3RpbO3gqNcl4dpOI,70327
 sky/sky_logging.py,sha256=7Zk9mL1TDxFkGsy3INMBKYlqsbognVGSMzAsHZdZlhw,5891
 sky/skypilot_config.py,sha256=FN93hSG-heQCHBnemlIK2TwrJngKbpx4vMXNUzPIzV8,9087
@@ -101,7 +101,7 @@ sky/data/mounting_utils.py,sha256=tJHBPEDP1Wg_r3oSGBwFhMDLnPCMPSFRz26O0QkDd0Y,14
 sky/data/storage.py,sha256=CWVKnHhdzXw1biPbRqYizkyVexL_OCELuJCqtd4hit4,204094
 sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
 sky/jobs/__init__.py,sha256=ObZcz3lL1ip8JcmR6gbfZ4RMMfXJJdsnuU2zLQUb8jY,1546
-sky/jobs/constants.py,sha256=6RphkJ6pmafQ7XYW5qwId1Zvqb99HJelA9kgrgfNR7o,1421
+sky/jobs/constants.py,sha256=9kIdpwWNI9zWKQO39LTg9spUMGl5Iqx4ByIjRlic7Hw,1893
 sky/jobs/controller.py,sha256=cX8kGplwa-0Te_ihUfzzOr-TRs_Fw6UdFPm6mrtSE0c,28548
 sky/jobs/core.py,sha256=b9aJB90AxUdhoasSxsWBoD-mQY1MmC05FbPbtyFMzHI,19154
 sky/jobs/recovery_strategy.py,sha256=49H1ca5N4bIJ3W4iqurxzSvJE0dIihPt2XnstboxUm4,26370
@@ -115,7 +115,7 @@ sky/provision/__init__.py,sha256=hb_z69_7-FH1I8aDpFKNj2x_a8spzceWcovklutNgP8,637
 sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
 sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
 sky/provision/docker_utils.py,sha256=ENm0LkyrYWic3Ikyacho8X5uDMvGsbkZQsb6kNH1DuI,19629
-sky/provision/instance_setup.py,sha256=8Pudbpke6ah0xufr2UwtsDnNZ64-aAYkz8M44ZA0huI,23218
+sky/provision/instance_setup.py,sha256=YBFOwZQLBzpUjYoVQcX0KItej1rCBRWM23Dw9lg_q24,24386
 sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
 sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
 sky/provision/provisioner.py,sha256=ZOgFOO0NB4QZVPwd4qikRqi615Bq67n0Vcl3cTDVxNE,29153
@@ -153,7 +153,7 @@ sky/provision/kubernetes/config.py,sha256=bXwOGdSAnXCkDreew0KsSUqSv3ZrptNeevqat7
 sky/provision/kubernetes/instance.py,sha256=AQikdRgNklpeMgiEd4w2Hh7kGssVABsy0aCh9xsKi5Y,50313
 sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
 sky/provision/kubernetes/network_utils.py,sha256=52BZY_5ynCH6IXlivKObYyAHDgQCJyAJIjmM7J4MpFo,11393
-sky/provision/kubernetes/utils.py,sha256=4kSEx6NZB3MAsDqCxLO-elo7EO6Coh-9wypwVqs3jgk,109895
+sky/provision/kubernetes/utils.py,sha256=swOe6ozgSoucDtoJCExs0HLLWYuoi5HkIGMMSp7fEzc,109962
 sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
 sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
 sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
@@ -216,11 +216,11 @@ sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,
 sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,4478
 sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
 sky/skylet/constants.py,sha256=EUSW4yH59eqBDLMIdmQWIYd3nAJBFoUeo5v9MGiginI,16057
-sky/skylet/events.py,sha256=0bOjUYpphuAficD9wDB5NOan2vwJDaRqdnm4sl0RK0U,12535
+sky/skylet/events.py,sha256=__7bt6Z8q2W1vwTQv4yug-oAXDwSf8zBeRxb8HFM36U,12792
 sky/skylet/job_lib.py,sha256=Rk-C069cusJIRXsks8xqCb016JSt7GlpU7LrpX0qFJk,42785
 sky/skylet/log_lib.py,sha256=oFEBd85vDYFrIyyZKekH30yc4rRYILC0F0o-COQ64oE,20445
 sky/skylet/log_lib.pyi,sha256=rRk4eUX0RHGs1QL9CXsJq6RE7FqqxZlfuPJOLXTvg7I,4453
-sky/skylet/skylet.py,sha256=Tpv4yYR3jwxZsYeFPexB1gS1bCL5_AAfPzGKLsknPhA,1147
+sky/skylet/skylet.py,sha256=mWmqCvxSlfdVU_L8NL6P52jmCt3smd8K0HdyNBfMPeI,1234
 sky/skylet/subprocess_daemon.py,sha256=gcL-_Hea7-SrBUyZfAbo40RBFbaeuBmPCW0dm4YYkPo,3537
 sky/skylet/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/skylet/providers/command_runner.py,sha256=DdBKP0QX325_N3zAVYwnmXmfbfXNqkzWQZpF9DSR7Go,16259
@@ -250,7 +250,7 @@ sky/templates/jobs-controller.yaml.j2,sha256=FfagMkhXZdUWR6HtJHJ3JEZzJy4eov5CQZH
 sky/templates/kubernetes-ingress.yml.j2,sha256=73iDklVDWBMbItg0IexCa6_ClXPJOxw7PWz3leku4nE,1340
 sky/templates/kubernetes-loadbalancer.yml.j2,sha256=IxrNYM366N01bbkJEbZ_UPYxUP8wyVEbRNFHRsBuLsw,626
 sky/templates/kubernetes-port-forward-proxy-command.sh,sha256=iw7mypHszg6Ggq9MbyiYMFOkSlXaQZulaxqC5IWYGCc,3381
-sky/templates/kubernetes-ray.yml.j2,sha256=EHUDvALvhaPB44U7cdgXStV6v8Qh8yn5J4T6XFnmZoM,28856
+sky/templates/kubernetes-ray.yml.j2,sha256=x3Eq1ejG577E6eAZtJvpTlzXRCW5beMhqApV3J8BEZY,29019
 sky/templates/kubernetes-ssh-jump.yml.j2,sha256=k5W5sOIMppU7dDkJMwPlqsUcb92y7L5_TVG3hkgMy8M,2747
 sky/templates/lambda-ray.yml.j2,sha256=HyvO_tX2vxwSsc4IFVSqGuIbjLMk0bevP9bcxb8ZQII,4498
 sky/templates/local-ray.yml.j2,sha256=FNHeyHF6nW9nU9QLIZceUWfvrFTTcO51KqhTnYCEFaA,1185
@@ -262,8 +262,8 @@ sky/templates/sky-serve-controller.yaml.j2,sha256=W4i1-OGRU2WDvauLC4EDXcYrNxj7mz
 sky/templates/vast-ray.yml.j2,sha256=KaZLBJfI6FzAVRVq0NNM0_SN0RQUrDIehnJJ_LnvwnY,2990
 sky/templates/vsphere-ray.yml.j2,sha256=cOQ-qdpxGA2FHajMMhTJI-SmlYzdPterX4Gsiq-nkb0,3587
 sky/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sky/usage/constants.py,sha256=8xpg9vhDU9A3eObtpkNFjwa42oCazqGEv4yw_vJSO7U,590
-sky/usage/usage_lib.py,sha256=rjsekywo8IB_lJwRKBaWfQZ_znUJ-mIu1b9iWkCog88,18211
+sky/usage/constants.py,sha256=k7PQ-QP1p3tDgnzvy7QoxJjuTXWDUyVkbtPcIEvDsYM,632
+sky/usage/usage_lib.py,sha256=jpRt-24WVxYyd-XJz3_lSHboUKmWy8x8lRvvO-JO68g,20026
 sky/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/utils/accelerator_registry.py,sha256=cpPS9_MahQPt0ev4qPT-qyGpe12YD78UNj_gAvt720Q,4052
 sky/utils/admin_policy_utils.py,sha256=_Vt_jTTYCXmMdryj0vrrumFPewa93qHnzUqBDXjAhRU,5981
@@ -275,7 +275,7 @@ sky/utils/control_master_utils.py,sha256=90hnxiAUP20gbJ9e3MERh7rb04ZO_I3LsljNjR2
 sky/utils/controller_utils.py,sha256=SUrhK46ouBH2rm7azfFLIWr-T9-voYAdiXl2z5fG4Qw,45948
 sky/utils/dag_utils.py,sha256=l_0O3RUfe9OdQ9mtbhdlHpJVD4VAF_HQ3A75dgsYIjM,6099
 sky/utils/db_utils.py,sha256=K2-OHPg0FeHCarevMdWe0IWzm6wWumViEeYeJuGoFUE,3747
-sky/utils/env_options.py,sha256=E5iwRFBUY2Iq6e0y0c1Mv5OSQ4MRNdk0-p38xUyVerc,1366
+sky/utils/env_options.py,sha256=aaD6GoYK0LaZIqjOEZ-R7eccQuiRriW3EuLWtOI5En8,1578
 sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM,1384
 sky/utils/log_utils.py,sha256=AjkgSrk0GVOUbnnCEC2f4lsf2HOIXkZETCxR0BJw2-U,14152
 sky/utils/resources_utils.py,sha256=06Kx6AfbBdwBYGmIYFEY_qm6OBc2a5esZMPvIX7gCvc,7787
@@ -298,9 +298,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
 sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
 sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
-skypilot_nightly-1.0.0.dev20250210.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
-skypilot_nightly-1.0.0.dev20250210.dist-info/METADATA,sha256=iWffm0fzj79Ph1ttJ-A0Mj2_DZRE7f_02ZXpFAMGW7M,21397
-skypilot_nightly-1.0.0.dev20250210.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-skypilot_nightly-1.0.0.dev20250210.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
-skypilot_nightly-1.0.0.dev20250210.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
-skypilot_nightly-1.0.0.dev20250210.dist-info/RECORD,,
+skypilot_nightly-1.0.0.dev20250212.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20250212.dist-info/METADATA,sha256=rkJIHWHxQtacqsQPb5SZ7XHCGiXMvMBzXNPupXqi4sU,21397
+skypilot_nightly-1.0.0.dev20250212.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+skypilot_nightly-1.0.0.dev20250212.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20250212.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20250212.dist-info/RECORD,,