skypilot-nightly 1.0.0.dev20241111-py3-none-any.whl → 1.0.0.dev20241113-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +1 -0
- sky/cli.py +22 -6
- sky/clouds/cloud.py +2 -0
- sky/clouds/kubernetes.py +19 -3
- sky/clouds/service_catalog/kubernetes_catalog.py +102 -61
- sky/clouds/utils/gcp_utils.py +5 -1
- sky/jobs/core.py +2 -0
- sky/optimizer.py +2 -0
- sky/provision/__init__.py +2 -0
- sky/provision/kubernetes/instance.py +125 -55
- sky/provision/kubernetes/utils.py +361 -102
- sky/resources.py +38 -27
- sky/serve/serve_utils.py +79 -78
- sky/skylet/log_lib.py +1 -4
- sky/templates/kubernetes-ray.yml.j2 +29 -3
- sky/utils/kubernetes/generate_kubeconfig.sh +3 -0
- sky/utils/kubernetes/gpu_labeler.py +2 -2
- sky/utils/log_utils.py +52 -1
- sky/utils/timeline.py +3 -1
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/RECORD +26 -26
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/top_level.txt +0 -0
sky/resources.py
CHANGED
@@ -14,6 +14,7 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.clouds import service_catalog
 from sky.provision import docker_utils
+from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.skylet import constants
 from sky.utils import accelerator_registry
 from sky.utils import common_utils
@@ -582,36 +583,46 @@ class Resources:
             acc, _ = list(accelerators.items())[0]
             if 'tpu' in acc.lower():
                 if self.cloud is None:
+                    if kubernetes_utils.is_tpu_on_gke(acc):
+                        self._cloud = clouds.Kubernetes()
+                    else:
+                        self._cloud = clouds.GCP()
+                assert (self.cloud.is_same_cloud(clouds.GCP()) or
+                        self.cloud.is_same_cloud(clouds.Kubernetes())), (
+                            'Cloud must be GCP or Kubernetes for TPU '
+                            'accelerators.')
+
                 if accelerator_args is None:
                     accelerator_args = {}
+
                 use_tpu_vm = accelerator_args.get('tpu_vm', True)
-                if self.
-                return '
+                if (self.cloud.is_same_cloud(clouds.GCP()) and
+                        not kubernetes_utils.is_tpu_on_gke(acc)):
+                    if 'runtime_version' not in accelerator_args:
+
+                        def _get_default_runtime_version() -> str:
+                            if not use_tpu_vm:
+                                return '2.12.0'
+                            # TPU V5 requires a newer runtime version.
+                            if acc.startswith('tpu-v5'):
+                                return 'v2-alpha-tpuv5'
+                            # TPU V6e requires a newer runtime version.
+                            elif acc.startswith('tpu-v6e'):
+                                return 'v2-alpha-tpuv6e'
+                            return 'tpu-vm-base'
+
+                        accelerator_args['runtime_version'] = (
+                            _get_default_runtime_version())
+                        logger.info(
+                            'Missing runtime_version in accelerator_args, using'
+                            f' default ({accelerator_args["runtime_version"]})')
+
+                if self.instance_type is not None and use_tpu_vm:
+                    if self.instance_type != 'TPU-VM':
+                        with ux_utils.print_exception_no_traceback():
+                            raise ValueError(
+                                'Cannot specify instance type (got '
+                                f'{self.instance_type!r}) for TPU VM.')
 
         self._accelerators = accelerators
         self._accelerator_args = accelerator_args
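For context, the hunk above changes how a TPU request with no cloud set is routed, and how `accelerator_args['runtime_version']` is defaulted. A minimal usage sketch of the public constructor (accelerator names and argument values here are illustrative; which names count as GKE TPUs is decided by `kubernetes_utils.is_tpu_on_gke`):

    import sky

    # With no cloud set, a Cloud TPU accelerator is routed to GCP and, for a
    # TPU VM, runtime_version defaults to 'tpu-vm-base' ('v2-alpha-tpuv5' /
    # 'v2-alpha-tpuv6e' for the newer generations).
    res = sky.Resources(accelerators='tpu-v2-8')

    # Pinning the cloud explicitly still works; only GCP and Kubernetes pass
    # the new assertion for TPU accelerators.
    res_pinned = sky.Resources(cloud=sky.GCP(),
                               accelerators='tpu-v2-8',
                               accelerator_args={'tpu_vm': False,
                                                 'runtime_version': '2.12.0'})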
sky/serve/serve_utils.py
CHANGED
@@ -592,15 +592,26 @@ def get_latest_version_with_min_replicas(
 
 
 def _follow_replica_logs(
+    file: TextIO,
+    cluster_name: str,
+    *,
+    should_stop: Callable[[], bool],
+    stop_on_eof: bool = False,
+    idle_timeout_seconds: Optional[int] = None,
+) -> Iterator[str]:
+    """Follows logs for a replica, handling nested log files.
+
+    Args:
+        file: Log file to read from.
+        cluster_name: Name of the cluster being launched.
+        should_stop: Callback that returns True when streaming should stop.
+        stop_on_eof: If True, stop when reaching end of file.
+        idle_timeout_seconds: If set, stop after these many seconds without
+            new content.
+
+    Yields:
+        Log lines from the main file and any nested log files.
+    """
 
     def cluster_is_up() -> bool:
         cluster_record = global_user_state.get_cluster_from_name(cluster_name)
@@ -608,51 +619,52 @@ def _follow_replica_logs(
             return False
         return cluster_record['status'] == status_lib.ClusterStatus.UP
 
+    def process_line(line: str) -> Iterator[str]:
+        # Tailing detailed progress for user. All logs in skypilot is
+        # of format `To view detailed progress: tail -n100 -f *.log`.
+        # Check if the line is directing users to view logs
+        provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
+        other_log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
+
+        if provision_log_prompt is not None:
+            nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
+            with open(nested_log_path, 'r', newline='', encoding='utf-8') as f:
+                # We still exit if more than 10 seconds without new content
+                # to avoid any internal bug that causes the launch to fail
+                # while cluster status remains INIT.
+                # Originally, we output the next line first before printing
+                # the launching logs. Since the next line is always
+                # `Launching on <cloud> <region> (<zone>)`, we output it first
+                # to indicate the process is starting.
+                # TODO(andyl): After refactor #4323, the above logic is broken,
+                # but coincidentally with the new UX 3.0, the `Cluster launched`
+                # message is printed first, making the output appear correct.
+                # Explaining this since it's technically a breaking change
+                # for this refactor PR #4323. Will remove soon in a fix PR
+                # for adapting the serve.follow_logs to the new UX.
+                yield from _follow_replica_logs(f,
+                                                cluster_name,
+                                                should_stop=cluster_is_up,
+                                                stop_on_eof=stop_on_eof,
+                                                idle_timeout_seconds=10)
+            return
+
+        if other_log_prompt is not None:
+            # Now we skip other logs (file sync logs) since we lack
+            # utility to determine when these log files are finished
+            # writing.
+            # TODO(tian): We should not skip these logs since there are
+            # small chance that error will happen in file sync. Need to
+            # find a better way to do this.
+            return
+
+        yield line
+
+    return log_utils.follow_logs(file,
+                                 should_stop=should_stop,
+                                 stop_on_eof=stop_on_eof,
+                                 process_line=process_line,
+                                 idle_timeout_seconds=idle_timeout_seconds)
 
 
 def stream_replica_logs(service_name: str, replica_id: int,
@@ -687,14 +699,17 @@ def stream_replica_logs(service_name: str, replica_id: int,
         raise ValueError(
             _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id))
 
+    replica_provisioned = (
         lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
     with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
-        for line in _follow_replica_logs(
+        for line in _follow_replica_logs(
+                f,
+                replica_cluster_name,
+                should_stop=replica_provisioned,
+                stop_on_eof=not follow,
+        ):
             print(line, end='', flush=True)
+
     if (not follow and
             _get_replica_status() == serve_state.ReplicaStatus.PROVISIONING):
         # Early exit if not following the logs.
@@ -719,22 +734,6 @@ def stream_replica_logs(service_name: str, replica_id: int,
     return ''
 
 
-def _follow_logs(file: TextIO, *, finish_stream: Callable[[], bool],
-                 exit_if_stream_end: bool) -> Iterator[str]:
-    line = ''
-    while True:
-        tmp = file.readline()
-        if tmp is not None and tmp != '':
-            line += tmp
-            if '\n' in line or '\r' in line:
-                yield line
-                line = ''
-        else:
-            if exit_if_stream_end or finish_stream():
-                break
-            time.sleep(1)
-
-
 def stream_serve_process_logs(service_name: str, stream_controller: bool,
                               follow: bool) -> str:
     msg = check_service_status_healthy(service_name)
@@ -753,9 +752,11 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
 
     with open(os.path.expanduser(log_file), 'r', newline='',
              encoding='utf-8') as f:
-        for line in
+        for line in log_utils.follow_logs(
+                f,
+                should_stop=_service_is_terminal,
+                stop_on_eof=not follow,
+        ):
            print(line, end='', flush=True)
     return ''
 
sky/skylet/log_lib.py
CHANGED
@@ -320,11 +320,8 @@ def run_bash_command_with_log(bash_command: str,
     # Need this `-i` option to make sure `source ~/.bashrc` work.
     inner_command = f'/bin/bash -i {script_path}'
 
-    subprocess_cmd: Union[str, List[str]]
-    subprocess_cmd = inner_command
-
     return run_with_log(
+        inner_command,
         log_path,
         stream_logs=stream_logs,
         with_ray=with_ray,
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -283,12 +283,15 @@ available_node_types:
 
     restartPolicy: Never
 
-    # Add node selector if
+    # Add node selector if GPU/TPUs are requested:
     {% if (k8s_acc_label_key is not none and k8s_acc_label_value is not none) or (k8s_spot_label_key is not none) %}
     nodeSelector:
       {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %}
       {{k8s_acc_label_key}}: {{k8s_acc_label_value}}
       {% endif %}
+      {% if k8s_topology_label_key is not none and k8s_topology_label_value is not none %}
+      {{k8s_topology_label_key}}: {{k8s_topology_label_value}}
+      {% endif %}
       {% if k8s_spot_label_key is not none %}
       {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
       {% endif %}
@@ -409,14 +412,24 @@ available_node_types:
     requests:
       cpu: {{cpus}}
       memory: {{memory}}G
+      {% if k8s_resource_key is not none %}
+      # Number of requested google.com/tpu must be equal to the total
+      # number of available TPU chips on the TPU slice node either it
+      # being a node from multi-host TPU slice or single-host TPU
+      # slice. Example reference:
+      # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work
+      {{k8s_resource_key}}: {{accelerator_count}}
+      {% endif %}
      {% if k8s_fuse_device_required %}
      # Kubernetes resource exposed by the fuse device manager
      # https://gitlab.com/arm-research/smarter/smarter-device-manager
      smarter-devices/fuse: "1"
      {% endif %}
     limits:
+      # Limits need to be defined for GPU/TPU requests
+      {% if k8s_resource_key is not none %}
+      {{k8s_resource_key}}: {{accelerator_count}}
+      {% endif %}
      {% if k8s_fuse_device_required %}
      smarter-devices/fuse: "1"
      {% endif %}
@@ -451,6 +464,19 @@ setup_commands:
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
     mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
     [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
+    {% if tpu_requested %}
+    # The /tmp/tpu_logs directory is where TPU-related logs, such as logs from
+    # the TPU runtime, are written. These capture runtime information about the
+    # TPU execution, including any warnings, errors, or general activity of
+    # the TPU driver. By default, the /tmp/tpu_logs directory is created with
+    # 755 permissions, and the user of the provisioned pod is not necessarily
+    # a root. Hence, we need to update the write permission so the logs can be
+    # properly written.
+    # TODO(Doyoung): Investigate to see why TPU workload fails to run without
+    # execution permission, such as granting 766 to log file. Check if it's a
+    # must and see if there's a workaround to grant minimum permission.
+    - sudo chmod 777 /tmp/tpu_logs;
+    {% endif %}
 
   # Format: `REMOTE_PATH : LOCAL_PATH`
   file_mounts: {
sky/utils/kubernetes/generate_kubeconfig.sh
CHANGED
@@ -112,6 +112,9 @@ rules:
 - apiGroups: ["networking.k8s.io"] # Required for exposing services through ingresses
   resources: ["ingressclasses"]
   verbs: ["get", "list", "watch"]
+- apiGroups: [""] # Required for sky show-gpus command
+  resources: ["pods"]
+  verbs: ["get", "list"]
 ---
 # ClusterRoleBinding for the service account
 apiVersion: rbac.authorization.k8s.io/v1
sky/utils/kubernetes/gpu_labeler.py
CHANGED
@@ -101,7 +101,7 @@ def label():
     # Get the list of nodes with GPUs
     gpu_nodes = []
     for node in nodes:
-        if
+        if kubernetes_utils.GPU_RESOURCE_KEY in node.status.capacity:
             gpu_nodes.append(node)
 
     print(f'Found {len(gpu_nodes)} GPU nodes in the cluster')
@@ -142,7 +142,7 @@ def label():
     if len(gpu_nodes) == 0:
         print('No GPU nodes found in the cluster. If you have GPU nodes, '
               'please ensure that they have the label '
-              '`
+              f'`{kubernetes_utils.GPU_RESOURCE_KEY}: <number of GPUs>`')
     else:
         print('GPU labeling started - this may take 10 min or more to complete.'
               '\nTo check the status of GPU labeling jobs, run '
sky/utils/log_utils.py
CHANGED
@@ -1,7 +1,8 @@
 """Logging utils."""
 import enum
+import time
 import types
-from typing import List, Optional, Type
+from typing import Callable, Iterator, List, Optional, TextIO, Type
 
 import colorama
 import pendulum
@@ -284,3 +285,53 @@ def readable_time_duration(start: Optional[float],
         diff = diff.replace('hour', 'hr')
 
     return diff
+
+
+def follow_logs(
+    file: TextIO,
+    *,
+    should_stop: Callable[[], bool],
+    stop_on_eof: bool = False,
+    process_line: Optional[Callable[[str], Iterator[str]]] = None,
+    idle_timeout_seconds: Optional[int] = None,
+) -> Iterator[str]:
+    """Streams and processes logs line by line from a file.
+
+    Args:
+        file: File object to read logs from.
+        should_stop: Callback that returns True when streaming should stop.
+        stop_on_eof: If True, stop when reaching end of file.
+        process_line: Optional callback to transform/filter each line.
+        idle_timeout_seconds: If set, stop after these many seconds without
+            new content.
+
+    Yields:
+        Log lines, possibly transformed by process_line if provided.
+    """
+    current_line: str = ''
+    seconds_without_content: int = 0
+
+    while True:
+        content = file.readline()
+
+        if not content:
+            if stop_on_eof or should_stop():
+                break
+
+            if idle_timeout_seconds is not None:
+                if seconds_without_content >= idle_timeout_seconds:
+                    break
+                seconds_without_content += 1
+
+            time.sleep(1)
+            continue
+
+        seconds_without_content = 0
+        current_line += content
+
+        if '\n' in current_line or '\r' in current_line:
+            if process_line is not None:
+                yield from process_line(current_line)
+            else:
+                yield current_line
+            current_line = ''
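A minimal usage sketch of the new `log_utils.follow_logs` helper shown above (the log path and `job_finished` callback are hypothetical; in this release the real callers live in `sky/serve/serve_utils.py`):

    from sky.utils import log_utils

    def tail_launch_log(path: str, job_finished) -> None:
        """Print new lines from `path` until `job_finished()` returns True."""
        with open(path, 'r', newline='', encoding='utf-8') as f:
            for line in log_utils.follow_logs(f,
                                              should_stop=job_finished,
                                              stop_on_eof=False,
                                              idle_timeout_seconds=60):
                print(line, end='', flush=True)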
sky/utils/timeline.py
CHANGED
@@ -9,6 +9,7 @@ import json
 import os
 import threading
 import time
+import traceback
 from typing import Callable, Optional, Union
 
 import filelock
@@ -48,8 +49,9 @@ class Event:
             'ph': 'B',
             'ts': f'{time.time() * 10 ** 6: .3f}',
         })
+        event_begin['args'] = {'stack': '\n'.join(traceback.format_stack())}
         if self._message is not None:
-            event_begin['args']
+            event_begin['args']['message'] = self._message
         _events.append(event_begin)
 
     def end(self):
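The hunk above makes every begin ('B') timeline event carry the Python call stack in its args. Purely as an illustration of the resulting event shape (the event name and message are made up; the real dict is assembled inside `Event.begin`):

    import json
    import time
    import traceback

    event_begin = {
        'name': 'provision',  # hypothetical event name
        'ph': 'B',
        'ts': f'{time.time() * 10 ** 6: .3f}',
        # New in this release: the call stack at the time the event begins.
        'args': {'stack': '\n'.join(traceback.format_stack())},
    }
    # The optional message is now stored alongside the stack instead of
    # replacing the args dict.
    event_begin['args']['message'] = 'example message'
    print(json.dumps(event_begin)[:200])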
{skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: skypilot-nightly
-Version: 1.0.0.
+Version: 1.0.0.dev20241113
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
@@ -309,7 +309,7 @@ Runnable examples:
 - [LocalGPT](./llm/localgpt)
 - [Falcon](./llm/falcon)
 - Add yours here & see more in [`llm/`](./llm)!
-- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/
+- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2), [Airflow](./examples/airflow/training_workflow) and [many more (`examples/`)](./examples).
 
 Case Studies and Integrations: [Community Spotlights](https://blog.skypilot.co/community/)
 