skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Utility functions for generating instance links for cloud providers."""
|
|
2
|
+
from typing import Dict
|
|
3
|
+
|
|
4
|
+
from sky import sky_logging
|
|
5
|
+
from sky.provision import common
|
|
6
|
+
from sky.provision import constants as provision_constants
|
|
7
|
+
|
|
8
|
+
logger = sky_logging.init_logger(__name__)
|
|
9
|
+
|
|
10
|
+
# URL templates for each cloud provider
|
|
11
|
+
# Placeholders:
|
|
12
|
+
# {region} - Cloud region
|
|
13
|
+
# {project_id} - GCP project ID
|
|
14
|
+
# {subscription_id} - Azure subscription ID
|
|
15
|
+
# {resource_group} - Azure resource group
|
|
16
|
+
# {tag_key} - Tag key used to identify cluster instances
|
|
17
|
+
# {cluster_name} - Name of the cluster
|
|
18
|
+
|
|
19
|
+
# EC2 console instance list for a region, filtered by the cluster tag.
AWS_INSTANCES_URL = (
    'https://{region}.console.aws.amazon.com/ec2/v2/home'
    '?region={region}#Instances:tag:{tag_key}={cluster_name}')

# Azure doesn't support direct tag filter URLs, so we link to the resource group
AZURE_RESOURCE_GROUP_URL = ('https://portal.azure.com/#@/resource/subscriptions'
                            '/{subscription_id}'
                            '/resourceGroups/{resource_group}/overview')

# GCP Console base URL; the project and filter are appended as query
# parameters by the code that builds the final link.
GCP_INSTANCES_BASE_URL = 'https://console.cloud.google.com/compute/instances'
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _build_gcp_instances_url(project_id: str, tag_key: str,
                             cluster_name: str) -> str:
    """Return a GCP Console instances URL filtered by a cluster label.

    The Console keeps its list filter in the ``pageState`` query parameter.
    Decoded, the filter is a one-element JSON array:
        [{"k":"","t":10,"v":"\"label_key:label_value\"","s":true}]

    Fields:
        k: filter key (empty for label filters)
        t: filter type (10 = label filter)
        v: "label_key:label_value" wrapped in escaped quotes
        s: meaning unknown; always true

    Three encodings are mixed in the final URL:
        - standard URL encoding for the outer structure (%22 is ")
        - underscore notation inside the filter (_22 is ", _3A is :,
          _2C is a comma, _5C is a backslash)
        - double URL encoding for the brackets and braces
          (%255B is %5B is [, %257B is {, %257D is }, %255D is ])

    Args:
        project_id: GCP project that owns the instances.
        tag_key: Label key used to identify the cluster's instances.
        cluster_name: Label value to filter on.

    Returns:
        The instances page URL with the label filter applied.
    """
    # \"tag_key:cluster_name\" written in underscore notation.
    label_match = f'_5C_22{tag_key}_3A{cluster_name}_5C_22'

    # One entry per "key":value pair of the filter object; the pairs are
    # joined with _2C (a comma).
    encoded_pairs = [
        '_22k_22_3A_22_22',  # "k":""
        '_22t_22_3A10',  # "t":10
        f'_22v_22_3A_22{label_match}_22',  # "v":"<value>"
        '_22s_22_3Atrue',  # "s":true
    ]
    encoded_object = '_2C'.join(encoded_pairs)

    # [{...}] with double URL-encoded brackets and braces.
    encoded_filter = '%255B%257B' + encoded_object + '%257D%255D'

    # ("instances":("p":0,"f":"<filter>")), with %22 standing in for ".
    page_state = ('(%22instances%22:(%22p%22:0,%22f%22:%22' + encoded_filter +
                  '%22))')

    query = f'project={project_id}&pageState={page_state}'
    return f'{GCP_INSTANCES_BASE_URL}?{query}'
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def generate_instance_links(
    cluster_info: common.ClusterInfo,
    cluster_name: str,
) -> Dict[str, str]:
    """Build cloud-console links for the instances of a cluster.

    The links open filtered console views listing every instance of the
    cluster, which is handy for multi-node jobs. AWS and GCP get a
    tag/label-filtered instance list; Azure gets a link to the resource
    group.

    Args:
        cluster_info: ClusterInfo object; its provider name and provider
            config are read.
        cluster_name: Cluster name for tag-based filtering.

    Returns:
        Dictionary mapping link labels to URLs. The dictionary is empty
        when no link can be produced: Kubernetes or local clusters,
        providers other than AWS/GCP/Azure, or a provider config that lacks
        the fields the URL needs.
    """
    provider = cluster_info.provider_name.lower()
    config = cluster_info.provider_config or {}

    # Skip Kubernetes and other non-cloud providers
    if provider in ('kubernetes', 'local'):
        return {}

    # Tag used by SkyPilot to identify cluster instances
    tag_key = provision_constants.TAG_RAY_CLUSTER_NAME

    if provider == 'aws':
        region = config.get('region')
        if not region:
            logger.debug('AWS region not found in provider config, '
                         'skipping instance links')
            return {}
        aws_url = AWS_INSTANCES_URL.format(
            region=region,
            tag_key=tag_key,
            cluster_name=cluster_name,
        )
        return {'AWS Instances': aws_url}

    if provider == 'gcp':
        project_id = config.get('project_id')
        if not project_id:
            logger.debug('GCP project_id not found in provider config, '
                         'skipping instance links')
            return {}
        gcp_url = _build_gcp_instances_url(
            project_id=project_id,
            tag_key=tag_key,
            cluster_name=cluster_name,
        )
        return {'GCP Instances': gcp_url}

    if provider == 'azure':
        subscription_id = config.get('subscription_id')
        resource_group = config.get('resource_group')
        if not (subscription_id and resource_group):
            logger.debug('Azure subscription_id or resource_group not found '
                         'in provider config, skipping instance links')
            return {}
        azure_url = AZURE_RESOURCE_GROUP_URL.format(
            subscription_id=subscription_id,
            resource_group=resource_group,
        )
        return {'Azure Resource Group': azure_url}

    # Any other provider: no console link is known.
    return {}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Utilities for server-side interactive SSH functionality."""
|
|
2
|
+
import array
|
|
3
|
+
import socket
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_pty_socket_path(session_id: str) -> str:
    """Return the Unix socket path used to pass a session's PTY descriptor.

    Args:
        session_id: Identifier embedded in the socket file name.

    Returns:
        Path of the form /tmp/sky_pty_<session_id>.sock.
    """
    socket_name = f'sky_pty_{session_id}.sock'
    return '/tmp/' + socket_name
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def send_fd(sock: socket.socket, fd: int) -> None:
    """Pass an open file descriptor to the peer of a Unix socket.

    The descriptor is sent as SCM_RIGHTS ancillary data, the mechanism that
    lets one process send or receive a set of open file descriptors
    from another process.

    See:
        https://man7.org/linux/man-pages/man7/unix.7.html
        https://man7.org/linux/man-pages/man3/cmsg.3.html

    Args:
        sock: Connected Unix socket.
        fd: File descriptor to send.
    """
    # The descriptor packed as a single C int: the SCM_RIGHTS payload.
    fd_payload = array.array('i', [fd])
    rights_message = (socket.SOL_SOCKET, socket.SCM_RIGHTS, fd_payload)
    # Dummy data byte; the descriptor itself travels as ancillary data.
    dummy_data = [b'x']
    sock.sendmsg(dummy_data, [rights_message])
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def recv_fd(sock: socket.socket) -> int:
    """Receive file descriptor via Unix socket using SCM_RIGHTS.

    Only ancillary messages of level SOL_SOCKET and type SCM_RIGHTS are
    interpreted as descriptors; anything else is ignored.

    Args:
        sock: Connected Unix socket.

    Returns:
        Received file descriptor.

    Raises:
        RuntimeError: If no file descriptor was received.
    """
    received_fds = array.array('i')
    # NOTE: recvmsg() has no async equivalent
    _, ancdata, _, _ = sock.recvmsg(1,
                                    socket.CMSG_SPACE(received_fds.itemsize))
    for cmsg_level, cmsg_type, cmsg_data in ancdata:
        if (cmsg_level != socket.SOL_SOCKET or
                cmsg_type != socket.SCM_RIGHTS):
            # Not a descriptor-passing message; its payload is not an fd.
            continue
        # Keep only whole ints: truncated control data would otherwise make
        # array raise ValueError instead of the documented RuntimeError.
        whole_bytes = len(cmsg_data) - (len(cmsg_data) % received_fds.itemsize)
        received_fds.frombytes(cmsg_data[:whole_bytes])
    if not received_fds:
        raise RuntimeError('No file descriptor received - '
                           'sender may have closed connection')
    return received_fds[0]
|
|
@@ -12,20 +12,20 @@
|
|
|
12
12
|
# * Specify SKYPILOT_NAMESPACE env var to override the default namespace where the service account is created.
|
|
13
13
|
# * Specify SKYPILOT_SA_NAME env var to override the default service account name.
|
|
14
14
|
# * Specify SKIP_SA_CREATION=1 to skip creating the service account and use an existing one
|
|
15
|
-
# * Specify SUPER_USER=
|
|
15
|
+
# * Specify SUPER_USER=0 to create a service account with minimal permissions
|
|
16
16
|
#
|
|
17
17
|
# Usage:
|
|
18
|
-
# # Create "sky-sa" service account
|
|
18
|
+
# # Create "sky-sa" service account in "default" namespace and generate kubeconfig
|
|
19
19
|
# $ ./generate_kubeconfig.sh
|
|
20
20
|
#
|
|
21
|
-
# # Create "my-sa" service account
|
|
21
|
+
# # Create "my-sa" service account in "my-namespace" namespace and generate kubeconfig
|
|
22
22
|
# $ SKYPILOT_SA_NAME=my-sa SKYPILOT_NAMESPACE=my-namespace ./generate_kubeconfig.sh
|
|
23
23
|
#
|
|
24
24
|
# # Use an existing service account "my-sa" in "my-namespace" namespace and generate kubeconfig
|
|
25
25
|
# $ SKIP_SA_CREATION=1 SKYPILOT_SA_NAME=my-sa SKYPILOT_NAMESPACE=my-namespace ./generate_kubeconfig.sh
|
|
26
26
|
#
|
|
27
|
-
# # Create "sky-sa" service account with
|
|
28
|
-
# $ SUPER_USER=
|
|
27
|
+
# # Create "sky-sa" service account with minimal permissions in "default" namespace (manual setup may be required)
|
|
28
|
+
# $ SUPER_USER=0 ./generate_kubeconfig.sh
|
|
29
29
|
|
|
30
30
|
set -eu -o pipefail
|
|
31
31
|
|
|
@@ -33,11 +33,18 @@ set -eu -o pipefail
|
|
|
33
33
|
# use default.
|
|
34
34
|
SKYPILOT_SA=${SKYPILOT_SA_NAME:-sky-sa}
|
|
35
35
|
NAMESPACE=${SKYPILOT_NAMESPACE:-default}
|
|
36
|
-
SUPER_USER=${SUPER_USER:-
|
|
36
|
+
SUPER_USER=${SUPER_USER:-1}
|
|
37
37
|
|
|
38
|
-
echo "
|
|
39
|
-
echo "
|
|
40
|
-
echo "
|
|
38
|
+
echo "=========================================="
|
|
39
|
+
echo "SkyPilot Kubeconfig Generation"
|
|
40
|
+
echo "=========================================="
|
|
41
|
+
echo "Service Account: ${SKYPILOT_SA}"
|
|
42
|
+
echo "Namespace: ${NAMESPACE}"
|
|
43
|
+
if [ "${SUPER_USER}" != "1" ]; then
|
|
44
|
+
echo "Permissions: Minimal (manual setup may be required)"
|
|
45
|
+
SUPER_USER=0
|
|
46
|
+
fi
|
|
47
|
+
echo ""
|
|
41
48
|
|
|
42
49
|
# Set OS specific values.
|
|
43
50
|
if [[ "$OSTYPE" == "linux-gnu" ]]; then
|
|
@@ -53,7 +60,7 @@ fi
|
|
|
53
60
|
|
|
54
61
|
# If the user has set SKIP_SA_CREATION=1, skip creating the service account.
|
|
55
62
|
if [ -z ${SKIP_SA_CREATION+x} ]; then
|
|
56
|
-
echo "Creating
|
|
63
|
+
echo "[1/3] Creating Kubernetes Service Account and RBAC permissions..."
|
|
57
64
|
if [ "${SUPER_USER}" = "1" ]; then
|
|
58
65
|
# Create service account with cluster-admin permissions
|
|
59
66
|
kubectl apply -f - <<EOF
|
|
@@ -219,7 +226,8 @@ roleRef:
|
|
|
219
226
|
EOF
|
|
220
227
|
fi
|
|
221
228
|
# Apply optional ingress-related roles, but don't make the script fail if it fails
|
|
222
|
-
|
|
229
|
+
echo " → Applying optional ingress permissions (skipped if ingress-nginx not installed)..."
|
|
230
|
+
kubectl apply -f - 2>/dev/null <<EOF || true
|
|
223
231
|
# Optional: Role for accessing ingress resources
|
|
224
232
|
apiVersion: rbac.authorization.k8s.io/v1
|
|
225
233
|
kind: Role
|
|
@@ -253,8 +261,13 @@ roleRef:
|
|
|
253
261
|
name: ${SKYPILOT_SA}-role-ingress-nginx # Use the same name as the role at line 119
|
|
254
262
|
apiGroup: rbac.authorization.k8s.io
|
|
255
263
|
EOF
|
|
264
|
+
else
|
|
265
|
+
echo "[1/3] Skipping service account creation (using existing account)..."
|
|
256
266
|
fi
|
|
257
267
|
|
|
268
|
+
echo ""
|
|
269
|
+
echo "[2/3] Creating service account token..."
|
|
270
|
+
|
|
258
271
|
# Checks if secret entry was defined for Service account. If defined it means that Kubernetes server has a
|
|
259
272
|
# version below 1.24, otherwise one must manually create the secret and bind it to the Service account to have a non expiring token.
|
|
260
273
|
# After Kubernetes v1.24 Service accounts no longer generate automatic tokens/secrets.
|
|
@@ -293,7 +306,9 @@ CURRENT_CONTEXT=$(kubectl config current-context)
|
|
|
293
306
|
CURRENT_CLUSTER=$(kubectl config view -o jsonpath="{.contexts[?(@.name == \"${CURRENT_CONTEXT}\"})].context.cluster}")
|
|
294
307
|
CURRENT_CLUSTER_ADDR=$(kubectl config view -o jsonpath="{.clusters[?(@.name == \"${CURRENT_CLUSTER}\"})].cluster.server}")
|
|
295
308
|
|
|
296
|
-
echo "
|
|
309
|
+
echo ""
|
|
310
|
+
echo "[3/3] Generating kubeconfig file..."
|
|
311
|
+
|
|
297
312
|
cat > kubeconfig <<EOF
|
|
298
313
|
apiVersion: v1
|
|
299
314
|
clusters:
|
|
@@ -316,24 +331,18 @@ users:
|
|
|
316
331
|
token: ${SA_TOKEN}
|
|
317
332
|
EOF
|
|
318
333
|
|
|
319
|
-
echo "
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
Also add this to your ~/.sky/config.yaml to use the new service account:
|
|
335
|
-
|
|
336
|
-
# ~/.sky/config.yaml
|
|
337
|
-
kubernetes:
|
|
338
|
-
remote_identity: ${SKYPILOT_SA}
|
|
339
|
-
"
|
|
334
|
+
echo ""
|
|
335
|
+
echo "=========================================="
|
|
336
|
+
echo "✓ SUCCESS!"
|
|
337
|
+
echo "=========================================="
|
|
338
|
+
echo ""
|
|
339
|
+
echo "Kubeconfig file created successfully!"
|
|
340
|
+
echo ""
|
|
341
|
+
echo " Service Account: ${SKYPILOT_SA}"
|
|
342
|
+
echo " Namespace: ${NAMESPACE}"
|
|
343
|
+
echo " Location: $(pwd)/kubeconfig"
|
|
344
|
+
echo ""
|
|
345
|
+
echo "Next steps:"
|
|
346
|
+
echo " Refer to this page for setting up the credential for remote API server:"
|
|
347
|
+
echo " https://docs.skypilot.co/en/latest/reference/api-server/api-server-admin-deploy.html#optional-configure-cloud-accounts"
|
|
348
|
+
echo ""
|
|
@@ -1,13 +1,11 @@
|
|
|
1
|
-
"""Utility functions for deploying Kubernetes clusters."""
|
|
1
|
+
"""Utility functions for deploying local Kubernetes kind clusters."""
|
|
2
2
|
import os
|
|
3
3
|
import random
|
|
4
4
|
import shlex
|
|
5
5
|
import subprocess
|
|
6
6
|
import tempfile
|
|
7
7
|
import textwrap
|
|
8
|
-
from typing import
|
|
9
|
-
|
|
10
|
-
import colorama
|
|
8
|
+
from typing import Optional, Tuple
|
|
11
9
|
|
|
12
10
|
from sky import check as sky_check
|
|
13
11
|
from sky import sky_logging
|
|
@@ -20,7 +18,6 @@ from sky.utils import log_utils
|
|
|
20
18
|
from sky.utils import rich_utils
|
|
21
19
|
from sky.utils import subprocess_utils
|
|
22
20
|
from sky.utils import ux_utils
|
|
23
|
-
from sky.utils.kubernetes import deploy_ssh_node_pools
|
|
24
21
|
|
|
25
22
|
logger = sky_logging.init_logger(__name__)
|
|
26
23
|
|
|
@@ -32,95 +29,6 @@ LOCAL_CLUSTER_INTERNAL_PORT_START = 30000
|
|
|
32
29
|
LOCAL_CLUSTER_INTERNAL_PORT_END = 30099
|
|
33
30
|
|
|
34
31
|
|
|
35
|
-
def check_ssh_cluster_dependencies(
|
|
36
|
-
raise_error: bool = True) -> Optional[List[str]]:
|
|
37
|
-
"""Checks if the dependencies for ssh cluster are installed.
|
|
38
|
-
|
|
39
|
-
Args:
|
|
40
|
-
raise_error: set to true when the dependency needs to be present.
|
|
41
|
-
set to false for `sky check`, where reason strings are compiled
|
|
42
|
-
at the end.
|
|
43
|
-
|
|
44
|
-
Returns: the reasons list if there are missing dependencies.
|
|
45
|
-
"""
|
|
46
|
-
# error message
|
|
47
|
-
jq_message = ('`jq` is required to setup ssh cluster.')
|
|
48
|
-
|
|
49
|
-
# save
|
|
50
|
-
reasons = []
|
|
51
|
-
required_binaries = []
|
|
52
|
-
|
|
53
|
-
# Ensure jq is installed
|
|
54
|
-
try:
|
|
55
|
-
subprocess.run(['jq', '--version'],
|
|
56
|
-
stdout=subprocess.DEVNULL,
|
|
57
|
-
stderr=subprocess.DEVNULL,
|
|
58
|
-
check=True)
|
|
59
|
-
except (FileNotFoundError, subprocess.CalledProcessError):
|
|
60
|
-
required_binaries.append('jq')
|
|
61
|
-
reasons.append(jq_message)
|
|
62
|
-
|
|
63
|
-
if required_binaries:
|
|
64
|
-
reasons.extend([
|
|
65
|
-
'On Debian/Ubuntu, install the missing dependenc(ies) with:',
|
|
66
|
-
f' $ sudo apt install {" ".join(required_binaries)}',
|
|
67
|
-
'On MacOS, install with: ',
|
|
68
|
-
f' $ brew install {" ".join(required_binaries)}',
|
|
69
|
-
])
|
|
70
|
-
if raise_error:
|
|
71
|
-
with ux_utils.print_exception_no_traceback():
|
|
72
|
-
raise RuntimeError('\n'.join(reasons))
|
|
73
|
-
return reasons
|
|
74
|
-
return None
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
def deploy_ssh_cluster(cleanup: bool = False,
|
|
78
|
-
infra: Optional[str] = None,
|
|
79
|
-
kubeconfig_path: Optional[str] = None):
|
|
80
|
-
"""Deploy a Kubernetes cluster on SSH targets.
|
|
81
|
-
|
|
82
|
-
This function reads ~/.sky/ssh_node_pools.yaml and uses it to deploy a
|
|
83
|
-
Kubernetes cluster on the specified machines.
|
|
84
|
-
|
|
85
|
-
Args:
|
|
86
|
-
cleanup: Whether to clean up the cluster instead of deploying.
|
|
87
|
-
infra: Name of the cluster in ssh_node_pools.yaml to use.
|
|
88
|
-
If None, the first cluster in the file will be used.
|
|
89
|
-
kubeconfig_path: Path to save the Kubernetes configuration file.
|
|
90
|
-
If None, the default ~/.kube/config will be used.
|
|
91
|
-
"""
|
|
92
|
-
check_ssh_cluster_dependencies()
|
|
93
|
-
|
|
94
|
-
action = 'Cleanup' if cleanup else 'Deployment'
|
|
95
|
-
msg_str = f'Initializing SSH Node Pools {action}...'
|
|
96
|
-
|
|
97
|
-
with rich_utils.safe_status(ux_utils.spinner_message(msg_str)):
|
|
98
|
-
try:
|
|
99
|
-
deploy_ssh_node_pools.deploy_clusters(
|
|
100
|
-
infra=infra, cleanup=cleanup, kubeconfig_path=kubeconfig_path)
|
|
101
|
-
except Exception as e: # pylint: disable=broad-except
|
|
102
|
-
logger.error(str(e))
|
|
103
|
-
with ux_utils.print_exception_no_traceback():
|
|
104
|
-
raise RuntimeError(
|
|
105
|
-
'Failed to deploy SkyPilot on some Node Pools.') from e
|
|
106
|
-
|
|
107
|
-
logger.info('')
|
|
108
|
-
if cleanup:
|
|
109
|
-
logger.info(
|
|
110
|
-
ux_utils.finishing_message(
|
|
111
|
-
'🎉 SSH Node Pools cleaned up successfully.'))
|
|
112
|
-
else:
|
|
113
|
-
logger.info(
|
|
114
|
-
ux_utils.finishing_message(
|
|
115
|
-
'🎉 SSH Node Pools set up successfully. ',
|
|
116
|
-
follow_up_message=(
|
|
117
|
-
f'Run `{colorama.Style.BRIGHT}'
|
|
118
|
-
f'sky check ssh'
|
|
119
|
-
f'{colorama.Style.RESET_ALL}` to verify access, '
|
|
120
|
-
f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
|
|
121
|
-
f'{colorama.Style.RESET_ALL}` to launch a cluster.')))
|
|
122
|
-
|
|
123
|
-
|
|
124
32
|
def generate_kind_config(port_start: int,
|
|
125
33
|
num_nodes: int = 1,
|
|
126
34
|
gpus: bool = False) -> str:
|
|
@@ -60,4 +60,8 @@ fi
|
|
|
60
60
|
# We wrap the command in a bash script that waits for rsync, then execs the original command.
|
|
61
61
|
# Timeout after MAX_WAIT_TIME_SECONDS seconds.
|
|
62
62
|
MAX_WAIT_TIME_SECONDS=300
|
|
63
|
-
|
|
63
|
+
MAX_WAIT_COUNT=$((MAX_WAIT_TIME_SECONDS * 2))
|
|
64
|
+
# Use --norc --noprofile to prevent bash from sourcing startup files that might
|
|
65
|
+
# output to stdout and corrupt the rsync protocol. All debug output must go to
|
|
66
|
+
# stderr (>&2) to keep stdout clean for rsync communication.
|
|
67
|
+
eval "${kubectl_cmd_base% --} -i -- bash --norc --noprofile -c 'count=0; until which rsync >/dev/null 2>&1; do if [ \$count -ge $MAX_WAIT_COUNT ]; then echo \"Error when trying to rsync files to kubernetes cluster. Package installation may have failed.\" >&2; exit 1; fi; sleep 0.5; count=\$((count+1)); done; exec \"\$@\"' -- \"\$@\""
|