skypilot-nightly 1.0.0.dev20250720__py3-none-any.whl → 1.0.0.dev20250724__py3-none-any.whl
This diff compares the contents of publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/admin_policy.py +11 -4
- sky/backends/backend_utils.py +27 -11
- sky/backends/cloud_vm_ray_backend.py +22 -27
- sky/client/cli/command.py +44 -28
- sky/client/sdk.py +52 -7
- sky/client/sdk.pyi +296 -0
- sky/clouds/nebius.py +2 -5
- sky/clouds/utils/oci_utils.py +16 -40
- sky/clouds/vast.py +2 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/BURfWrKsQk9psMPv0OXrh/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1141-d8c6404a7c6fffe6.js → 1141-e49a159c30a6c4a7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +30 -0
- sky/dashboard/out/_next/static/chunks/{1871-a821dcaaae2a3823.js → 1871-ea0e7283886407ca.js} +2 -2
- sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +15 -0
- sky/dashboard/out/_next/static/chunks/{2641.5233e938f14e31a7.js → 2641.74c19c4d45a2c034.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +16 -0
- sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +15 -0
- sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +1 -0
- sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +55 -0
- sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +1 -0
- sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +41 -0
- sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +6 -0
- sky/dashboard/out/_next/static/chunks/{938-63fc419cb82ad9b3.js → 938-7ee806653aef0609.js} +1 -1
- sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +30 -0
- sky/dashboard/out/_next/static/chunks/{9984.2b5e3fa69171bff9.js → 9984.0460de9d3adf5582.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa406155b4223d0d.js → [job]-2186770cc2de1623.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0c37ee1ac5f3474d.js → [cluster]-95afb019ab85801c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-c5b357bfd9502fbe.js → [job]-dc0299ffefebcdbe.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-19e98664bdd61643.js → users-6790fcefd5487b13.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-b6447da22305b14a.js +1 -0
- sky/dashboard/out/_next/static/css/b3227360726f12eb.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +93 -32
- sky/exceptions.py +8 -0
- sky/global_user_state.py +2 -3
- sky/jobs/state.py +2 -2
- sky/logs/__init__.py +4 -0
- sky/logs/agent.py +14 -0
- sky/logs/aws.py +276 -0
- sky/provision/nebius/utils.py +3 -6
- sky/server/common.py +9 -4
- sky/server/requests/payloads.py +20 -4
- sky/server/rest.py +6 -0
- sky/server/server.py +2 -1
- sky/setup_files/MANIFEST.in +1 -1
- sky/setup_files/alembic.ini +0 -4
- sky/skylet/constants.py +4 -0
- sky/skypilot_config.py +5 -31
- sky/utils/common_utils.py +8 -3
- sky/utils/config_utils.py +17 -0
- sky/utils/db/migration_utils.py +44 -4
- sky/utils/locks.py +319 -0
- sky/utils/rich_utils.py +2 -3
- sky/utils/schemas.py +92 -56
- sky/utils/timeline.py +41 -0
- {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/RECORD +88 -86
- sky/dashboard/out/_next/static/chunks/1746.27d40aedc22bd2d6.js +0 -60
- sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +0 -1
- sky/dashboard/out/_next/static/chunks/2875.c24c6d57dc82e436.js +0 -25
- sky/dashboard/out/_next/static/chunks/3785.95b94f18aaec7233.js +0 -1
- sky/dashboard/out/_next/static/chunks/3947-b059261d6fa88a1f.js +0 -35
- sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.c7c055a5c2814f33.js +0 -16
- sky/dashboard/out/_next/static/chunks/5491.918ffed0ba7a5294.js +0 -20
- sky/dashboard/out/_next/static/chunks/6990-dcb411b566e64cde.js +0 -1
- sky/dashboard/out/_next/static/chunks/804-9f5e98ce84d46bdd.js +0 -21
- sky/dashboard/out/_next/static/chunks/9025.133e9ba5c780afeb.js +0 -6
- sky/dashboard/out/_next/static/chunks/9470-8178183f3bae198f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.46e613d000c55859.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-507712f30cd3cec3.js +0 -20
- sky/dashboard/out/_next/static/chunks/pages/clusters-102d169e87913ba1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-a1e43d9ef51a9cea.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-26cdc782eed15a7d.js +0 -1
- sky/dashboard/out/_next/static/css/5122cb0a08486fd3.css +0 -3
- sky/dashboard/out/_next/static/pTQKG61ng32Zc7gsAROFJ/_buildManifest.js +0 -1
- sky/schemas/db/skypilot_config/001_initial_schema.py +0 -30
- /sky/dashboard/out/_next/static/{pTQKG61ng32Zc7gsAROFJ → BURfWrKsQk9psMPv0OXrh}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/top_level.txt +0 -0
sky/logs/aws.py
ADDED
@@ -0,0 +1,276 @@
+"""AWS CloudWatch logging agent."""
+
+from typing import Any, Dict, Optional
+
+import pydantic
+
+from sky.logs.agent import FluentbitAgent
+from sky.skylet import constants
+from sky.utils import common_utils
+from sky.utils import resources_utils
+
+
+class _CloudwatchLoggingConfig(pydantic.BaseModel):
+    """Configuration for AWS CloudWatch logging agent."""
+    region: Optional[str] = None
+    credentials_file: Optional[str] = None
+    log_group_name: str = 'skypilot-logs'
+    log_stream_prefix: str = 'skypilot-'
+    auto_create_group: bool = True
+    additional_tags: Optional[Dict[str, str]] = None
+
+
+class _CloudWatchOutputConfig(pydantic.BaseModel):
+    """Auxiliary model for building CloudWatch output config in YAML.
+
+    Ref: https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch
+    """
+    name: str = 'cloudwatch_logs'
+    match: str = '*'
+    region: Optional[str] = None
+    log_group_name: Optional[str] = None
+    log_stream_prefix: Optional[str] = None
+    auto_create_group: bool = True
+    additional_tags: Optional[Dict[str, str]] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        config = self.model_dump(exclude_none=True)
+        if 'auto_create_group' in config:
+            config['auto_create_group'] = 'true' if config[
+                'auto_create_group'] else 'false'
+        return config
+
+
+class CloudwatchLoggingAgent(FluentbitAgent):
+    """AWS CloudWatch logging agent.
+
+    This agent forwards logs from SkyPilot clusters to AWS CloudWatch using
+    Fluent Bit. It supports authentication via IAM roles (preferred), AWS
+    credentials file, or environment variables.
+
+    Example configuration:
+    ```yaml
+    logs:
+      store: aws
+      aws:
+        region: us-west-2
+        log_group_name: skypilot-logs
+        log_stream_prefix: my-cluster-
+        auto_create_group: true
+    ```
+    """
+
+    def __init__(self, config: Dict[str, Any]):
+        """Initialize the CloudWatch logging agent.
+
+        Args:
+            config: The configuration for the CloudWatch logging agent.
+                See the class docstring for the expected format.
+        """
+        self.config = _CloudwatchLoggingConfig(**config)
+        super().__init__()
+
+    def get_setup_command(self,
+                          cluster_name: resources_utils.ClusterName) -> str:
+        """Get the command to set up the CloudWatch logging agent.
+
+        Args:
+            cluster_name: The name of the cluster.
+
+        Returns:
+            The command to set up the CloudWatch logging agent.
+        """
+
+        if self.config.credentials_file:
+            credential_path = self.config.credentials_file
+
+        # Set AWS credentials and check whether credentials are valid.
+        # CloudWatch plugin supports IAM roles, credentials file, and
+        # environment variables. We prefer IAM roles when available
+        # (on EC2 instances). If credentials file is provided, we use
+        # it. Otherwise, we check if credentials are available in
+        # the environment.
+        pre_cmd = ''
+        if self.config.credentials_file:
+            pre_cmd = (
+                f'export AWS_SHARED_CREDENTIALS_FILE={credential_path}; '
+                f'if [ ! -f {credential_path} ]; then '
+                f'echo "ERROR: AWS credentials file {credential_path} '
+                f'not found. Please check if the file exists and is '
+                f'accessible." && exit 1; '
+                f'fi; '
+                f'if ! grep -q "\\[.*\\]" {credential_path} || '
+                f'! grep -q "aws_access_key_id" {credential_path}; then '
+                f'echo "ERROR: AWS credentials file {credential_path} is '
+                f'invalid. It should contain a profile section '
+                f'[profile_name] and aws_access_key_id." && exit 1; '
+                f'fi;')
+        else:
+            # Check if we're running on EC2 with an IAM role or if
+            # AWS credentials are available in the environment
+            pre_cmd = (
+                'if ! curl -s -m 1 http://169.254.169.254'
+                '/latest/meta-data/iam/security-credentials/ > /dev/null; '
+                'then '
+                # failed EC2 check, look for env vars
+                'if [ -z "$AWS_ACCESS_KEY_ID" ] || '
+                '[ -z "$AWS_SECRET_ACCESS_KEY" ]; then '
+                'echo "ERROR: AWS CloudWatch logging configuration error. '
+                'Not running on EC2 with IAM role and AWS credentials not '
+                'found in environment. Please do one of the following: '
+                '1. Run on an EC2 instance with an IAM role that has '
+                'CloudWatch permissions, 2. Set AWS_ACCESS_KEY_ID and '
+                'AWS_SECRET_ACCESS_KEY environment variables, or '
+                '3. Provide a credentials file via logs.aws.credentials_file '
+                'in SkyPilot config." && exit 1; '
+                'fi; '
+                'fi;')
+
+        # If region is specified, set it in the environment
+        if self.config.region:
+            pre_cmd += f' export AWS_REGION={self.config.region};'
+        else:
+            # If region is not specified, check if it's available in
+            # the environment or credentials file
+            pre_cmd += (
+                ' if [ -z "$AWS_REGION" ] && '
+                '[ -z "$AWS_DEFAULT_REGION" ]; then '
+                'echo "WARNING: AWS region not specified in configuration or '
+                'environment. CloudWatch logging may fail if the region '
+                'cannot be determined. Consider setting logs.aws.region in '
+                'SkyPilot config."; '
+                'fi; ')
+
+        # Add a test command to verify AWS credentials work with CloudWatch
+        pre_cmd += (
+            ' echo "Verifying AWS CloudWatch access..."; '
+            'if command -v aws > /dev/null; then '
+            'aws cloudwatch list-metrics --namespace AWS/Logs --max-items 1 '
+            '> /dev/null 2>&1 || '
+            '{ echo "ERROR: Failed to access AWS CloudWatch. Please check '
+            'your credentials and permissions."; '
+            'echo "The IAM role or user must have cloudwatch:ListMetrics '
+            'and logs:* permissions."; '
+            'exit 1; }; '
+            'else echo "AWS CLI not installed, skipping CloudWatch access '
+            'verification."; '
+            'fi; ')
+
+        return pre_cmd + ' ' + super().get_setup_command(cluster_name)
+
+    def fluentbit_config(self,
+                         cluster_name: resources_utils.ClusterName) -> str:
+        """Get the Fluent Bit configuration for CloudWatch.
+
+        This overrides the base method to add a fallback output for local file
+        logging in case CloudWatch logging fails.
+
+        Args:
+            cluster_name: The name of the cluster.
+
+        Returns:
+            The Fluent Bit configuration as a YAML string.
+        """
+        display_name = cluster_name.display_name
+        unique_name = cluster_name.name_on_cloud
+        # Build tags for the log stream
+        tags = {
+            'skypilot.cluster_name': display_name,
+            'skypilot.cluster_id': unique_name,
+        }
+
+        # Add additional tags if provided
+        if self.config.additional_tags:
+            tags.update(self.config.additional_tags)
+
+        log_processors = []
+        for key, value in tags.items():
+            log_processors.append({
+                'name': 'content_modifier',
+                'action': 'upsert',
+                'key': key,
+                'value': value
+            })
+
+        cfg_dict = {
+            'pipeline': {
+                'inputs': [{
+                    'name': 'tail',
+                    'path': f'{constants.SKY_LOGS_DIRECTORY}/*/*.log',
+                    'path_key': 'log_path',
+                    # Shorten the refresh interval from 60s to 1s since every
+                    # job creates a new log file and we must be responsive
+                    # for this: the VM might be autodown within a minute
+                    # right after the job completion.
+                    'refresh_interval': 1,
+                    'processors': {
+                        'logs': log_processors,
+                    }
+                }],
+                'outputs': [self.fluentbit_output_config(cluster_name)],
+            }
+        }
+
+        # Add fallback outputs for graceful failure handling
+        cfg_dict = self.add_fallback_outputs(cfg_dict)
+
+        return common_utils.dump_yaml_str(cfg_dict)
+
+    def add_fallback_outputs(self, cfg_dict: Dict[str, Any]) -> Dict[str, Any]:
+        """Add fallback outputs to the Fluent Bit configuration.
+
+        This adds a local file output as a fallback in case
+        CloudWatch logging fails.
+
+        Args:
+            cfg_dict: The Fluent Bit configuration dictionary.
+
+        Returns:
+            The updated configuration dictionary.
+        """
+        # Add a local file output as a fallback
+        fallback_output = {
+            'name': 'file',
+            'match': '*',
+            'path': '/tmp/skypilot_logs_fallback.log',
+            'format': 'out_file',
+        }
+
+        # Add the fallback output to the configuration
+        cfg_dict['pipeline']['outputs'].append(fallback_output)
+
+        return cfg_dict
+
+    def fluentbit_output_config(
+            self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
+        """Get the Fluent Bit output configuration for CloudWatch.
+
+        Args:
+            cluster_name: The name of the cluster.
+
+        Returns:
+            The Fluent Bit output configuration for CloudWatch.
+        """
+        unique_name = cluster_name.name_on_cloud
+
+        # Format the log stream name to include cluster information
+        # This helps with identifying logs in CloudWatch
+        log_stream_prefix = f'{self.config.log_stream_prefix}{unique_name}-'
+
+        # Create the CloudWatch output configuration with error handling options
+        return _CloudWatchOutputConfig(
+            region=self.config.region,
+            log_group_name=self.config.log_group_name,
+            log_stream_prefix=log_stream_prefix,
+            auto_create_group=self.config.auto_create_group,
+        ).to_dict()
+
+    def get_credential_file_mounts(self) -> Dict[str, str]:
+        """Get the credential file mounts for the CloudWatch logging agent.
+
+        Returns:
+            A dictionary mapping local credential file paths to remote paths.
+        """
+        if self.config.credentials_file:
+            return {self.config.credentials_file: self.config.credentials_file}
+        return {}
sky/provision/nebius/utils.py
CHANGED
@@ -41,10 +41,7 @@ def get_project_by_region(region: str) -> str:
 
     # Check is there project if in config
     project_id = skypilot_config.get_effective_region_config(
-        cloud='nebius',
-        region=None,
-        keys=(region, 'project_id'),
-        default_value=None)
+        cloud='nebius', region=region, keys=('project_id',), default_value=None)
     if project_id is not None:
         return project_id
     for project in projects.items:
@@ -189,8 +186,8 @@ def launch(cluster_name_on_cloud: str,
     if preset == '8gpu-128vcpu-1600gb':
         fabric = skypilot_config.get_effective_region_config(
             cloud='nebius',
-            region=None,
-            keys=(region, 'fabric'),
+            region=region,
+            keys=('fabric',),
             default_value=None)
 
     # Auto-select fabric if network_tier=best and no fabric configured
sky/server/common.py
CHANGED
@@ -132,6 +132,8 @@ def get_api_cookie_jar() -> requests.cookies.RequestsCookieJar:
 def set_api_cookie_jar(cookie_jar: CookieJar,
                        create_if_not_exists: bool = True) -> None:
     """Updates the file cookie jar with the given cookie jar."""
+    if len(cookie_jar) == 0:
+        return
     cookie_path = get_api_cookie_jar_path()
     if not cookie_path.exists() and not create_if_not_exists:
         # if the file doesn't exist and we don't want to create it, do nothing
@@ -252,8 +254,9 @@ def get_dashboard_url(server_url: str,
 
 
 @annotations.lru_cache(scope='global')
-def is_api_server_local():
-    return get_server_url() in AVAILABLE_LOCAL_API_SERVER_URLS
+def is_api_server_local(endpoint: Optional[str] = None):
+    server_url = endpoint if endpoint is not None else get_server_url()
+    return server_url in AVAILABLE_LOCAL_API_SERVER_URLS
 
 
 def _handle_non_200_server_status(
@@ -350,7 +353,9 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
                 error=version_info.error)
 
         cookies = get_cookies_from_response(response)
-
+        # Save or refresh the cookie jar in case of session affinity and
+        # OAuth.
+        set_api_cookie_jar(cookies, create_if_not_exists=True)
         return server_info
     except (json.JSONDecodeError, AttributeError) as e:
         # Try to check if we got redirected to a login page.
@@ -566,7 +571,7 @@ def check_server_healthy(
     api_server_status = api_server_info.status
     if api_server_status == ApiServerStatus.VERSION_MISMATCH:
         msg = api_server_info.error
-        if is_api_server_local():
+        if is_api_server_local(endpoint):
             # For local server, just hint user to restart the server to get
             # a consistent version.
             msg = _LOCAL_API_SERVER_RESTART_HINT
sky/server/requests/payloads.py
CHANGED
@@ -203,17 +203,33 @@ class DagRequestBody(RequestBody):
         return kwargs
 
 
-class ValidateBody(DagRequestBody):
+class DagRequestBodyWithRequestOptions(DagRequestBody):
+    """Request body base class for endpoints with a dag and request options."""
+    request_options: Optional[admin_policy.RequestOptions]
+
+    def get_request_options(self) -> Optional[admin_policy.RequestOptions]:
+        """Get the request options."""
+        if self.request_options is None:
+            return None
+        if isinstance(self.request_options, dict):
+            return admin_policy.RequestOptions(**self.request_options)
+        return self.request_options
+
+    def to_kwargs(self) -> Dict[str, Any]:
+        kwargs = super().to_kwargs()
+        kwargs['request_options'] = self.get_request_options()
+        return kwargs
+
+
+class ValidateBody(DagRequestBodyWithRequestOptions):
     """The request body for the validate endpoint."""
     dag: str
-    request_options: Optional[admin_policy.RequestOptions]
 
 
-class OptimizeBody(DagRequestBody):
+class OptimizeBody(DagRequestBodyWithRequestOptions):
     """The request body for the optimize endpoint."""
     dag: str
     minimize: common_lib.OptimizeTarget = common_lib.OptimizeTarget.COST
-    request_options: Optional[admin_policy.RequestOptions]
 
 
 class LaunchBody(RequestBody):
sky/server/rest.py
CHANGED
@@ -89,6 +89,12 @@ def retry_transient_errors(max_retries: int = 3,
         for retry_cnt in range(max_retries):
             try:
                 return func(*args, **kwargs)
+            # Occurs when the server proactively interrupts the request
+            # during rolling update, we can retry immediately on the
+            # new replica.
+            except exceptions.RequestInterruptedError:
+                logger.debug('Request interrupted. Retry immediately.')
+                continue
             except Exception as e:  # pylint: disable=broad-except
                 if retry_cnt >= max_retries - 1:
                     # Retries exhausted.
sky/server/server.py
CHANGED
@@ -827,7 +827,8 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
     # added RTTs. For now, we stick to doing the validation inline in the
     # server thread.
     with admin_policy_utils.apply_and_use_config_in_current_request(
-            dag,
+            dag,
+            request_options=validate_body.get_request_options()) as dag:
         # Skip validating workdir and file_mounts, as those need to be
         # validated after the files are uploaded to the SkyPilot API server
         # with `upload_mounts_to_api_server`.
sky/setup_files/MANIFEST.in
CHANGED
sky/setup_files/alembic.ini
CHANGED
@@ -94,10 +94,6 @@ version_table = alembic_version_state_db
 version_locations = %(here)s/../schemas/db/spot_jobs
 version_table = alembic_version_spot_jobs_db
 
-[sky_config_db]
-version_locations = %(here)s/../schemas/db/skypilot_config
-version_table = alembic_version_sky_config_db
-
 [post_write_hooks]
 # post_write_hooks defines scripts or Python functions that are run
 # on newly generated revision scripts. See the documentation for further
sky/skylet/constants.py
CHANGED
@@ -1,4 +1,5 @@
 """Constants for SkyPilot."""
+import os
 from typing import List, Tuple
 
 from packaging import version
@@ -491,3 +492,6 @@ DEFAULT_PRIORITY = 0
 
 GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
 COST_REPORT_DEFAULT_DAYS = 30
+
+# The directory for file locks.
+SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
sky/skypilot_config.py
CHANGED
@@ -58,10 +58,8 @@ import threading
 import typing
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 
-from alembic import command as alembic_command
 import filelock
 import sqlalchemy
-from sqlalchemy import exc as sqlalchemy_exc
 from sqlalchemy import orm
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.dialects import sqlite
@@ -78,7 +76,6 @@ from sky.utils import context
 from sky.utils import schemas
 from sky.utils import ux_utils
 from sky.utils.db import db_utils
-from sky.utils.db import migration_utils
 from sky.utils.kubernetes import config_map_utils
 
 if typing.TYPE_CHECKING:
@@ -574,17 +571,11 @@ def _reload_config_as_server() -> None:
             'if db config is specified, no other config is allowed')
 
     if db_url:
-        with
+        with _DB_USE_LOCK:
             sqlalchemy_engine = sqlalchemy.create_engine(db_url,
                                                          poolclass=NullPool)
-
-
-            alembic_config = migration_utils.get_alembic_config(
-                sqlalchemy_engine, migration_utils.SKYPILOT_CONFIG_DB_NAME)
-            # pylint: disable=line-too-long
-            alembic_config.config_ini_section = migration_utils.SKYPILOT_CONFIG_DB_NAME
-            alembic_command.upgrade(alembic_config,
-                                    migration_utils.SKYPILOT_CONFIG_VERSION)
+            db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
+                                                 sqlalchemy_engine)
 
 def _get_config_yaml_from_db(
         key: str) -> Optional[config_utils.Config]:
@@ -872,25 +863,8 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
     with _DB_USE_LOCK:
         sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
                                                      poolclass=NullPool)
-
-
-        alembic_config = migration_utils.get_alembic_config(
-            sqlalchemy_engine, 'sky_config_db')
-        alembic_config.config_ini_section = 'sky_config_db'
-        try:
-            alembic_command.upgrade(alembic_config, '001')
-        except (sqlalchemy_exc.IntegrityError,
-                sqlalchemy_exc.OperationalError) as e:
-            # If the version already exists (due to concurrent
-            # initialization), we can safely ignore this error
-            if ('UNIQUE constraint failed: '
-                'alembic_version_sky_config_db.version_num'
-                in str(e) or
-                'table alembic_version_sky_config_db already exists'
-                in str(e)):
-                pass
-            else:
-                raise
+        db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
+                                             sqlalchemy_engine)
 
 def _set_config_yaml_to_db(key: str,
                            config: config_utils.Config):
sky/utils/common_utils.py
CHANGED
@@ -562,8 +562,9 @@ def read_yaml_all(path: str) -> List[Dict[str, Any]]:
         return read_yaml_all_str(f.read())
 
 
-def dump_yaml(path: str, config: Union[List[Dict[str, Any]],
-                                       Dict[str, Any]]) -> None:
+def dump_yaml(path: str,
+              config: Union[List[Dict[str, Any]], Dict[str, Any]],
+              blank: bool = False) -> None:
     """Dumps a YAML file.
 
     Args:
@@ -571,7 +572,11 @@ def dump_yaml(path: str, config: Union[List[Dict[str, Any]],
         config: the configuration to dump.
     """
     with open(path, 'w', encoding='utf-8') as f:
-        f.write(dump_yaml_str(config))
+        contents = dump_yaml_str(config)
+        if blank and isinstance(config, dict) and len(config) == 0:
+            # when dumping to yaml, an empty dict will go in as {}.
+            contents = ''
+        f.write(contents)
 
 
 def dump_yaml_str(config: Union[List[Dict[str, Any]], Dict[str, Any]]) -> str:
sky/utils/config_utils.py
CHANGED
@@ -6,6 +6,8 @@ from sky import sky_logging
 
 logger = sky_logging.init_logger(__name__)
 
+_REGION_CONFIG_CLOUDS = ['nebius', 'oci']
+
 
 class Config(Dict[str, Any]):
     """SkyPilot config that supports setting/getting values with nested keys."""
@@ -248,6 +250,8 @@ def get_cloud_config_value_from_dict(
     region_key = None
     if cloud == 'kubernetes':
         region_key = 'context_configs'
+    elif cloud in _REGION_CONFIG_CLOUDS:
+        region_key = 'region_configs'
 
     per_context_config = None
     if region is not None and region_key is not None:
@@ -255,6 +259,19 @@ def get_cloud_config_value_from_dict(
             keys=(cloud, region_key, region) + keys,
             default_value=None,
             override_configs=override_configs)
+        if not per_context_config and cloud in _REGION_CONFIG_CLOUDS:
+            # TODO (kyuds): Backward compatibility, remove after 0.11.0.
+            per_context_config = input_config.get_nested(
+                keys=(cloud, region) + keys,
+                default_value=None,
+                override_configs=override_configs)
+            if per_context_config is not None:
+                logger.info(
+                    f'{cloud} configuration is using the legacy format. \n'
+                    'This format will be deprecated after 0.11.0, refer to '
+                    '`https://docs.skypilot.co/en/latest/reference/config.html` '  # pylint: disable=line-too-long
+                    'for the new format. Please use `region_configs` to specify region specific configuration.'
+                )
 
     # if no override found for specified region
     general_config = input_config.get_nested(keys=(cloud,) + keys,
                                              default_value=default_value,
sky/utils/db/migration_utils.py
CHANGED
@@ -1,9 +1,12 @@
 """Constants for the database schemas."""
 
 import contextlib
+import logging
 import os
 
+from alembic import command as alembic_command
 from alembic.config import Config
+from alembic.runtime import migration
 import filelock
 import sqlalchemy
 
@@ -13,10 +16,6 @@ GLOBAL_USER_STATE_DB_NAME = 'state_db'
 GLOBAL_USER_STATE_VERSION = '001'
 GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
 
-SKYPILOT_CONFIG_DB_NAME = 'skypilot_config_db'
-SKYPILOT_CONFIG_VERSION = '001'
-SKYPILOT_CONFIG_LOCK_PATH = '~/.sky/locks/.skypilot_config_db.lock'
-
 SPOT_JOBS_DB_NAME = 'spot_jobs_db'
 SPOT_JOBS_VERSION = '001'
 SPOT_JOBS_LOCK_PATH = '~/.sky/locks/.spot_jobs_db.lock'
@@ -51,3 +50,44 @@ def get_alembic_config(engine: sqlalchemy.engine.Engine, section: str):
     alembic_cfg.set_section_option(section, 'sqlalchemy.url', url)
 
     return alembic_cfg
+
+
+def safe_alembic_upgrade(engine: sqlalchemy.engine.Engine,
+                         alembic_config: Config, target_revision: str):
+    """Only upgrade if current version is older than target.
+
+    This handles the case where a database was created with a newer version of
+    the code and we're now running older code. Since our migrations are purely
+    additive, it's safe to run a newer database with older code.
+
+    Args:
+        engine: SQLAlchemy engine for the database
+        alembic_config: Alembic configuration object
+        target_revision: Target revision to upgrade to (e.g., '001')
+    """
+    # set alembic logger to warning level
+    alembic_logger = logging.getLogger('alembic')
+    alembic_logger.setLevel(logging.WARNING)
+
+    current_rev = None
+
+    # Get the current revision from the database
+    version_table = alembic_config.get_section_option(
+        alembic_config.config_ini_section, 'version_table', 'alembic_version')
+
+    with engine.connect() as connection:
+        context = migration.MigrationContext.configure(
+            connection, opts={'version_table': version_table})
+        current_rev = context.get_current_revision()
+
+    if current_rev is None:
+        alembic_command.upgrade(alembic_config, target_revision)
+        return
+
+    # Compare revisions - assuming they are numeric strings like '001', '002'
+    current_rev_num = int(current_rev)
+    target_rev_num = int(target_revision)
+
+    # only upgrade if current revision is older than target revision
+    if current_rev_num < target_rev_num:
+        alembic_command.upgrade(alembic_config, target_revision)