skypilot-nightly 1.0.0.dev20250720__py3-none-any.whl → 1.0.0.dev20250724__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (113)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +11 -4
  3. sky/backends/backend_utils.py +27 -11
  4. sky/backends/cloud_vm_ray_backend.py +22 -27
  5. sky/client/cli/command.py +44 -28
  6. sky/client/sdk.py +52 -7
  7. sky/client/sdk.pyi +296 -0
  8. sky/clouds/nebius.py +2 -5
  9. sky/clouds/utils/oci_utils.py +16 -40
  10. sky/clouds/vast.py +2 -1
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/BURfWrKsQk9psMPv0OXrh/_buildManifest.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{1141-d8c6404a7c6fffe6.js → 1141-e49a159c30a6c4a7.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +30 -0
  15. sky/dashboard/out/_next/static/chunks/{1871-a821dcaaae2a3823.js → 1871-ea0e7283886407ca.js} +2 -2
  16. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +15 -0
  19. sky/dashboard/out/_next/static/chunks/{2641.5233e938f14e31a7.js → 2641.74c19c4d45a2c034.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +16 -0
  22. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +15 -0
  23. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +55 -0
  25. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +41 -0
  27. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +6 -0
  28. sky/dashboard/out/_next/static/chunks/{938-63fc419cb82ad9b3.js → 938-7ee806653aef0609.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +30 -0
  30. sky/dashboard/out/_next/static/chunks/{9984.2b5e3fa69171bff9.js → 9984.0460de9d3adf5582.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +34 -0
  32. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa406155b4223d0d.js → [job]-2186770cc2de1623.js} +2 -2
  33. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0c37ee1ac5f3474d.js → [cluster]-95afb019ab85801c.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-c5b357bfd9502fbe.js → [job]-dc0299ffefebcdbe.js} +2 -2
  39. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/pages/{users-19e98664bdd61643.js → users-6790fcefd5487b13.js} +1 -1
  41. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/webpack-b6447da22305b14a.js +1 -0
  44. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +3 -0
  45. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  46. sky/dashboard/out/clusters/[cluster].html +1 -1
  47. sky/dashboard/out/clusters.html +1 -1
  48. sky/dashboard/out/config.html +1 -1
  49. sky/dashboard/out/index.html +1 -1
  50. sky/dashboard/out/infra/[context].html +1 -1
  51. sky/dashboard/out/infra.html +1 -1
  52. sky/dashboard/out/jobs/[job].html +1 -1
  53. sky/dashboard/out/jobs.html +1 -1
  54. sky/dashboard/out/users.html +1 -1
  55. sky/dashboard/out/volumes.html +1 -1
  56. sky/dashboard/out/workspace/new.html +1 -1
  57. sky/dashboard/out/workspaces/[name].html +1 -1
  58. sky/dashboard/out/workspaces.html +1 -1
  59. sky/data/mounting_utils.py +93 -32
  60. sky/exceptions.py +8 -0
  61. sky/global_user_state.py +2 -3
  62. sky/jobs/state.py +2 -2
  63. sky/logs/__init__.py +4 -0
  64. sky/logs/agent.py +14 -0
  65. sky/logs/aws.py +276 -0
  66. sky/provision/nebius/utils.py +3 -6
  67. sky/server/common.py +9 -4
  68. sky/server/requests/payloads.py +20 -4
  69. sky/server/rest.py +6 -0
  70. sky/server/server.py +2 -1
  71. sky/setup_files/MANIFEST.in +1 -1
  72. sky/setup_files/alembic.ini +0 -4
  73. sky/skylet/constants.py +4 -0
  74. sky/skypilot_config.py +5 -31
  75. sky/utils/common_utils.py +8 -3
  76. sky/utils/config_utils.py +17 -0
  77. sky/utils/db/migration_utils.py +44 -4
  78. sky/utils/locks.py +319 -0
  79. sky/utils/rich_utils.py +2 -3
  80. sky/utils/schemas.py +92 -56
  81. sky/utils/timeline.py +41 -0
  82. {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/METADATA +1 -1
  83. {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/RECORD +88 -86
  84. sky/dashboard/out/_next/static/chunks/1746.27d40aedc22bd2d6.js +0 -60
  85. sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/2875.c24c6d57dc82e436.js +0 -25
  87. sky/dashboard/out/_next/static/chunks/3785.95b94f18aaec7233.js +0 -1
  88. sky/dashboard/out/_next/static/chunks/3947-b059261d6fa88a1f.js +0 -35
  89. sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +0 -1
  90. sky/dashboard/out/_next/static/chunks/4869.c7c055a5c2814f33.js +0 -16
  91. sky/dashboard/out/_next/static/chunks/5491.918ffed0ba7a5294.js +0 -20
  92. sky/dashboard/out/_next/static/chunks/6990-dcb411b566e64cde.js +0 -1
  93. sky/dashboard/out/_next/static/chunks/804-9f5e98ce84d46bdd.js +0 -21
  94. sky/dashboard/out/_next/static/chunks/9025.133e9ba5c780afeb.js +0 -6
  95. sky/dashboard/out/_next/static/chunks/9470-8178183f3bae198f.js +0 -1
  96. sky/dashboard/out/_next/static/chunks/9847.46e613d000c55859.js +0 -30
  97. sky/dashboard/out/_next/static/chunks/pages/_app-507712f30cd3cec3.js +0 -20
  98. sky/dashboard/out/_next/static/chunks/pages/clusters-102d169e87913ba1.js +0 -1
  99. sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +0 -1
  101. sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +0 -1
  102. sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces-a1e43d9ef51a9cea.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/webpack-26cdc782eed15a7d.js +0 -1
  106. sky/dashboard/out/_next/static/css/5122cb0a08486fd3.css +0 -3
  107. sky/dashboard/out/_next/static/pTQKG61ng32Zc7gsAROFJ/_buildManifest.js +0 -1
  108. sky/schemas/db/skypilot_config/001_initial_schema.py +0 -30
  109. sky/dashboard/out/_next/static/{pTQKG61ng32Zc7gsAROFJ → BURfWrKsQk9psMPv0OXrh}/_ssgManifest.js +0 -0
  110. {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/WHEEL +0 -0
  111. {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/entry_points.txt +0 -0
  112. {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/licenses/LICENSE +0 -0
  113. {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/top_level.txt +0 -0
sky/logs/aws.py ADDED
@@ -0,0 +1,276 @@
1
+ """AWS CloudWatch logging agent."""
2
+
3
+ from typing import Any, Dict, Optional
4
+
5
+ import pydantic
6
+
7
+ from sky.logs.agent import FluentbitAgent
8
+ from sky.skylet import constants
9
+ from sky.utils import common_utils
10
+ from sky.utils import resources_utils
11
+
12
+
13
+ class _CloudwatchLoggingConfig(pydantic.BaseModel):
14
+ """Configuration for AWS CloudWatch logging agent."""
15
+ region: Optional[str] = None
16
+ credentials_file: Optional[str] = None
17
+ log_group_name: str = 'skypilot-logs'
18
+ log_stream_prefix: str = 'skypilot-'
19
+ auto_create_group: bool = True
20
+ additional_tags: Optional[Dict[str, str]] = None
21
+
22
+
23
+ class _CloudWatchOutputConfig(pydantic.BaseModel):
24
+ """Auxiliary model for building CloudWatch output config in YAML.
25
+
26
+ Ref: https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch
27
+ """
28
+ name: str = 'cloudwatch_logs'
29
+ match: str = '*'
30
+ region: Optional[str] = None
31
+ log_group_name: Optional[str] = None
32
+ log_stream_prefix: Optional[str] = None
33
+ auto_create_group: bool = True
34
+ additional_tags: Optional[Dict[str, str]] = None
35
+
36
+ def to_dict(self) -> Dict[str, Any]:
37
+ config = self.model_dump(exclude_none=True)
38
+ if 'auto_create_group' in config:
39
+ config['auto_create_group'] = 'true' if config[
40
+ 'auto_create_group'] else 'false'
41
+ return config
42
+
43
+
44
+ class CloudwatchLoggingAgent(FluentbitAgent):
45
+ """AWS CloudWatch logging agent.
46
+
47
+ This agent forwards logs from SkyPilot clusters to AWS CloudWatch using
48
+ Fluent Bit. It supports authentication via IAM roles (preferred), AWS
49
+ credentials file, or environment variables.
50
+
51
+ Example configuration:
52
+ ```yaml
53
+ logs:
54
+ store: aws
55
+ aws:
56
+ region: us-west-2
57
+ log_group_name: skypilot-logs
58
+ log_stream_prefix: my-cluster-
59
+ auto_create_group: true
60
+ ```
61
+ """
62
+
63
+ def __init__(self, config: Dict[str, Any]):
64
+ """Initialize the CloudWatch logging agent.
65
+
66
+ Args:
67
+ config: The configuration for the CloudWatch logging agent.
68
+ See the class docstring for the expected format.
69
+ """
70
+ self.config = _CloudwatchLoggingConfig(**config)
71
+ super().__init__()
72
+
73
+ def get_setup_command(self,
74
+ cluster_name: resources_utils.ClusterName) -> str:
75
+ """Get the command to set up the CloudWatch logging agent.
76
+
77
+ Args:
78
+ cluster_name: The name of the cluster.
79
+
80
+ Returns:
81
+ The command to set up the CloudWatch logging agent.
82
+ """
83
+
84
+ if self.config.credentials_file:
85
+ credential_path = self.config.credentials_file
86
+
87
+ # Set AWS credentials and check whether credentials are valid.
88
+ # CloudWatch plugin supports IAM roles, credentials file, and
89
+ # environment variables. We prefer IAM roles when available
90
+ # (on EC2 instances). If credentials file is provided, we use
91
+ # it. Otherwise, we check if credentials are available in
92
+ # the environment.
93
+ pre_cmd = ''
94
+ if self.config.credentials_file:
95
+ pre_cmd = (
96
+ f'export AWS_SHARED_CREDENTIALS_FILE={credential_path}; '
97
+ f'if [ ! -f {credential_path} ]; then '
98
+ f'echo "ERROR: AWS credentials file {credential_path} '
99
+ f'not found. Please check if the file exists and is '
100
+ f'accessible." && exit 1; '
101
+ f'fi; '
102
+ f'if ! grep -q "\\[.*\\]" {credential_path} || '
103
+ f'! grep -q "aws_access_key_id" {credential_path}; then '
104
+ f'echo "ERROR: AWS credentials file {credential_path} is '
105
+ f'invalid. It should contain a profile section '
106
+ f'[profile_name] and aws_access_key_id." && exit 1; '
107
+ f'fi;')
108
+ else:
109
+ # Check if we're running on EC2 with an IAM role or if
110
+ # AWS credentials are available in the environment
111
+ pre_cmd = (
112
+ 'if ! curl -s -m 1 http://169.254.169.254'
113
+ '/latest/meta-data/iam/security-credentials/ > /dev/null; '
114
+ 'then '
115
+ # failed EC2 check, look for env vars
116
+ 'if [ -z "$AWS_ACCESS_KEY_ID" ] || '
117
+ '[ -z "$AWS_SECRET_ACCESS_KEY" ]; then '
118
+ 'echo "ERROR: AWS CloudWatch logging configuration error. '
119
+ 'Not running on EC2 with IAM role and AWS credentials not '
120
+ 'found in environment. Please do one of the following: '
121
+ '1. Run on an EC2 instance with an IAM role that has '
122
+ 'CloudWatch permissions, 2. Set AWS_ACCESS_KEY_ID and '
123
+ 'AWS_SECRET_ACCESS_KEY environment variables, or '
124
+ '3. Provide a credentials file via logs.aws.credentials_file '
125
+ 'in SkyPilot config." && exit 1; '
126
+ 'fi; '
127
+ 'fi;')
128
+
129
+ # If region is specified, set it in the environment
130
+ if self.config.region:
131
+ pre_cmd += f' export AWS_REGION={self.config.region};'
132
+ else:
133
+ # If region is not specified, check if it's available in
134
+ # the environment or credentials file
135
+ pre_cmd += (
136
+ ' if [ -z "$AWS_REGION" ] && '
137
+ '[ -z "$AWS_DEFAULT_REGION" ]; then '
138
+ 'echo "WARNING: AWS region not specified in configuration or '
139
+ 'environment. CloudWatch logging may fail if the region '
140
+ 'cannot be determined. Consider setting logs.aws.region in '
141
+ 'SkyPilot config."; '
142
+ 'fi; ')
143
+
144
+ # Add a test command to verify AWS credentials work with CloudWatch
145
+ pre_cmd += (
146
+ ' echo "Verifying AWS CloudWatch access..."; '
147
+ 'if command -v aws > /dev/null; then '
148
+ 'aws cloudwatch list-metrics --namespace AWS/Logs --max-items 1 '
149
+ '> /dev/null 2>&1 || '
150
+ '{ echo "ERROR: Failed to access AWS CloudWatch. Please check '
151
+ 'your credentials and permissions."; '
152
+ 'echo "The IAM role or user must have cloudwatch:ListMetrics '
153
+ 'and logs:* permissions."; '
154
+ 'exit 1; }; '
155
+ 'else echo "AWS CLI not installed, skipping CloudWatch access '
156
+ 'verification."; '
157
+ 'fi; ')
158
+
159
+ return pre_cmd + ' ' + super().get_setup_command(cluster_name)
160
+
161
+ def fluentbit_config(self,
162
+ cluster_name: resources_utils.ClusterName) -> str:
163
+ """Get the Fluent Bit configuration for CloudWatch.
164
+
165
+ This overrides the base method to add a fallback output for local file
166
+ logging in case CloudWatch logging fails.
167
+
168
+ Args:
169
+ cluster_name: The name of the cluster.
170
+
171
+ Returns:
172
+ The Fluent Bit configuration as a YAML string.
173
+ """
174
+ display_name = cluster_name.display_name
175
+ unique_name = cluster_name.name_on_cloud
176
+ # Build tags for the log stream
177
+ tags = {
178
+ 'skypilot.cluster_name': display_name,
179
+ 'skypilot.cluster_id': unique_name,
180
+ }
181
+
182
+ # Add additional tags if provided
183
+ if self.config.additional_tags:
184
+ tags.update(self.config.additional_tags)
185
+
186
+ log_processors = []
187
+ for key, value in tags.items():
188
+ log_processors.append({
189
+ 'name': 'content_modifier',
190
+ 'action': 'upsert',
191
+ 'key': key,
192
+ 'value': value
193
+ })
194
+
195
+ cfg_dict = {
196
+ 'pipeline': {
197
+ 'inputs': [{
198
+ 'name': 'tail',
199
+ 'path': f'{constants.SKY_LOGS_DIRECTORY}/*/*.log',
200
+ 'path_key': 'log_path',
201
+ # Shorten the refresh interval from 60s to 1s since every
202
+ # job creates a new log file and we must be responsive
203
+ # for this: the VM might be autodown within a minute
204
+ # right after the job completion.
205
+ 'refresh_interval': 1,
206
+ 'processors': {
207
+ 'logs': log_processors,
208
+ }
209
+ }],
210
+ 'outputs': [self.fluentbit_output_config(cluster_name)],
211
+ }
212
+ }
213
+
214
+ # Add fallback outputs for graceful failure handling
215
+ cfg_dict = self.add_fallback_outputs(cfg_dict)
216
+
217
+ return common_utils.dump_yaml_str(cfg_dict)
218
+
219
+ def add_fallback_outputs(self, cfg_dict: Dict[str, Any]) -> Dict[str, Any]:
220
+ """Add fallback outputs to the Fluent Bit configuration.
221
+
222
+ This adds a local file output as a fallback in case
223
+ CloudWatch logging fails.
224
+
225
+ Args:
226
+ cfg_dict: The Fluent Bit configuration dictionary.
227
+
228
+ Returns:
229
+ The updated configuration dictionary.
230
+ """
231
+ # Add a local file output as a fallback
232
+ fallback_output = {
233
+ 'name': 'file',
234
+ 'match': '*',
235
+ 'path': '/tmp/skypilot_logs_fallback.log',
236
+ 'format': 'out_file',
237
+ }
238
+
239
+ # Add the fallback output to the configuration
240
+ cfg_dict['pipeline']['outputs'].append(fallback_output)
241
+
242
+ return cfg_dict
243
+
244
+ def fluentbit_output_config(
245
+ self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
246
+ """Get the Fluent Bit output configuration for CloudWatch.
247
+
248
+ Args:
249
+ cluster_name: The name of the cluster.
250
+
251
+ Returns:
252
+ The Fluent Bit output configuration for CloudWatch.
253
+ """
254
+ unique_name = cluster_name.name_on_cloud
255
+
256
+ # Format the log stream name to include cluster information
257
+ # This helps with identifying logs in CloudWatch
258
+ log_stream_prefix = f'{self.config.log_stream_prefix}{unique_name}-'
259
+
260
+ # Create the CloudWatch output configuration with error handling options
261
+ return _CloudWatchOutputConfig(
262
+ region=self.config.region,
263
+ log_group_name=self.config.log_group_name,
264
+ log_stream_prefix=log_stream_prefix,
265
+ auto_create_group=self.config.auto_create_group,
266
+ ).to_dict()
267
+
268
+ def get_credential_file_mounts(self) -> Dict[str, str]:
269
+ """Get the credential file mounts for the CloudWatch logging agent.
270
+
271
+ Returns:
272
+ A dictionary mapping local credential file paths to remote paths.
273
+ """
274
+ if self.config.credentials_file:
275
+ return {self.config.credentials_file: self.config.credentials_file}
276
+ return {}
sky/provision/nebius/utils.py CHANGED
@@ -41,10 +41,7 @@ def get_project_by_region(region: str) -> str:
41
41
 
42
42
  # Check is there project if in config
43
43
  project_id = skypilot_config.get_effective_region_config(
44
- cloud='nebius',
45
- region=None,
46
- keys=(region, 'project_id'),
47
- default_value=None)
44
+ cloud='nebius', region=region, keys=('project_id',), default_value=None)
48
45
  if project_id is not None:
49
46
  return project_id
50
47
  for project in projects.items:
@@ -189,8 +186,8 @@ def launch(cluster_name_on_cloud: str,
189
186
  if preset == '8gpu-128vcpu-1600gb':
190
187
  fabric = skypilot_config.get_effective_region_config(
191
188
  cloud='nebius',
192
- region=None,
193
- keys=(region, 'fabric'),
189
+ region=region,
190
+ keys=('fabric',),
194
191
  default_value=None)
195
192
 
196
193
  # Auto-select fabric if network_tier=best and no fabric configured
sky/server/common.py CHANGED
@@ -132,6 +132,8 @@ def get_api_cookie_jar() -> requests.cookies.RequestsCookieJar:
132
132
  def set_api_cookie_jar(cookie_jar: CookieJar,
133
133
  create_if_not_exists: bool = True) -> None:
134
134
  """Updates the file cookie jar with the given cookie jar."""
135
+ if len(cookie_jar) == 0:
136
+ return
135
137
  cookie_path = get_api_cookie_jar_path()
136
138
  if not cookie_path.exists() and not create_if_not_exists:
137
139
  # if the file doesn't exist and we don't want to create it, do nothing
@@ -252,8 +254,9 @@ def get_dashboard_url(server_url: str,
252
254
 
253
255
 
254
256
  @annotations.lru_cache(scope='global')
255
- def is_api_server_local():
256
- return get_server_url() in AVAILABLE_LOCAL_API_SERVER_URLS
257
+ def is_api_server_local(endpoint: Optional[str] = None):
258
+ server_url = endpoint if endpoint is not None else get_server_url()
259
+ return server_url in AVAILABLE_LOCAL_API_SERVER_URLS
257
260
 
258
261
 
259
262
  def _handle_non_200_server_status(
@@ -350,7 +353,9 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
350
353
  error=version_info.error)
351
354
 
352
355
  cookies = get_cookies_from_response(response)
353
- set_api_cookie_jar(cookies, create_if_not_exists=False)
356
+ # Save or refresh the cookie jar in case of session affinity and
357
+ # OAuth.
358
+ set_api_cookie_jar(cookies, create_if_not_exists=True)
354
359
  return server_info
355
360
  except (json.JSONDecodeError, AttributeError) as e:
356
361
  # Try to check if we got redirected to a login page.
@@ -566,7 +571,7 @@ def check_server_healthy(
566
571
  api_server_status = api_server_info.status
567
572
  if api_server_status == ApiServerStatus.VERSION_MISMATCH:
568
573
  msg = api_server_info.error
569
- if is_api_server_local():
574
+ if is_api_server_local(endpoint):
570
575
  # For local server, just hint user to restart the server to get
571
576
  # a consistent version.
572
577
  msg = _LOCAL_API_SERVER_RESTART_HINT
sky/server/requests/payloads.py CHANGED
@@ -203,17 +203,33 @@ class DagRequestBody(RequestBody):
203
203
  return kwargs
204
204
 
205
205
 
206
- class ValidateBody(DagRequestBody):
206
+ class DagRequestBodyWithRequestOptions(DagRequestBody):
207
+ """Request body base class for endpoints with a dag and request options."""
208
+ request_options: Optional[admin_policy.RequestOptions]
209
+
210
+ def get_request_options(self) -> Optional[admin_policy.RequestOptions]:
211
+ """Get the request options."""
212
+ if self.request_options is None:
213
+ return None
214
+ if isinstance(self.request_options, dict):
215
+ return admin_policy.RequestOptions(**self.request_options)
216
+ return self.request_options
217
+
218
+ def to_kwargs(self) -> Dict[str, Any]:
219
+ kwargs = super().to_kwargs()
220
+ kwargs['request_options'] = self.get_request_options()
221
+ return kwargs
222
+
223
+
224
+ class ValidateBody(DagRequestBodyWithRequestOptions):
207
225
  """The request body for the validate endpoint."""
208
226
  dag: str
209
- request_options: Optional[admin_policy.RequestOptions]
210
227
 
211
228
 
212
- class OptimizeBody(DagRequestBody):
229
+ class OptimizeBody(DagRequestBodyWithRequestOptions):
213
230
  """The request body for the optimize endpoint."""
214
231
  dag: str
215
232
  minimize: common_lib.OptimizeTarget = common_lib.OptimizeTarget.COST
216
- request_options: Optional[admin_policy.RequestOptions]
217
233
 
218
234
 
219
235
  class LaunchBody(RequestBody):
sky/server/rest.py CHANGED
@@ -89,6 +89,12 @@ def retry_transient_errors(max_retries: int = 3,
89
89
  for retry_cnt in range(max_retries):
90
90
  try:
91
91
  return func(*args, **kwargs)
92
+ # Occurs when the server proactively interrupts the request
93
+ # during rolling update, we can retry immediately on the
94
+ # new replica.
95
+ except exceptions.RequestInterruptedError:
96
+ logger.debug('Request interrupted. Retry immediately.')
97
+ continue
92
98
  except Exception as e: # pylint: disable=broad-except
93
99
  if retry_cnt >= max_retries - 1:
94
100
  # Retries exhausted.
sky/server/server.py CHANGED
@@ -827,7 +827,8 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
827
827
  # added RTTs. For now, we stick to doing the validation inline in the
828
828
  # server thread.
829
829
  with admin_policy_utils.apply_and_use_config_in_current_request(
830
- dag, request_options=validate_body.request_options) as dag:
830
+ dag,
831
+ request_options=validate_body.get_request_options()) as dag:
831
832
  # Skip validating workdir and file_mounts, as those need to be
832
833
  # validated after the files are uploaded to the SkyPilot API server
833
834
  # with `upload_mounts_to_api_server`.
sky/setup_files/MANIFEST.in CHANGED
@@ -18,5 +18,5 @@ include sky/server/html/*
18
18
  recursive-include sky/dashboard/out *
19
19
  include sky/users/*.conf
20
20
  include sky/utils/*.sh
21
- include alembic.ini
21
+ include sky/setup_files/alembic.ini
22
22
  recursive-include sky/schemas/db *
sky/setup_files/alembic.ini CHANGED
@@ -94,10 +94,6 @@ version_table = alembic_version_state_db
94
94
  version_locations = %(here)s/../schemas/db/spot_jobs
95
95
  version_table = alembic_version_spot_jobs_db
96
96
 
97
- [sky_config_db]
98
- version_locations = %(here)s/../schemas/db/skypilot_config
99
- version_table = alembic_version_sky_config_db
100
-
101
97
  [post_write_hooks]
102
98
  # post_write_hooks defines scripts or Python functions that are run
103
99
  # on newly generated revision scripts. See the documentation for further
sky/skylet/constants.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """Constants for SkyPilot."""
2
+ import os
2
3
  from typing import List, Tuple
3
4
 
4
5
  from packaging import version
@@ -491,3 +492,6 @@ DEFAULT_PRIORITY = 0
491
492
 
492
493
  GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
493
494
  COST_REPORT_DEFAULT_DAYS = 30
495
+
496
+ # The directory for file locks.
497
+ SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
sky/skypilot_config.py CHANGED
@@ -58,10 +58,8 @@ import threading
58
58
  import typing
59
59
  from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
60
60
 
61
- from alembic import command as alembic_command
62
61
  import filelock
63
62
  import sqlalchemy
64
- from sqlalchemy import exc as sqlalchemy_exc
65
63
  from sqlalchemy import orm
66
64
  from sqlalchemy.dialects import postgresql
67
65
  from sqlalchemy.dialects import sqlite
@@ -78,7 +76,6 @@ from sky.utils import context
78
76
  from sky.utils import schemas
79
77
  from sky.utils import ux_utils
80
78
  from sky.utils.db import db_utils
81
- from sky.utils.db import migration_utils
82
79
  from sky.utils.kubernetes import config_map_utils
83
80
 
84
81
  if typing.TYPE_CHECKING:
@@ -574,17 +571,11 @@ def _reload_config_as_server() -> None:
574
571
  'if db config is specified, no other config is allowed')
575
572
 
576
573
  if db_url:
577
- with migration_utils.db_lock(migration_utils.SKYPILOT_CONFIG_DB_NAME):
574
+ with _DB_USE_LOCK:
578
575
  sqlalchemy_engine = sqlalchemy.create_engine(db_url,
579
576
  poolclass=NullPool)
580
-
581
- # Get alembic config for sky config db and run migrations
582
- alembic_config = migration_utils.get_alembic_config(
583
- sqlalchemy_engine, migration_utils.SKYPILOT_CONFIG_DB_NAME)
584
- # pylint: disable=line-too-long
585
- alembic_config.config_ini_section = migration_utils.SKYPILOT_CONFIG_DB_NAME
586
- alembic_command.upgrade(alembic_config,
587
- migration_utils.SKYPILOT_CONFIG_VERSION)
577
+ db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
578
+ sqlalchemy_engine)
588
579
 
589
580
  def _get_config_yaml_from_db(
590
581
  key: str) -> Optional[config_utils.Config]:
@@ -872,25 +863,8 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
872
863
  with _DB_USE_LOCK:
873
864
  sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
874
865
  poolclass=NullPool)
875
-
876
- # Get alembic config for sky config db and run migrations
877
- alembic_config = migration_utils.get_alembic_config(
878
- sqlalchemy_engine, 'sky_config_db')
879
- alembic_config.config_ini_section = 'sky_config_db'
880
- try:
881
- alembic_command.upgrade(alembic_config, '001')
882
- except (sqlalchemy_exc.IntegrityError,
883
- sqlalchemy_exc.OperationalError) as e:
884
- # If the version already exists (due to concurrent
885
- # initialization), we can safely ignore this error
886
- if ('UNIQUE constraint failed: '
887
- 'alembic_version_sky_config_db.version_num'
888
- in str(e) or
889
- 'table alembic_version_sky_config_db already exists'
890
- in str(e)):
891
- pass
892
- else:
893
- raise
866
+ db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
867
+ sqlalchemy_engine)
894
868
 
895
869
  def _set_config_yaml_to_db(key: str,
896
870
  config: config_utils.Config):
sky/utils/common_utils.py CHANGED
@@ -562,8 +562,9 @@ def read_yaml_all(path: str) -> List[Dict[str, Any]]:
562
562
  return read_yaml_all_str(f.read())
563
563
 
564
564
 
565
- def dump_yaml(path: str, config: Union[List[Dict[str, Any]],
566
- Dict[str, Any]]) -> None:
565
+ def dump_yaml(path: str,
566
+ config: Union[List[Dict[str, Any]], Dict[str, Any]],
567
+ blank: bool = False) -> None:
567
568
  """Dumps a YAML file.
568
569
 
569
570
  Args:
@@ -571,7 +572,11 @@ def dump_yaml(path: str, config: Union[List[Dict[str, Any]],
571
572
  config: the configuration to dump.
572
573
  """
573
574
  with open(path, 'w', encoding='utf-8') as f:
574
- f.write(dump_yaml_str(config))
575
+ contents = dump_yaml_str(config)
576
+ if blank and isinstance(config, dict) and len(config) == 0:
577
+ # when dumping to yaml, an empty dict will go in as {}.
578
+ contents = ''
579
+ f.write(contents)
575
580
 
576
581
 
577
582
  def dump_yaml_str(config: Union[List[Dict[str, Any]], Dict[str, Any]]) -> str:
sky/utils/config_utils.py CHANGED
@@ -6,6 +6,8 @@ from sky import sky_logging
6
6
 
7
7
  logger = sky_logging.init_logger(__name__)
8
8
 
9
+ _REGION_CONFIG_CLOUDS = ['nebius', 'oci']
10
+
9
11
 
10
12
  class Config(Dict[str, Any]):
11
13
  """SkyPilot config that supports setting/getting values with nested keys."""
@@ -248,6 +250,8 @@ def get_cloud_config_value_from_dict(
248
250
  region_key = None
249
251
  if cloud == 'kubernetes':
250
252
  region_key = 'context_configs'
253
+ elif cloud in _REGION_CONFIG_CLOUDS:
254
+ region_key = 'region_configs'
251
255
 
252
256
  per_context_config = None
253
257
  if region is not None and region_key is not None:
@@ -255,6 +259,19 @@ def get_cloud_config_value_from_dict(
255
259
  keys=(cloud, region_key, region) + keys,
256
260
  default_value=None,
257
261
  override_configs=override_configs)
262
+ if not per_context_config and cloud in _REGION_CONFIG_CLOUDS:
263
+ # TODO (kyuds): Backward compatibility, remove after 0.11.0.
264
+ per_context_config = input_config.get_nested(
265
+ keys=(cloud, region) + keys,
266
+ default_value=None,
267
+ override_configs=override_configs)
268
+ if per_context_config is not None:
269
+ logger.info(
270
+ f'{cloud} configuration is using the legacy format. \n'
271
+ 'This format will be deprecated after 0.11.0, refer to '
272
+ '`https://docs.skypilot.co/en/latest/reference/config.html` ' # pylint: disable=line-too-long
273
+ 'for the new format. Please use `region_configs` to specify region specific configuration.'
274
+ )
258
275
  # if no override found for specified region
259
276
  general_config = input_config.get_nested(keys=(cloud,) + keys,
260
277
  default_value=default_value,
sky/utils/db/migration_utils.py CHANGED
@@ -1,9 +1,12 @@
1
1
  """Constants for the database schemas."""
2
2
 
3
3
  import contextlib
4
+ import logging
4
5
  import os
5
6
 
7
+ from alembic import command as alembic_command
6
8
  from alembic.config import Config
9
+ from alembic.runtime import migration
7
10
  import filelock
8
11
  import sqlalchemy
9
12
 
@@ -13,10 +16,6 @@ GLOBAL_USER_STATE_DB_NAME = 'state_db'
13
16
  GLOBAL_USER_STATE_VERSION = '001'
14
17
  GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
15
18
 
16
- SKYPILOT_CONFIG_DB_NAME = 'skypilot_config_db'
17
- SKYPILOT_CONFIG_VERSION = '001'
18
- SKYPILOT_CONFIG_LOCK_PATH = '~/.sky/locks/.skypilot_config_db.lock'
19
-
20
19
  SPOT_JOBS_DB_NAME = 'spot_jobs_db'
21
20
  SPOT_JOBS_VERSION = '001'
22
21
  SPOT_JOBS_LOCK_PATH = '~/.sky/locks/.spot_jobs_db.lock'
@@ -51,3 +50,44 @@ def get_alembic_config(engine: sqlalchemy.engine.Engine, section: str):
51
50
  alembic_cfg.set_section_option(section, 'sqlalchemy.url', url)
52
51
 
53
52
  return alembic_cfg
53
+
54
+
55
+ def safe_alembic_upgrade(engine: sqlalchemy.engine.Engine,
56
+ alembic_config: Config, target_revision: str):
57
+ """Only upgrade if current version is older than target.
58
+
59
+ This handles the case where a database was created with a newer version of
60
+ the code and we're now running older code. Since our migrations are purely
61
+ additive, it's safe to run a newer database with older code.
62
+
63
+ Args:
64
+ engine: SQLAlchemy engine for the database
65
+ alembic_config: Alembic configuration object
66
+ target_revision: Target revision to upgrade to (e.g., '001')
67
+ """
68
+ # set alembic logger to warning level
69
+ alembic_logger = logging.getLogger('alembic')
70
+ alembic_logger.setLevel(logging.WARNING)
71
+
72
+ current_rev = None
73
+
74
+ # Get the current revision from the database
75
+ version_table = alembic_config.get_section_option(
76
+ alembic_config.config_ini_section, 'version_table', 'alembic_version')
77
+
78
+ with engine.connect() as connection:
79
+ context = migration.MigrationContext.configure(
80
+ connection, opts={'version_table': version_table})
81
+ current_rev = context.get_current_revision()
82
+
83
+ if current_rev is None:
84
+ alembic_command.upgrade(alembic_config, target_revision)
85
+ return
86
+
87
+ # Compare revisions - assuming they are numeric strings like '001', '002'
88
+ current_rev_num = int(current_rev)
89
+ target_rev_num = int(target_revision)
90
+
91
+ # only upgrade if current revision is older than target revision
92
+ if current_rev_num < target_rev_num:
93
+ alembic_command.upgrade(alembic_config, target_revision)