skypilot-nightly 1.0.0.dev20250723__py3-none-any.whl → 1.0.0.dev20250724__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/client/cli/command.py +6 -5
- sky/client/sdk.pyi +296 -0
- sky/clouds/utils/oci_utils.py +16 -40
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-a305898dc479711e.js → webpack-b6447da22305b14a.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +8 -0
- sky/logs/__init__.py +4 -0
- sky/logs/agent.py +14 -0
- sky/logs/aws.py +276 -0
- sky/server/common.py +5 -1
- sky/server/requests/payloads.py +20 -4
- sky/server/rest.py +6 -0
- sky/server/server.py +2 -1
- sky/utils/config_utils.py +6 -4
- sky/utils/rich_utils.py +2 -3
- sky/utils/schemas.py +54 -22
- {skypilot_nightly-1.0.0.dev20250723.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250723.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/RECORD +39 -37
- /sky/dashboard/out/_next/static/{mym3Ciwp-zqU7ZpOLGnrW → BURfWrKsQk9psMPv0OXrh}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{mym3Ciwp-zqU7ZpOLGnrW → BURfWrKsQk9psMPv0OXrh}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250723.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250723.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250723.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250723.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/top_level.txt +0 -0
sky/dashboard/out/jobs.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-b6447da22305b14a.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-da491665d4289aae.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-49f790d12a85027c.js" defer=""></script><script src="/dashboard/_next/static/BURfWrKsQk9psMPv0OXrh/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/BURfWrKsQk9psMPv0OXrh/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"BURfWrKsQk9psMPv0OXrh","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/users.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-b6447da22305b14a.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-da491665d4289aae.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/users-6790fcefd5487b13.js" defer=""></script><script src="/dashboard/_next/static/BURfWrKsQk9psMPv0OXrh/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/BURfWrKsQk9psMPv0OXrh/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/users","query":{},"buildId":"BURfWrKsQk9psMPv0OXrh","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/volumes.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-b6447da22305b14a.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-da491665d4289aae.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js" defer=""></script><script src="/dashboard/_next/static/BURfWrKsQk9psMPv0OXrh/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/BURfWrKsQk9psMPv0OXrh/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"BURfWrKsQk9psMPv0OXrh","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/workspace/new.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-b6447da22305b14a.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-da491665d4289aae.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js" defer=""></script><script src="/dashboard/_next/static/BURfWrKsQk9psMPv0OXrh/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/BURfWrKsQk9psMPv0OXrh/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"BURfWrKsQk9psMPv0OXrh","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/workspaces/[name].html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-b6447da22305b14a.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-da491665d4289aae.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-162f3033ffcd3d31.js" defer=""></script><script src="/dashboard/_next/static/chunks/5230-df791914b54d91d9.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-5ea3ffa10fc884f2.js" defer=""></script><script src="/dashboard/_next/static/chunks/1664-d65361e92b85e786.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1559-18717d96ef2fcbe9.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-eab0e9c16b64fd9f.js" defer=""></script><script src="/dashboard/_next/static/chunks/3698-9fa11dafb5cad4a6.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-2abbd0352f8ee061.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-f64e03df359e04f7.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-8e0b2055bf5dd499.js" defer=""></script><script src="/dashboard/_next/static/chunks/1043-869d9c78bf5dd3df.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/6601-d4a381403a8bae91.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-7ee806653aef0609.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-e49a159c30a6c4a7.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-6bcd4b20914d76c9.js" defer=""></script><script src="/dashboard/_next/static/BURfWrKsQk9psMPv0OXrh/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/BURfWrKsQk9psMPv0OXrh/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"BURfWrKsQk9psMPv0OXrh","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/dashboard/out/workspaces.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-b6447da22305b14a.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-da491665d4289aae.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js" defer=""></script><script src="/dashboard/_next/static/BURfWrKsQk9psMPv0OXrh/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/BURfWrKsQk9psMPv0OXrh/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"BURfWrKsQk9psMPv0OXrh","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/exceptions.py
CHANGED
|
@@ -642,3 +642,11 @@ class RestfulPolicyError(Exception):
|
|
|
642
642
|
class GitError(Exception):
|
|
643
643
|
"""Raised when a git operation fails."""
|
|
644
644
|
pass
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
class RequestInterruptedError(Exception):
|
|
648
|
+
"""Raised when a request is interrupted by the server.
|
|
649
|
+
Client is expected to retry the request immediately when
|
|
650
|
+
this error is raised.
|
|
651
|
+
"""
|
|
652
|
+
pass
|
sky/logs/__init__.py
CHANGED
|
@@ -4,6 +4,7 @@ from typing import Optional
|
|
|
4
4
|
from sky import exceptions
|
|
5
5
|
from sky import skypilot_config
|
|
6
6
|
from sky.logs.agent import LoggingAgent
|
|
7
|
+
from sky.logs.aws import CloudwatchLoggingAgent
|
|
7
8
|
from sky.logs.gcp import GCPLoggingAgent
|
|
8
9
|
|
|
9
10
|
|
|
@@ -13,5 +14,8 @@ def get_logging_agent() -> Optional[LoggingAgent]:
|
|
|
13
14
|
return None
|
|
14
15
|
if store == 'gcp':
|
|
15
16
|
return GCPLoggingAgent(skypilot_config.get_nested(('logs', 'gcp'), {}))
|
|
17
|
+
elif store == 'aws':
|
|
18
|
+
return CloudwatchLoggingAgent(
|
|
19
|
+
skypilot_config.get_nested(('logs', 'aws'), {}))
|
|
16
20
|
raise exceptions.InvalidSkyPilotConfigError(
|
|
17
21
|
f'Invalid logging store: {store}')
|
sky/logs/agent.py
CHANGED
|
@@ -67,6 +67,20 @@ class FluentbitAgent(LoggingAgent):
|
|
|
67
67
|
}
|
|
68
68
|
return common_utils.dump_yaml_str(cfg_dict)
|
|
69
69
|
|
|
70
|
+
def add_fallback_outputs(self, cfg_dict: Dict[str, Any]) -> Dict[str, Any]:
|
|
71
|
+
"""Add fallback outputs to the Fluent Bit configuration.
|
|
72
|
+
|
|
73
|
+
This method can be overridden by subclasses to add fallback outputs
|
|
74
|
+
in case the primary output fails.
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
cfg_dict: The Fluent Bit configuration dictionary.
|
|
78
|
+
|
|
79
|
+
Returns:
|
|
80
|
+
The updated configuration dictionary.
|
|
81
|
+
"""
|
|
82
|
+
return cfg_dict
|
|
83
|
+
|
|
70
84
|
@abc.abstractmethod
|
|
71
85
|
def fluentbit_output_config(
|
|
72
86
|
self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
|
sky/logs/aws.py
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
"""AWS CloudWatch logging agent."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Optional
|
|
4
|
+
|
|
5
|
+
import pydantic
|
|
6
|
+
|
|
7
|
+
from sky.logs.agent import FluentbitAgent
|
|
8
|
+
from sky.skylet import constants
|
|
9
|
+
from sky.utils import common_utils
|
|
10
|
+
from sky.utils import resources_utils
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class _CloudwatchLoggingConfig(pydantic.BaseModel):
|
|
14
|
+
"""Configuration for AWS CloudWatch logging agent."""
|
|
15
|
+
region: Optional[str] = None
|
|
16
|
+
credentials_file: Optional[str] = None
|
|
17
|
+
log_group_name: str = 'skypilot-logs'
|
|
18
|
+
log_stream_prefix: str = 'skypilot-'
|
|
19
|
+
auto_create_group: bool = True
|
|
20
|
+
additional_tags: Optional[Dict[str, str]] = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class _CloudWatchOutputConfig(pydantic.BaseModel):
|
|
24
|
+
"""Auxiliary model for building CloudWatch output config in YAML.
|
|
25
|
+
|
|
26
|
+
Ref: https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch
|
|
27
|
+
"""
|
|
28
|
+
name: str = 'cloudwatch_logs'
|
|
29
|
+
match: str = '*'
|
|
30
|
+
region: Optional[str] = None
|
|
31
|
+
log_group_name: Optional[str] = None
|
|
32
|
+
log_stream_prefix: Optional[str] = None
|
|
33
|
+
auto_create_group: bool = True
|
|
34
|
+
additional_tags: Optional[Dict[str, str]] = None
|
|
35
|
+
|
|
36
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
37
|
+
config = self.model_dump(exclude_none=True)
|
|
38
|
+
if 'auto_create_group' in config:
|
|
39
|
+
config['auto_create_group'] = 'true' if config[
|
|
40
|
+
'auto_create_group'] else 'false'
|
|
41
|
+
return config
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class CloudwatchLoggingAgent(FluentbitAgent):
|
|
45
|
+
"""AWS CloudWatch logging agent.
|
|
46
|
+
|
|
47
|
+
This agent forwards logs from SkyPilot clusters to AWS CloudWatch using
|
|
48
|
+
Fluent Bit. It supports authentication via IAM roles (preferred), AWS
|
|
49
|
+
credentials file, or environment variables.
|
|
50
|
+
|
|
51
|
+
Example configuration:
|
|
52
|
+
```yaml
|
|
53
|
+
logs:
|
|
54
|
+
store: aws
|
|
55
|
+
aws:
|
|
56
|
+
region: us-west-2
|
|
57
|
+
log_group_name: skypilot-logs
|
|
58
|
+
log_stream_prefix: my-cluster-
|
|
59
|
+
auto_create_group: true
|
|
60
|
+
```
|
|
61
|
+
"""
|
|
62
|
+
|
|
63
|
+
def __init__(self, config: Dict[str, Any]):
|
|
64
|
+
"""Initialize the CloudWatch logging agent.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
config: The configuration for the CloudWatch logging agent.
|
|
68
|
+
See the class docstring for the expected format.
|
|
69
|
+
"""
|
|
70
|
+
self.config = _CloudwatchLoggingConfig(**config)
|
|
71
|
+
super().__init__()
|
|
72
|
+
|
|
73
|
+
def get_setup_command(self,
|
|
74
|
+
cluster_name: resources_utils.ClusterName) -> str:
|
|
75
|
+
"""Get the command to set up the CloudWatch logging agent.
|
|
76
|
+
|
|
77
|
+
Args:
|
|
78
|
+
cluster_name: The name of the cluster.
|
|
79
|
+
|
|
80
|
+
Returns:
|
|
81
|
+
The command to set up the CloudWatch logging agent.
|
|
82
|
+
"""
|
|
83
|
+
|
|
84
|
+
if self.config.credentials_file:
|
|
85
|
+
credential_path = self.config.credentials_file
|
|
86
|
+
|
|
87
|
+
# Set AWS credentials and check whether credentials are valid.
|
|
88
|
+
# CloudWatch plugin supports IAM roles, credentials file, and
|
|
89
|
+
# environment variables. We prefer IAM roles when available
|
|
90
|
+
# (on EC2 instances). If credentials file is provided, we use
|
|
91
|
+
# it. Otherwise, we check if credentials are available in
|
|
92
|
+
# the environment.
|
|
93
|
+
pre_cmd = ''
|
|
94
|
+
if self.config.credentials_file:
|
|
95
|
+
pre_cmd = (
|
|
96
|
+
f'export AWS_SHARED_CREDENTIALS_FILE={credential_path}; '
|
|
97
|
+
f'if [ ! -f {credential_path} ]; then '
|
|
98
|
+
f'echo "ERROR: AWS credentials file {credential_path} '
|
|
99
|
+
f'not found. Please check if the file exists and is '
|
|
100
|
+
f'accessible." && exit 1; '
|
|
101
|
+
f'fi; '
|
|
102
|
+
f'if ! grep -q "\\[.*\\]" {credential_path} || '
|
|
103
|
+
f'! grep -q "aws_access_key_id" {credential_path}; then '
|
|
104
|
+
f'echo "ERROR: AWS credentials file {credential_path} is '
|
|
105
|
+
f'invalid. It should contain a profile section '
|
|
106
|
+
f'[profile_name] and aws_access_key_id." && exit 1; '
|
|
107
|
+
f'fi;')
|
|
108
|
+
else:
|
|
109
|
+
# Check if we're running on EC2 with an IAM role or if
|
|
110
|
+
# AWS credentials are available in the environment
|
|
111
|
+
pre_cmd = (
|
|
112
|
+
'if ! curl -s -m 1 http://169.254.169.254'
|
|
113
|
+
'/latest/meta-data/iam/security-credentials/ > /dev/null; '
|
|
114
|
+
'then '
|
|
115
|
+
# failed EC2 check, look for env vars
|
|
116
|
+
'if [ -z "$AWS_ACCESS_KEY_ID" ] || '
|
|
117
|
+
'[ -z "$AWS_SECRET_ACCESS_KEY" ]; then '
|
|
118
|
+
'echo "ERROR: AWS CloudWatch logging configuration error. '
|
|
119
|
+
'Not running on EC2 with IAM role and AWS credentials not '
|
|
120
|
+
'found in environment. Please do one of the following: '
|
|
121
|
+
'1. Run on an EC2 instance with an IAM role that has '
|
|
122
|
+
'CloudWatch permissions, 2. Set AWS_ACCESS_KEY_ID and '
|
|
123
|
+
'AWS_SECRET_ACCESS_KEY environment variables, or '
|
|
124
|
+
'3. Provide a credentials file via logs.aws.credentials_file '
|
|
125
|
+
'in SkyPilot config." && exit 1; '
|
|
126
|
+
'fi; '
|
|
127
|
+
'fi;')
|
|
128
|
+
|
|
129
|
+
# If region is specified, set it in the environment
|
|
130
|
+
if self.config.region:
|
|
131
|
+
pre_cmd += f' export AWS_REGION={self.config.region};'
|
|
132
|
+
else:
|
|
133
|
+
# If region is not specified, check if it's available in
|
|
134
|
+
# the environment or credentials file
|
|
135
|
+
pre_cmd += (
|
|
136
|
+
' if [ -z "$AWS_REGION" ] && '
|
|
137
|
+
'[ -z "$AWS_DEFAULT_REGION" ]; then '
|
|
138
|
+
'echo "WARNING: AWS region not specified in configuration or '
|
|
139
|
+
'environment. CloudWatch logging may fail if the region '
|
|
140
|
+
'cannot be determined. Consider setting logs.aws.region in '
|
|
141
|
+
'SkyPilot config."; '
|
|
142
|
+
'fi; ')
|
|
143
|
+
|
|
144
|
+
# Add a test command to verify AWS credentials work with CloudWatch
|
|
145
|
+
pre_cmd += (
|
|
146
|
+
' echo "Verifying AWS CloudWatch access..."; '
|
|
147
|
+
'if command -v aws > /dev/null; then '
|
|
148
|
+
'aws cloudwatch list-metrics --namespace AWS/Logs --max-items 1 '
|
|
149
|
+
'> /dev/null 2>&1 || '
|
|
150
|
+
'{ echo "ERROR: Failed to access AWS CloudWatch. Please check '
|
|
151
|
+
'your credentials and permissions."; '
|
|
152
|
+
'echo "The IAM role or user must have cloudwatch:ListMetrics '
|
|
153
|
+
'and logs:* permissions."; '
|
|
154
|
+
'exit 1; }; '
|
|
155
|
+
'else echo "AWS CLI not installed, skipping CloudWatch access '
|
|
156
|
+
'verification."; '
|
|
157
|
+
'fi; ')
|
|
158
|
+
|
|
159
|
+
return pre_cmd + ' ' + super().get_setup_command(cluster_name)
|
|
160
|
+
|
|
161
|
+
def fluentbit_config(self,
|
|
162
|
+
cluster_name: resources_utils.ClusterName) -> str:
|
|
163
|
+
"""Get the Fluent Bit configuration for CloudWatch.
|
|
164
|
+
|
|
165
|
+
This overrides the base method to add a fallback output for local file
|
|
166
|
+
logging in case CloudWatch logging fails.
|
|
167
|
+
|
|
168
|
+
Args:
|
|
169
|
+
cluster_name: The name of the cluster.
|
|
170
|
+
|
|
171
|
+
Returns:
|
|
172
|
+
The Fluent Bit configuration as a YAML string.
|
|
173
|
+
"""
|
|
174
|
+
display_name = cluster_name.display_name
|
|
175
|
+
unique_name = cluster_name.name_on_cloud
|
|
176
|
+
# Build tags for the log stream
|
|
177
|
+
tags = {
|
|
178
|
+
'skypilot.cluster_name': display_name,
|
|
179
|
+
'skypilot.cluster_id': unique_name,
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
# Add additional tags if provided
|
|
183
|
+
if self.config.additional_tags:
|
|
184
|
+
tags.update(self.config.additional_tags)
|
|
185
|
+
|
|
186
|
+
log_processors = []
|
|
187
|
+
for key, value in tags.items():
|
|
188
|
+
log_processors.append({
|
|
189
|
+
'name': 'content_modifier',
|
|
190
|
+
'action': 'upsert',
|
|
191
|
+
'key': key,
|
|
192
|
+
'value': value
|
|
193
|
+
})
|
|
194
|
+
|
|
195
|
+
cfg_dict = {
|
|
196
|
+
'pipeline': {
|
|
197
|
+
'inputs': [{
|
|
198
|
+
'name': 'tail',
|
|
199
|
+
'path': f'{constants.SKY_LOGS_DIRECTORY}/*/*.log',
|
|
200
|
+
'path_key': 'log_path',
|
|
201
|
+
# Shorten the refresh interval from 60s to 1s since every
|
|
202
|
+
# job creates a new log file and we must be responsive
|
|
203
|
+
# for this: the VM might be autodown within a minute
|
|
204
|
+
# right after the job completion.
|
|
205
|
+
'refresh_interval': 1,
|
|
206
|
+
'processors': {
|
|
207
|
+
'logs': log_processors,
|
|
208
|
+
}
|
|
209
|
+
}],
|
|
210
|
+
'outputs': [self.fluentbit_output_config(cluster_name)],
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
# Add fallback outputs for graceful failure handling
|
|
215
|
+
cfg_dict = self.add_fallback_outputs(cfg_dict)
|
|
216
|
+
|
|
217
|
+
return common_utils.dump_yaml_str(cfg_dict)
|
|
218
|
+
|
|
219
|
+
def add_fallback_outputs(self, cfg_dict: Dict[str, Any]) -> Dict[str, Any]:
|
|
220
|
+
"""Add fallback outputs to the Fluent Bit configuration.
|
|
221
|
+
|
|
222
|
+
This adds a local file output as a fallback in case
|
|
223
|
+
CloudWatch logging fails.
|
|
224
|
+
|
|
225
|
+
Args:
|
|
226
|
+
cfg_dict: The Fluent Bit configuration dictionary.
|
|
227
|
+
|
|
228
|
+
Returns:
|
|
229
|
+
The updated configuration dictionary.
|
|
230
|
+
"""
|
|
231
|
+
# Add a local file output as a fallback
|
|
232
|
+
fallback_output = {
|
|
233
|
+
'name': 'file',
|
|
234
|
+
'match': '*',
|
|
235
|
+
'path': '/tmp/skypilot_logs_fallback.log',
|
|
236
|
+
'format': 'out_file',
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
# Add the fallback output to the configuration
|
|
240
|
+
cfg_dict['pipeline']['outputs'].append(fallback_output)
|
|
241
|
+
|
|
242
|
+
return cfg_dict
|
|
243
|
+
|
|
244
|
+
def fluentbit_output_config(
|
|
245
|
+
self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
|
|
246
|
+
"""Get the Fluent Bit output configuration for CloudWatch.
|
|
247
|
+
|
|
248
|
+
Args:
|
|
249
|
+
cluster_name: The name of the cluster.
|
|
250
|
+
|
|
251
|
+
Returns:
|
|
252
|
+
The Fluent Bit output configuration for CloudWatch.
|
|
253
|
+
"""
|
|
254
|
+
unique_name = cluster_name.name_on_cloud
|
|
255
|
+
|
|
256
|
+
# Format the log stream name to include cluster information
|
|
257
|
+
# This helps with identifying logs in CloudWatch
|
|
258
|
+
log_stream_prefix = f'{self.config.log_stream_prefix}{unique_name}-'
|
|
259
|
+
|
|
260
|
+
# Create the CloudWatch output configuration with error handling options
|
|
261
|
+
return _CloudWatchOutputConfig(
|
|
262
|
+
region=self.config.region,
|
|
263
|
+
log_group_name=self.config.log_group_name,
|
|
264
|
+
log_stream_prefix=log_stream_prefix,
|
|
265
|
+
auto_create_group=self.config.auto_create_group,
|
|
266
|
+
).to_dict()
|
|
267
|
+
|
|
268
|
+
def get_credential_file_mounts(self) -> Dict[str, str]:
|
|
269
|
+
"""Get the credential file mounts for the CloudWatch logging agent.
|
|
270
|
+
|
|
271
|
+
Returns:
|
|
272
|
+
A dictionary mapping local credential file paths to remote paths.
|
|
273
|
+
"""
|
|
274
|
+
if self.config.credentials_file:
|
|
275
|
+
return {self.config.credentials_file: self.config.credentials_file}
|
|
276
|
+
return {}
|
sky/server/common.py
CHANGED
|
@@ -132,6 +132,8 @@ def get_api_cookie_jar() -> requests.cookies.RequestsCookieJar:
|
|
|
132
132
|
def set_api_cookie_jar(cookie_jar: CookieJar,
|
|
133
133
|
create_if_not_exists: bool = True) -> None:
|
|
134
134
|
"""Updates the file cookie jar with the given cookie jar."""
|
|
135
|
+
if len(cookie_jar) == 0:
|
|
136
|
+
return
|
|
135
137
|
cookie_path = get_api_cookie_jar_path()
|
|
136
138
|
if not cookie_path.exists() and not create_if_not_exists:
|
|
137
139
|
# if the file doesn't exist and we don't want to create it, do nothing
|
|
@@ -351,7 +353,9 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
|
351
353
|
error=version_info.error)
|
|
352
354
|
|
|
353
355
|
cookies = get_cookies_from_response(response)
|
|
354
|
-
|
|
356
|
+
# Save or refresh the cookie jar in case of session affinity and
|
|
357
|
+
# OAuth.
|
|
358
|
+
set_api_cookie_jar(cookies, create_if_not_exists=True)
|
|
355
359
|
return server_info
|
|
356
360
|
except (json.JSONDecodeError, AttributeError) as e:
|
|
357
361
|
# Try to check if we got redirected to a login page.
|
sky/server/requests/payloads.py
CHANGED
|
@@ -203,17 +203,33 @@ class DagRequestBody(RequestBody):
|
|
|
203
203
|
return kwargs
|
|
204
204
|
|
|
205
205
|
|
|
206
|
-
class ValidateBody(DagRequestBody):
|
|
206
|
+
class DagRequestBodyWithRequestOptions(DagRequestBody):
|
|
207
|
+
"""Request body base class for endpoints with a dag and request options."""
|
|
208
|
+
request_options: Optional[admin_policy.RequestOptions]
|
|
209
|
+
|
|
210
|
+
def get_request_options(self) -> Optional[admin_policy.RequestOptions]:
|
|
211
|
+
"""Get the request options."""
|
|
212
|
+
if self.request_options is None:
|
|
213
|
+
return None
|
|
214
|
+
if isinstance(self.request_options, dict):
|
|
215
|
+
return admin_policy.RequestOptions(**self.request_options)
|
|
216
|
+
return self.request_options
|
|
217
|
+
|
|
218
|
+
def to_kwargs(self) -> Dict[str, Any]:
|
|
219
|
+
kwargs = super().to_kwargs()
|
|
220
|
+
kwargs['request_options'] = self.get_request_options()
|
|
221
|
+
return kwargs
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
class ValidateBody(DagRequestBodyWithRequestOptions):
|
|
207
225
|
"""The request body for the validate endpoint."""
|
|
208
226
|
dag: str
|
|
209
|
-
request_options: Optional[admin_policy.RequestOptions]
|
|
210
227
|
|
|
211
228
|
|
|
212
|
-
class OptimizeBody(DagRequestBody):
|
|
229
|
+
class OptimizeBody(DagRequestBodyWithRequestOptions):
|
|
213
230
|
"""The request body for the optimize endpoint."""
|
|
214
231
|
dag: str
|
|
215
232
|
minimize: common_lib.OptimizeTarget = common_lib.OptimizeTarget.COST
|
|
216
|
-
request_options: Optional[admin_policy.RequestOptions]
|
|
217
233
|
|
|
218
234
|
|
|
219
235
|
class LaunchBody(RequestBody):
|
sky/server/rest.py
CHANGED
|
@@ -89,6 +89,12 @@ def retry_transient_errors(max_retries: int = 3,
|
|
|
89
89
|
for retry_cnt in range(max_retries):
|
|
90
90
|
try:
|
|
91
91
|
return func(*args, **kwargs)
|
|
92
|
+
# Occurs when the server proactively interrupts the request
|
|
93
|
+
# during rolling update, we can retry immediately on the
|
|
94
|
+
# new replica.
|
|
95
|
+
except exceptions.RequestInterruptedError:
|
|
96
|
+
logger.debug('Request interrupted. Retry immediately.')
|
|
97
|
+
continue
|
|
92
98
|
except Exception as e: # pylint: disable=broad-except
|
|
93
99
|
if retry_cnt >= max_retries - 1:
|
|
94
100
|
# Retries exhausted.
|
sky/server/server.py
CHANGED
|
@@ -827,7 +827,8 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
|
|
|
827
827
|
# added RTTs. For now, we stick to doing the validation inline in the
|
|
828
828
|
# server thread.
|
|
829
829
|
with admin_policy_utils.apply_and_use_config_in_current_request(
|
|
830
|
-
dag,
|
|
830
|
+
dag,
|
|
831
|
+
request_options=validate_body.get_request_options()) as dag:
|
|
831
832
|
# Skip validating workdir and file_mounts, as those need to be
|
|
832
833
|
# validated after the files are uploaded to the SkyPilot API server
|
|
833
834
|
# with `upload_mounts_to_api_server`.
|
sky/utils/config_utils.py
CHANGED
|
@@ -6,6 +6,8 @@ from sky import sky_logging
|
|
|
6
6
|
|
|
7
7
|
logger = sky_logging.init_logger(__name__)
|
|
8
8
|
|
|
9
|
+
_REGION_CONFIG_CLOUDS = ['nebius', 'oci']
|
|
10
|
+
|
|
9
11
|
|
|
10
12
|
class Config(Dict[str, Any]):
|
|
11
13
|
"""SkyPilot config that supports setting/getting values with nested keys."""
|
|
@@ -248,7 +250,7 @@ def get_cloud_config_value_from_dict(
|
|
|
248
250
|
region_key = None
|
|
249
251
|
if cloud == 'kubernetes':
|
|
250
252
|
region_key = 'context_configs'
|
|
251
|
-
|
|
253
|
+
elif cloud in _REGION_CONFIG_CLOUDS:
|
|
252
254
|
region_key = 'region_configs'
|
|
253
255
|
|
|
254
256
|
per_context_config = None
|
|
@@ -257,7 +259,7 @@ def get_cloud_config_value_from_dict(
|
|
|
257
259
|
keys=(cloud, region_key, region) + keys,
|
|
258
260
|
default_value=None,
|
|
259
261
|
override_configs=override_configs)
|
|
260
|
-
if not per_context_config and cloud
|
|
262
|
+
if not per_context_config and cloud in _REGION_CONFIG_CLOUDS:
|
|
261
263
|
# TODO (kyuds): Backward compatibility, remove after 0.11.0.
|
|
262
264
|
per_context_config = input_config.get_nested(
|
|
263
265
|
keys=(cloud, region) + keys,
|
|
@@ -265,9 +267,9 @@ def get_cloud_config_value_from_dict(
|
|
|
265
267
|
override_configs=override_configs)
|
|
266
268
|
if per_context_config is not None:
|
|
267
269
|
logger.info(
|
|
268
|
-
'
|
|
270
|
+
f'{cloud} configuration is using the legacy format. \n'
|
|
269
271
|
'This format will be deprecated after 0.11.0, refer to '
|
|
270
|
-
'`https://docs.skypilot.co/en/latest/reference/config.html
|
|
272
|
+
'`https://docs.skypilot.co/en/latest/reference/config.html` ' # pylint: disable=line-too-long
|
|
271
273
|
'for the new format. Please use `region_configs` to specify region specific configuration.'
|
|
272
274
|
)
|
|
273
275
|
# if no override found for specified region
|
sky/utils/rich_utils.py
CHANGED
|
@@ -368,9 +368,8 @@ def decode_rich_status(
|
|
|
368
368
|
continue
|
|
369
369
|
|
|
370
370
|
if control == Control.RETRY:
|
|
371
|
-
raise exceptions.
|
|
372
|
-
'
|
|
373
|
-
'again.')
|
|
371
|
+
raise exceptions.RequestInterruptedError(
|
|
372
|
+
'Streaming interrupted. Please retry.')
|
|
374
373
|
# control is not None, i.e. it is a rich status control message.
|
|
375
374
|
if threading.current_thread() is not threading.main_thread():
|
|
376
375
|
yield None
|