skypilot-nightly 1.0.0.dev20250617__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +7 -0
- sky/backends/cloud_vm_ray_backend.py +48 -36
- sky/cli.py +5 -5729
- sky/client/cli.py +11 -2
- sky/client/sdk.py +22 -2
- sky/clouds/kubernetes.py +5 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{vA3PPpkBwpRTRNBHFYAw_ → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +50 -11
- sky/logs/__init__.py +17 -0
- sky/logs/agent.py +73 -0
- sky/logs/gcp.py +91 -0
- sky/models.py +1 -0
- sky/provision/instance_setup.py +35 -0
- sky/provision/provisioner.py +11 -0
- sky/server/common.py +21 -9
- sky/server/requests/payloads.py +19 -1
- sky/server/server.py +121 -29
- sky/setup_files/dependencies.py +11 -1
- sky/skylet/constants.py +9 -1
- sky/skylet/job_lib.py +75 -19
- sky/templates/kubernetes-ray.yml.j2 +9 -0
- sky/users/permission.py +49 -19
- sky/users/rbac.py +10 -1
- sky/users/server.py +274 -9
- sky/utils/schemas.py +40 -0
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +58 -54
- sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
- sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
- /sky/dashboard/out/_next/static/{vA3PPpkBwpRTRNBHFYAw_ → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250617.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ebc2404fd6ce581c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c416e87d5c2715cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"LRpGymRCqq-feuFyoWz4m","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ebc2404fd6ce581c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c416e87d5c2715cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/760-a89d354797ce7af5.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/947-6620842ef80ae879.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-4d1a5dbe58a8a2b9.js" defer=""></script><script src="/dashboard/_next/static/chunks/901-b424d293275e1fd7.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-20d54a9d998dc102.js" defer=""></script><script src="/dashboard/_next/static/chunks/856-c2c39c0912285e54.js" defer=""></script><script src="/dashboard/_next/static/chunks/973-db3c97c2bfbceb65.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-1493ac755eadeb35.js" defer=""></script><script src="/dashboard/_next/static/chunks/843-b3040e493f6e7947.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-c4ff1ec05e2f3daf.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"LRpGymRCqq-feuFyoWz4m","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ebc2404fd6ce581c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c416e87d5c2715cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"LRpGymRCqq-feuFyoWz4m","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/global_user_state.py
CHANGED
@@ -64,6 +64,7 @@ user_table = sqlalchemy.Table(
|
|
64
64
|
Base.metadata,
|
65
65
|
sqlalchemy.Column('id', sqlalchemy.Text, primary_key=True),
|
66
66
|
sqlalchemy.Column('name', sqlalchemy.Text),
|
67
|
+
sqlalchemy.Column('password', sqlalchemy.Text),
|
67
68
|
)
|
68
69
|
|
69
70
|
cluster_table = sqlalchemy.Table(
|
@@ -301,6 +302,12 @@ def create_table():
|
|
301
302
|
'last_creation_command',
|
302
303
|
sqlalchemy.Text(),
|
303
304
|
default_statement='DEFAULT NULL')
|
305
|
+
db_utils.add_column_to_table_sqlalchemy(
|
306
|
+
session,
|
307
|
+
'users',
|
308
|
+
'password',
|
309
|
+
sqlalchemy.Text(),
|
310
|
+
default_statement='DEFAULT NULL')
|
304
311
|
session.commit()
|
305
312
|
|
306
313
|
|
@@ -358,7 +365,9 @@ def add_or_update_user(user: models.User) -> bool:
|
|
358
365
|
|
359
366
|
# First try INSERT OR IGNORE - this won't fail if user exists
|
360
367
|
insert_stmnt = insert_func(user_table).prefix_with(
|
361
|
-
'OR IGNORE').values(id=user.id,
|
368
|
+
'OR IGNORE').values(id=user.id,
|
369
|
+
name=user.name,
|
370
|
+
password=user.password)
|
362
371
|
result = session.execute(insert_stmnt)
|
363
372
|
|
364
373
|
# Check if the INSERT actually inserted a row
|
@@ -366,8 +375,14 @@ def add_or_update_user(user: models.User) -> bool:
|
|
366
375
|
|
367
376
|
if not was_inserted:
|
368
377
|
# User existed, so update it
|
369
|
-
|
370
|
-
|
378
|
+
if user.password:
|
379
|
+
session.query(user_table).filter_by(id=user.id).update({
|
380
|
+
user_table.c.name: user.name,
|
381
|
+
user_table.c.password: user.password
|
382
|
+
})
|
383
|
+
else:
|
384
|
+
session.query(user_table).filter_by(id=user.id).update(
|
385
|
+
{user_table.c.name: user.name})
|
371
386
|
|
372
387
|
session.commit()
|
373
388
|
return was_inserted
|
@@ -377,15 +392,19 @@ def add_or_update_user(user: models.User) -> bool:
|
|
377
392
|
# For PostgreSQL, use INSERT ... ON CONFLICT with RETURNING to
|
378
393
|
# detect insert vs update
|
379
394
|
insert_func = postgresql.insert
|
380
|
-
insert_stmnt = insert_func(user_table).values(
|
381
|
-
|
395
|
+
insert_stmnt = insert_func(user_table).values(
|
396
|
+
id=user.id, name=user.name, password=user.password)
|
382
397
|
|
383
398
|
# Use a sentinel in the RETURNING clause to detect insert vs update
|
399
|
+
if user.password:
|
400
|
+
set_ = {
|
401
|
+
user_table.c.name: user.name,
|
402
|
+
user_table.c.password: user.password
|
403
|
+
}
|
404
|
+
else:
|
405
|
+
set_ = {user_table.c.name: user.name}
|
384
406
|
upsert_stmnt = insert_stmnt.on_conflict_do_update(
|
385
|
-
index_elements=[user_table.c.id],
|
386
|
-
set_={
|
387
|
-
user_table.c.name: user.name
|
388
|
-
}).returning(
|
407
|
+
index_elements=[user_table.c.id], set_=set_).returning(
|
389
408
|
user_table.c.id,
|
390
409
|
# This will be True for INSERT, False for UPDATE
|
391
410
|
sqlalchemy.literal_column('(xmax = 0)').label('was_inserted'
|
@@ -407,7 +426,24 @@ def get_user(user_id: str) -> Optional[models.User]:
|
|
407
426
|
row = session.query(user_table).filter_by(id=user_id).first()
|
408
427
|
if row is None:
|
409
428
|
return None
|
410
|
-
return models.User(id=row.id, name=row.name)
|
429
|
+
return models.User(id=row.id, name=row.name, password=row.password)
|
430
|
+
|
431
|
+
|
432
|
+
def get_user_by_name(username: str) -> List[models.User]:
|
433
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
434
|
+
rows = session.query(user_table).filter_by(name=username).all()
|
435
|
+
if len(rows) == 0:
|
436
|
+
return []
|
437
|
+
return [
|
438
|
+
models.User(id=row.id, name=row.name, password=row.password)
|
439
|
+
for row in rows
|
440
|
+
]
|
441
|
+
|
442
|
+
|
443
|
+
def delete_user(user_id: str) -> None:
|
444
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
445
|
+
session.query(user_table).filter_by(id=user_id).delete()
|
446
|
+
session.commit()
|
411
447
|
|
412
448
|
|
413
449
|
@_init_db
|
@@ -415,7 +451,10 @@ def get_all_users() -> List[models.User]:
|
|
415
451
|
assert _SQLALCHEMY_ENGINE is not None
|
416
452
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
417
453
|
rows = session.query(user_table).all()
|
418
|
-
return [
|
454
|
+
return [
|
455
|
+
models.User(id=row.id, name=row.name, password=row.password)
|
456
|
+
for row in rows
|
457
|
+
]
|
419
458
|
|
420
459
|
|
421
460
|
@_init_db
|
sky/logs/__init__.py
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
"""Sky logging agents."""
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
from sky import exceptions
|
5
|
+
from sky import skypilot_config
|
6
|
+
from sky.logs.agent import LoggingAgent
|
7
|
+
from sky.logs.gcp import GCPLoggingAgent
|
8
|
+
|
9
|
+
|
10
|
+
def get_logging_agent() -> Optional[LoggingAgent]:
|
11
|
+
store = skypilot_config.get_nested(('logs', 'store'), None)
|
12
|
+
if store is None:
|
13
|
+
return None
|
14
|
+
if store == 'gcp':
|
15
|
+
return GCPLoggingAgent(skypilot_config.get_nested(('logs', 'gcp'), {}))
|
16
|
+
raise exceptions.InvalidSkyPilotConfigError(
|
17
|
+
f'Invalid logging store: {store}')
|
sky/logs/agent.py
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
"""Base class for all logging agents."""
|
2
|
+
import abc
|
3
|
+
import os
|
4
|
+
import shlex
|
5
|
+
from typing import Any, Dict
|
6
|
+
|
7
|
+
from sky.skylet import constants
|
8
|
+
from sky.utils import common_utils
|
9
|
+
from sky.utils import resources_utils
|
10
|
+
|
11
|
+
|
12
|
+
class LoggingAgent(abc.ABC):
|
13
|
+
"""Base class for all logging agents.
|
14
|
+
|
15
|
+
Each agent should implement the `get_setup_command` and
|
16
|
+
`get_credential_file_mounts` methods to return the setup command and
|
17
|
+
credential file mounts for the agent for provisioner to setup the agent
|
18
|
+
on each node.
|
19
|
+
"""
|
20
|
+
|
21
|
+
@abc.abstractmethod
|
22
|
+
def get_setup_command(self,
|
23
|
+
cluster_name: resources_utils.ClusterName) -> str:
|
24
|
+
pass
|
25
|
+
|
26
|
+
@abc.abstractmethod
|
27
|
+
def get_credential_file_mounts(self) -> Dict[str, str]:
|
28
|
+
pass
|
29
|
+
|
30
|
+
|
31
|
+
class FluentbitAgent(LoggingAgent):
|
32
|
+
"""Base class for logging store that use fluentbit as the agent."""
|
33
|
+
|
34
|
+
def get_setup_command(self,
|
35
|
+
cluster_name: resources_utils.ClusterName) -> str:
|
36
|
+
install_cmd = (
|
37
|
+
'if ! command -v fluent-bit >/dev/null 2>&1; then '
|
38
|
+
'sudo apt-get install -y gnupg; '
|
39
|
+
# pylint: disable=line-too-long
|
40
|
+
'curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh; '
|
41
|
+
'fi')
|
42
|
+
cfg = self.fluentbit_config(cluster_name)
|
43
|
+
cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
|
44
|
+
config_cmd = (f'mkdir -p {constants.LOGGING_CONFIG_DIR} && '
|
45
|
+
f'echo {shlex.quote(cfg)} > {cfg_path}')
|
46
|
+
start_cmd = ('nohup $(command -v fluent-bit || '
|
47
|
+
'echo "/opt/fluent-bit/bin/fluent-bit") '
|
48
|
+
f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 &')
|
49
|
+
return f'set -e; {install_cmd}; {config_cmd}; {start_cmd}'
|
50
|
+
|
51
|
+
def fluentbit_config(self,
|
52
|
+
cluster_name: resources_utils.ClusterName) -> str:
|
53
|
+
cfg_dict = {
|
54
|
+
'pipeline': {
|
55
|
+
'inputs': [{
|
56
|
+
'name': 'tail',
|
57
|
+
'path': f'{constants.SKY_LOGS_DIRECTORY}/*/*.log',
|
58
|
+
'path_key': 'log_path',
|
59
|
+
# Shorten the refresh interval from 60s to 1s since every
|
60
|
+
# job creates a new log file and we must be responsive
|
61
|
+
# for this: the VM might be autodown within a minute
|
62
|
+
# right after the job completion.
|
63
|
+
'refresh_interval': 1,
|
64
|
+
}],
|
65
|
+
'outputs': [self.fluentbit_output_config(cluster_name)],
|
66
|
+
}
|
67
|
+
}
|
68
|
+
return common_utils.dump_yaml_str(cfg_dict)
|
69
|
+
|
70
|
+
@abc.abstractmethod
|
71
|
+
def fluentbit_output_config(
|
72
|
+
self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
|
73
|
+
pass
|
sky/logs/gcp.py
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
"""GCP logging agent."""
|
2
|
+
|
3
|
+
from typing import Any, Dict, Optional
|
4
|
+
|
5
|
+
import pydantic
|
6
|
+
|
7
|
+
from sky.clouds import gcp
|
8
|
+
from sky.logs.agent import FluentbitAgent
|
9
|
+
from sky.utils import resources_utils
|
10
|
+
|
11
|
+
|
12
|
+
class _GCPLoggingConfig(pydantic.BaseModel):
|
13
|
+
"""Configuration for GCP logging agent."""
|
14
|
+
project_id: Optional[str] = None
|
15
|
+
credentials_file: Optional[str] = None
|
16
|
+
additional_labels: Optional[Dict[str, str]] = None
|
17
|
+
|
18
|
+
|
19
|
+
class _StackdriverOutputConfig(pydantic.BaseModel):
|
20
|
+
"""Auxiliary model for building stackdriver output config in YAML.
|
21
|
+
|
22
|
+
Ref: https://docs.fluentbit.io/manual/1.7/pipeline/outputs/stackdriver
|
23
|
+
"""
|
24
|
+
name: str = 'stackdriver'
|
25
|
+
match: str = '*'
|
26
|
+
export_to_project_id: Optional[str] = None
|
27
|
+
labels: Optional[Dict[str, str]] = None
|
28
|
+
|
29
|
+
def to_dict(self) -> Dict[str, Any]:
|
30
|
+
config = self.model_dump(exclude_none=True)
|
31
|
+
if self.labels:
|
32
|
+
# Replace the label format from `{k: v}` to `k=v`
|
33
|
+
label_str = ','.join([f'{k}={v}' for k, v in self.labels.items()])
|
34
|
+
config['labels'] = label_str
|
35
|
+
return config
|
36
|
+
|
37
|
+
|
38
|
+
class GCPLoggingAgent(FluentbitAgent):
|
39
|
+
"""GCP logging agent."""
|
40
|
+
|
41
|
+
def __init__(self, config: Dict[str, Any]):
|
42
|
+
self.config = _GCPLoggingConfig(**config)
|
43
|
+
|
44
|
+
def get_setup_command(self,
|
45
|
+
cluster_name: resources_utils.ClusterName) -> str:
|
46
|
+
credential_path = gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH
|
47
|
+
if self.config.credentials_file:
|
48
|
+
credential_path = self.config.credentials_file
|
49
|
+
# Set GOOGLE_APPLICATION_CREDENTIALS and check whether credentials
|
50
|
+
# is valid.
|
51
|
+
# Stackdriver only support service account credentials or credentials
|
52
|
+
# from metadata server (only available on GCE or GKE). If the default
|
53
|
+
# credentials uploaded by API server is NOT a service account key and
|
54
|
+
# there is NO metadata server available, the logging agent will fail to
|
55
|
+
# authenticate and we require the user to upload a service account key
|
56
|
+
# via logs.gcp.credentials_file in this case.
|
57
|
+
# Also note that we use env var instead of YAML config to specify the
|
58
|
+
# service account key file path in order to resolve the home directory
|
59
|
+
# more reliably.
|
60
|
+
# Ref: https://github.com/fluent/fluent-bit/issues/8804
|
61
|
+
# TODO(aylei): check whether the credentials config is valid before
|
62
|
+
# provision.
|
63
|
+
pre_cmd = (f'export GOOGLE_APPLICATION_CREDENTIALS={credential_path}; '
|
64
|
+
f'cat {credential_path} | grep "service_account" || '
|
65
|
+
f'(echo "Credentials file {credential_path} is not a '
|
66
|
+
'service account key, check metadata server" && '
|
67
|
+
'curl -s http://metadata.google.internal >/dev/null || '
|
68
|
+
f'(echo "Neither service account key nor metadata server is '
|
69
|
+
'available. Set logs.gcp.credentials_file to a service '
|
70
|
+
'account key in server config and retry." && '
|
71
|
+
'exit 1;))')
|
72
|
+
return pre_cmd + ' && ' + super().get_setup_command(cluster_name)
|
73
|
+
|
74
|
+
def fluentbit_output_config(
|
75
|
+
self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
|
76
|
+
display_name = cluster_name.display_name
|
77
|
+
unique_name = cluster_name.name_on_cloud
|
78
|
+
|
79
|
+
return _StackdriverOutputConfig(
|
80
|
+
export_to_project_id=self.config.project_id,
|
81
|
+
labels={
|
82
|
+
'skypilot_cluster_name': display_name,
|
83
|
+
'skypilot_cluster_id': unique_name,
|
84
|
+
**(self.config.additional_labels or {})
|
85
|
+
},
|
86
|
+
).to_dict()
|
87
|
+
|
88
|
+
def get_credential_file_mounts(self) -> Dict[str, str]:
|
89
|
+
if self.config.credentials_file:
|
90
|
+
return {self.config.credentials_file: self.config.credentials_file}
|
91
|
+
return {}
|
sky/models.py
CHANGED
sky/provision/instance_setup.py
CHANGED
@@ -8,6 +8,7 @@ import time
|
|
8
8
|
from typing import Any, Callable, Dict, List, Optional, Tuple
|
9
9
|
|
10
10
|
from sky import exceptions
|
11
|
+
from sky import logs
|
11
12
|
from sky import provision
|
12
13
|
from sky import sky_logging
|
13
14
|
from sky.provision import common
|
@@ -21,6 +22,7 @@ from sky.utils import accelerator_registry
|
|
21
22
|
from sky.utils import command_runner
|
22
23
|
from sky.utils import common_utils
|
23
24
|
from sky.utils import env_options
|
25
|
+
from sky.utils import resources_utils
|
24
26
|
from sky.utils import subprocess_utils
|
25
27
|
from sky.utils import timeline
|
26
28
|
from sky.utils import ux_utils
|
@@ -557,3 +559,36 @@ def internal_file_mounts(cluster_name: str, common_file_mounts: Dict[str, str],
|
|
557
559
|
ssh_credentials=ssh_credentials,
|
558
560
|
max_workers=subprocess_utils.get_max_workers_for_file_mounts(
|
559
561
|
common_file_mounts, cluster_info.provider_name))
|
562
|
+
|
563
|
+
|
564
|
+
@common.log_function_start_end
|
565
|
+
@timeline.event
|
566
|
+
def setup_logging_on_cluster(logging_agent: logs.LoggingAgent,
|
567
|
+
cluster_name: resources_utils.ClusterName,
|
568
|
+
cluster_info: common.ClusterInfo,
|
569
|
+
ssh_credentials: Dict[str, Any]) -> None:
|
570
|
+
"""Setup logging agent (fluentbit) on all nodes after provisioning."""
|
571
|
+
_hint_worker_log_path(cluster_name.name_on_cloud, cluster_info,
|
572
|
+
'logging_setup')
|
573
|
+
|
574
|
+
@_auto_retry()
|
575
|
+
def _setup_node(runner: command_runner.CommandRunner, log_path: str):
|
576
|
+
cmd = logging_agent.get_setup_command(cluster_name)
|
577
|
+
logger.info(f'Running command on node: {cmd}')
|
578
|
+
returncode, stdout, stderr = runner.run(cmd,
|
579
|
+
stream_logs=False,
|
580
|
+
require_outputs=True,
|
581
|
+
log_path=log_path,
|
582
|
+
source_bashrc=True)
|
583
|
+
if returncode:
|
584
|
+
raise RuntimeError(f'Failed to setup logging agent\n{cmd}\n'
|
585
|
+
f'(exit code {returncode}). Error: '
|
586
|
+
f'===== stdout ===== \n{stdout}\n'
|
587
|
+
f'===== stderr ====={stderr}')
|
588
|
+
|
589
|
+
_parallel_ssh_with_cache(_setup_node,
|
590
|
+
cluster_name.name_on_cloud,
|
591
|
+
stage_name='logging_setup',
|
592
|
+
digest=None,
|
593
|
+
cluster_info=cluster_info,
|
594
|
+
ssh_credentials=ssh_credentials)
|
sky/provision/provisioner.py
CHANGED
@@ -16,6 +16,7 @@ import sky
|
|
16
16
|
from sky import clouds
|
17
17
|
from sky import exceptions
|
18
18
|
from sky import global_user_state
|
19
|
+
from sky import logs
|
19
20
|
from sky import provision
|
20
21
|
from sky import sky_logging
|
21
22
|
from sky import skypilot_config
|
@@ -648,6 +649,15 @@ def _post_provision_setup(
|
|
648
649
|
logger.debug('Ray cluster is ready. Skip starting ray cluster on '
|
649
650
|
'worker nodes.')
|
650
651
|
|
652
|
+
logging_agent = logs.get_logging_agent()
|
653
|
+
if logging_agent:
|
654
|
+
status.update(
|
655
|
+
ux_utils.spinner_message('Setting up logging agent',
|
656
|
+
provision_logging.config.log_path))
|
657
|
+
instance_setup.setup_logging_on_cluster(logging_agent, cluster_name,
|
658
|
+
cluster_info,
|
659
|
+
ssh_credentials)
|
660
|
+
|
651
661
|
instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
|
652
662
|
cluster_info, ssh_credentials)
|
653
663
|
|
@@ -672,6 +682,7 @@ def post_provision_runtime_setup(
|
|
672
682
|
and other necessary files to the VM.
|
673
683
|
3. Run setup commands to install dependencies.
|
674
684
|
4. Start ray cluster and skylet.
|
685
|
+
5. (Optional) Setup logging agent.
|
675
686
|
|
676
687
|
Raises:
|
677
688
|
RuntimeError: If the setup process encounters any error.
|
sky/server/common.py
CHANGED
@@ -13,7 +13,7 @@ import subprocess
|
|
13
13
|
import sys
|
14
14
|
import time
|
15
15
|
import typing
|
16
|
-
from typing import Any, Dict, Literal, Optional
|
16
|
+
from typing import Any, Dict, Literal, Optional, Tuple
|
17
17
|
from urllib import parse
|
18
18
|
import uuid
|
19
19
|
|
@@ -128,6 +128,8 @@ class ApiServerInfo:
|
|
128
128
|
version: Optional[str] = None
|
129
129
|
version_on_disk: Optional[str] = None
|
130
130
|
commit: Optional[str] = None
|
131
|
+
user: Optional[Dict[str, Any]] = None
|
132
|
+
basic_auth_enabled: bool = False
|
131
133
|
|
132
134
|
|
133
135
|
def get_api_cookie_jar_path() -> pathlib.Path:
|
@@ -261,11 +263,15 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
261
263
|
version = result.get('version')
|
262
264
|
version_on_disk = result.get('version_on_disk')
|
263
265
|
commit = result.get('commit')
|
266
|
+
user = result.get('user')
|
267
|
+
basic_auth_enabled = result.get('basic_auth_enabled')
|
264
268
|
server_info = ApiServerInfo(status=ApiServerStatus.HEALTHY,
|
265
269
|
api_version=api_version,
|
266
270
|
version=version,
|
267
271
|
version_on_disk=version_on_disk,
|
268
|
-
commit=commit
|
272
|
+
commit=commit,
|
273
|
+
user=user,
|
274
|
+
basic_auth_enabled=basic_auth_enabled)
|
269
275
|
if api_version is None or version is None or commit is None:
|
270
276
|
logger.warning(f'API server response missing '
|
271
277
|
f'version info. {server_url} may '
|
@@ -320,7 +326,8 @@ def get_request_id(response: 'requests.Response') -> RequestId:
|
|
320
326
|
|
321
327
|
def _start_api_server(deploy: bool = False,
|
322
328
|
host: str = '127.0.0.1',
|
323
|
-
foreground: bool = False
|
329
|
+
foreground: bool = False,
|
330
|
+
enable_basic_auth: bool = False):
|
324
331
|
"""Starts a SkyPilot API server locally."""
|
325
332
|
server_url = get_server_url(host)
|
326
333
|
assert server_url in AVAILABLE_LOCAL_API_SERVER_URLS, (
|
@@ -354,6 +361,8 @@ def _start_api_server(deploy: bool = False,
|
|
354
361
|
if foreground:
|
355
362
|
# Replaces the current process with the API server
|
356
363
|
os.environ[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
|
364
|
+
if enable_basic_auth:
|
365
|
+
os.environ[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
|
357
366
|
os.execvp(args[0], args)
|
358
367
|
|
359
368
|
log_path = os.path.expanduser(constants.API_SERVER_LOGS)
|
@@ -365,6 +374,8 @@ def _start_api_server(deploy: bool = False,
|
|
365
374
|
# the API server.
|
366
375
|
server_env = os.environ.copy()
|
367
376
|
server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
|
377
|
+
if enable_basic_auth:
|
378
|
+
server_env[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
|
368
379
|
with open(log_path, 'w', encoding='utf-8') as log_file:
|
369
380
|
# Because the log file is opened using a with statement, it may seem
|
370
381
|
# that the file will be closed when the with statement is exited
|
@@ -428,10 +439,10 @@ def _start_api_server(deploy: bool = False,
|
|
428
439
|
|
429
440
|
def check_server_healthy(
|
430
441
|
endpoint: Optional[str] = None
|
431
|
-
) -> Literal[
|
442
|
+
) -> Tuple[Literal[
|
432
443
|
# Use an incomplete list of Literals here to enforce raising for other
|
433
444
|
# enum values.
|
434
|
-
ApiServerStatus.HEALTHY, ApiServerStatus.NEEDS_AUTH]:
|
445
|
+
ApiServerStatus.HEALTHY, ApiServerStatus.NEEDS_AUTH], ApiServerInfo]:
|
435
446
|
"""Check if the API server is healthy.
|
436
447
|
|
437
448
|
Args:
|
@@ -508,7 +519,7 @@ def check_server_healthy(
|
|
508
519
|
|
509
520
|
hinted_for_server_install_version_mismatch = True
|
510
521
|
|
511
|
-
return api_server_status
|
522
|
+
return api_server_status, api_server_info
|
512
523
|
|
513
524
|
|
514
525
|
def _get_version_info_hint(server_info: ApiServerInfo) -> str:
|
@@ -559,10 +570,11 @@ def get_skypilot_version_on_disk() -> str:
|
|
559
570
|
|
560
571
|
def check_server_healthy_or_start_fn(deploy: bool = False,
|
561
572
|
host: str = '127.0.0.1',
|
562
|
-
foreground: bool = False
|
573
|
+
foreground: bool = False,
|
574
|
+
enable_basic_auth: bool = False):
|
563
575
|
api_server_status = None
|
564
576
|
try:
|
565
|
-
api_server_status = check_server_healthy()
|
577
|
+
api_server_status, _ = check_server_healthy()
|
566
578
|
if api_server_status == ApiServerStatus.NEEDS_AUTH:
|
567
579
|
endpoint = get_server_url()
|
568
580
|
with ux_utils.print_exception_no_traceback():
|
@@ -580,7 +592,7 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
|
|
580
592
|
# have started the server while we were waiting for the lock.
|
581
593
|
api_server_info = get_api_server_status(endpoint)
|
582
594
|
if api_server_info.status == ApiServerStatus.UNHEALTHY:
|
583
|
-
_start_api_server(deploy, host, foreground)
|
595
|
+
_start_api_server(deploy, host, foreground, enable_basic_auth)
|
584
596
|
|
585
597
|
|
586
598
|
def check_server_healthy_or_start(func):
|
sky/server/requests/payloads.py
CHANGED
@@ -336,10 +336,28 @@ class ClusterJobsDownloadLogsBody(RequestBody):
|
|
336
336
|
local_dir: str = constants.SKY_LOGS_DIRECTORY
|
337
337
|
|
338
338
|
|
339
|
+
class UserCreateBody(RequestBody):
|
340
|
+
"""The request body for the user create endpoint."""
|
341
|
+
username: str
|
342
|
+
password: str
|
343
|
+
role: Optional[str] = None
|
344
|
+
|
345
|
+
|
346
|
+
class UserDeleteBody(RequestBody):
|
347
|
+
"""The request body for the user delete endpoint."""
|
348
|
+
user_id: str
|
349
|
+
|
350
|
+
|
339
351
|
class UserUpdateBody(RequestBody):
|
340
352
|
"""The request body for the user update endpoint."""
|
341
353
|
user_id: str
|
342
|
-
role: str
|
354
|
+
role: Optional[str] = None
|
355
|
+
password: Optional[str] = None
|
356
|
+
|
357
|
+
|
358
|
+
class UserImportBody(RequestBody):
|
359
|
+
"""The request body for the user import endpoint."""
|
360
|
+
csv_content: str
|
343
361
|
|
344
362
|
|
345
363
|
class DownloadBody(RequestBody):
|