skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -4
- sky/backends/backend_utils.py +7 -0
- sky/backends/cloud_vm_ray_backend.py +91 -96
- sky/cli.py +5 -6311
- sky/client/cli.py +66 -639
- sky/client/sdk.py +22 -2
- sky/clouds/kubernetes.py +8 -0
- sky/clouds/scp.py +7 -26
- sky/clouds/utils/scp_utils.py +177 -124
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +50 -11
- sky/jobs/controller.py +98 -31
- sky/jobs/scheduler.py +37 -29
- sky/jobs/server/core.py +36 -3
- sky/jobs/state.py +69 -9
- sky/jobs/utils.py +11 -0
- sky/logs/__init__.py +17 -0
- sky/logs/agent.py +73 -0
- sky/logs/gcp.py +91 -0
- sky/models.py +1 -0
- sky/provision/__init__.py +1 -0
- sky/provision/instance_setup.py +35 -0
- sky/provision/provisioner.py +11 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +528 -0
- sky/resources.py +164 -29
- sky/server/common.py +21 -9
- sky/server/requests/payloads.py +19 -1
- sky/server/server.py +121 -29
- sky/setup_files/dependencies.py +11 -1
- sky/skylet/constants.py +48 -1
- sky/skylet/job_lib.py +83 -19
- sky/task.py +171 -21
- sky/templates/kubernetes-ray.yml.j2 +60 -4
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/users/permission.py +47 -34
- sky/users/rbac.py +10 -1
- sky/users/server.py +274 -9
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +16 -14
- sky/utils/context.py +1 -1
- sky/utils/controller_utils.py +12 -3
- sky/utils/dag_utils.py +17 -4
- sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
- sky/utils/schemas.py +83 -5
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
- sky/benchmark/__init__.py +0 -0
- sky/benchmark/benchmark_state.py +0 -295
- sky/benchmark/benchmark_utils.py +0 -641
- sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
- sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
sky/skylet/constants.py
CHANGED
@@ -89,7 +89,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
89
89
|
# cluster yaml is updated.
|
90
90
|
#
|
91
91
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
92
|
-
SKYLET_VERSION = '
|
92
|
+
SKYLET_VERSION = '14'
|
93
93
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
94
94
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
95
95
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
@@ -396,6 +396,10 @@ ROLE_ASSIGNMENT_FAILURE_ERROR_MSG = (
|
|
396
396
|
# persistent through PVC. See kubernetes-ray.yml.j2.
|
397
397
|
PERSISTENT_SETUP_SCRIPT_PATH = '~/.sky/.controller_recovery_setup_commands.sh'
|
398
398
|
PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
|
399
|
+
# Signal file to indicate that the controller is recovering from a failure.
|
400
|
+
# See sky/jobs/utils.py::update_managed_jobs_statuses for more details.
|
401
|
+
PERSISTENT_RUN_RESTARTING_SIGNAL_FILE = (
|
402
|
+
'~/.sky/.controller_recovery_restarting_signal')
|
399
403
|
|
400
404
|
# The placeholder for the local skypilot config path in file mounts for
|
401
405
|
# controllers.
|
@@ -407,6 +411,11 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
|
|
407
411
|
# Environment variable that is set to 'true' if this is a skypilot server.
|
408
412
|
ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
|
409
413
|
|
414
|
+
# Environment variable that is set to 'true' if basic
|
415
|
+
# authentication is enabled in the API server.
|
416
|
+
ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
|
417
|
+
SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
|
418
|
+
|
410
419
|
SKYPILOT_DEFAULT_WORKSPACE = 'default'
|
411
420
|
|
412
421
|
# BEGIN constants used for service catalog.
|
@@ -421,3 +430,41 @@ ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
|
|
421
430
|
|
422
431
|
# The user ID of the SkyPilot system.
|
423
432
|
SKYPILOT_SYSTEM_USER_ID = 'skypilot-system'
|
433
|
+
|
434
|
+
# The directory to store the logging configuration.
|
435
|
+
LOGGING_CONFIG_DIR = '~/.sky/logging'
|
436
|
+
|
437
|
+
# Resources constants
|
438
|
+
TIME_UNITS = {
|
439
|
+
's': 1 / 60,
|
440
|
+
'sec': 1 / 60,
|
441
|
+
'm': 1,
|
442
|
+
'min': 1,
|
443
|
+
'h': 60,
|
444
|
+
'hr': 60,
|
445
|
+
'd': 24 * 60,
|
446
|
+
'day': 24 * 60,
|
447
|
+
}
|
448
|
+
|
449
|
+
TIME_PATTERN: str = (
|
450
|
+
f'^[0-9]+({"|".join([unit.lower() for unit in TIME_UNITS])})?$/i')
|
451
|
+
|
452
|
+
MEMORY_SIZE_UNITS = {
|
453
|
+
'b': 1,
|
454
|
+
'k': 2**10,
|
455
|
+
'kb': 2**10,
|
456
|
+
'm': 2**20,
|
457
|
+
'mb': 2**20,
|
458
|
+
'g': 2**30,
|
459
|
+
'gb': 2**30,
|
460
|
+
't': 2**40,
|
461
|
+
'tb': 2**40,
|
462
|
+
'p': 2**50,
|
463
|
+
'pb': 2**50,
|
464
|
+
}
|
465
|
+
|
466
|
+
MEMORY_SIZE_PATTERN = (
|
467
|
+
'^[0-9]+('
|
468
|
+
f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}'
|
469
|
+
')?$/i')
|
470
|
+
MEMORY_SIZE_PLUS_PATTERN = f'{MEMORY_SIZE_PATTERN[:-3]}+?$/i'
|
sky/skylet/job_lib.py
CHANGED
@@ -14,7 +14,7 @@ import sqlite3
|
|
14
14
|
import threading
|
15
15
|
import time
|
16
16
|
import typing
|
17
|
-
from typing import Any, Dict, List, Optional, Sequence
|
17
|
+
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
18
18
|
|
19
19
|
import colorama
|
20
20
|
import filelock
|
@@ -62,6 +62,7 @@ class JobInfoLoc(enum.IntEnum):
|
|
62
62
|
END_AT = 7
|
63
63
|
RESOURCES = 8
|
64
64
|
PID = 9
|
65
|
+
LOG_PATH = 10
|
65
66
|
|
66
67
|
|
67
68
|
def create_table(cursor, conn):
|
@@ -101,7 +102,8 @@ def create_table(cursor, conn):
|
|
101
102
|
start_at FLOAT DEFAULT -1,
|
102
103
|
end_at FLOAT DEFAULT NULL,
|
103
104
|
resources TEXT DEFAULT NULL,
|
104
|
-
pid INTEGER DEFAULT -1
|
105
|
+
pid INTEGER DEFAULT -1,
|
106
|
+
log_dir TEXT DEFAULT NULL)""")
|
105
107
|
|
106
108
|
cursor.execute("""CREATE TABLE IF NOT EXISTS pending_jobs(
|
107
109
|
job_id INTEGER,
|
@@ -114,6 +116,8 @@ def create_table(cursor, conn):
|
|
114
116
|
db_utils.add_column_to_table(cursor, conn, 'jobs', 'resources', 'TEXT')
|
115
117
|
db_utils.add_column_to_table(cursor, conn, 'jobs', 'pid',
|
116
118
|
'INTEGER DEFAULT -1')
|
119
|
+
db_utils.add_column_to_table(cursor, conn, 'jobs', 'log_dir',
|
120
|
+
'TEXT DEFAULT NULL')
|
117
121
|
conn.commit()
|
118
122
|
|
119
123
|
|
@@ -335,13 +339,13 @@ def make_job_command_with_user_switching(username: str,
|
|
335
339
|
|
336
340
|
@init_db
|
337
341
|
def add_job(job_name: str, username: str, run_timestamp: str,
|
338
|
-
resources_str: str) -> int:
|
342
|
+
resources_str: str) -> Tuple[int, str]:
|
339
343
|
"""Atomically reserve the next available job id for the user."""
|
340
344
|
assert _DB is not None
|
341
345
|
job_submitted_at = time.time()
|
342
346
|
# job_id will autoincrement with the null value
|
343
347
|
_DB.cursor.execute(
|
344
|
-
'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0)',
|
348
|
+
'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null)',
|
345
349
|
(job_name, username, job_submitted_at, JobStatus.INIT.value,
|
346
350
|
run_timestamp, None, resources_str))
|
347
351
|
_DB.conn.commit()
|
@@ -350,7 +354,41 @@ def add_job(job_name: str, username: str, run_timestamp: str,
|
|
350
354
|
for row in rows:
|
351
355
|
job_id = row[0]
|
352
356
|
assert job_id is not None
|
353
|
-
|
357
|
+
log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, f'{job_id}-{job_name}')
|
358
|
+
set_log_dir_no_lock(job_id, log_dir)
|
359
|
+
return job_id, log_dir
|
360
|
+
|
361
|
+
|
362
|
+
@init_db
|
363
|
+
def set_log_dir_no_lock(job_id: int, log_dir: str) -> None:
|
364
|
+
"""Set the log directory for the job.
|
365
|
+
|
366
|
+
We persist the log directory for the job to allow changing the log directory
|
367
|
+
generation logic over versions.
|
368
|
+
|
369
|
+
Args:
|
370
|
+
job_id: The ID of the job.
|
371
|
+
log_dir: The log directory for the job.
|
372
|
+
"""
|
373
|
+
assert _DB is not None
|
374
|
+
_DB.cursor.execute('UPDATE jobs SET log_dir=(?) WHERE job_id=(?)',
|
375
|
+
(log_dir, job_id))
|
376
|
+
_DB.conn.commit()
|
377
|
+
|
378
|
+
|
379
|
+
@init_db
|
380
|
+
def get_log_dir_for_job(job_id: int) -> Optional[str]:
|
381
|
+
"""Get the log directory for the job.
|
382
|
+
|
383
|
+
Args:
|
384
|
+
job_id: The ID of the job.
|
385
|
+
"""
|
386
|
+
assert _DB is not None
|
387
|
+
rows = _DB.cursor.execute('SELECT log_dir FROM jobs WHERE job_id=(?)',
|
388
|
+
(job_id,))
|
389
|
+
for row in rows:
|
390
|
+
return row[0]
|
391
|
+
return None
|
354
392
|
|
355
393
|
|
356
394
|
@init_db
|
@@ -758,6 +796,14 @@ def fail_all_jobs_in_progress() -> None:
|
|
758
796
|
|
759
797
|
|
760
798
|
def update_status() -> None:
|
799
|
+
# This signal file suggests that the controller is recovering from a
|
800
|
+
# failure. See sky/jobs/utils.py::update_managed_jobs_statuses for more
|
801
|
+
# details. When recovering, we should not update the job status to failed
|
802
|
+
# driver as they will be recovered later.
|
803
|
+
if os.path.exists(
|
804
|
+
os.path.expanduser(
|
805
|
+
constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
|
806
|
+
return
|
761
807
|
# This will be called periodically by the skylet to update the status
|
762
808
|
# of the jobs in the database, to avoid stale job status.
|
763
809
|
nonterminal_jobs = _get_jobs(user_hash=None,
|
@@ -970,8 +1016,8 @@ def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:
|
|
970
1016
|
|
971
1017
|
|
972
1018
|
@init_db
|
973
|
-
def run_timestamp_with_globbing_payload(job_ids: List[Optional[str]]) -> str:
|
974
|
-
"""Returns the relative paths to the log files for
|
1019
|
+
def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
|
1020
|
+
"""Returns the relative paths to the log files for jobs with globbing."""
|
975
1021
|
assert _DB is not None
|
976
1022
|
query_str = ' OR '.join(['job_id GLOB (?)'] * len(job_ids))
|
977
1023
|
_DB.cursor.execute(
|
@@ -979,12 +1025,16 @@ def run_timestamp_with_globbing_payload(job_ids: List[Optional[str]]) -> str:
|
|
979
1025
|
SELECT * FROM jobs
|
980
1026
|
WHERE {query_str}""", job_ids)
|
981
1027
|
rows = _DB.cursor.fetchall()
|
982
|
-
|
1028
|
+
job_to_dir = {}
|
983
1029
|
for row in rows:
|
984
1030
|
job_id = row[JobInfoLoc.JOB_ID.value]
|
985
|
-
|
986
|
-
|
987
|
-
|
1031
|
+
if row[JobInfoLoc.LOG_PATH.value]:
|
1032
|
+
job_to_dir[str(job_id)] = row[JobInfoLoc.LOG_PATH.value]
|
1033
|
+
else:
|
1034
|
+
run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
|
1035
|
+
job_to_dir[str(job_id)] = os.path.join(constants.SKY_LOGS_DIRECTORY,
|
1036
|
+
run_timestamp)
|
1037
|
+
return message_utils.encode_payload(job_to_dir)
|
988
1038
|
|
989
1039
|
|
990
1040
|
class JobLibCodeGen:
|
@@ -1016,12 +1066,16 @@ class JobLibCodeGen:
|
|
1016
1066
|
'\nif int(constants.SKYLET_VERSION) < 9: '
|
1017
1067
|
'raise RuntimeError("SkyPilot runtime is too old, which does not '
|
1018
1068
|
'support submitting jobs.")',
|
1019
|
-
'\
|
1069
|
+
'\nresult = job_lib.add_job('
|
1020
1070
|
f'{job_name!r},'
|
1021
1071
|
f'{username!r},'
|
1022
1072
|
f'{run_timestamp!r},'
|
1023
1073
|
f'{resources_str!r})',
|
1024
|
-
'
|
1074
|
+
('\nif isinstance(result, tuple):'
|
1075
|
+
'\n print("Job ID: " + str(result[0]), flush=True)'
|
1076
|
+
'\n print("Log Dir: " + str(result[1]), flush=True)'
|
1077
|
+
'\nelse:'
|
1078
|
+
'\n print("Job ID: " + str(result), flush=True)'),
|
1025
1079
|
]
|
1026
1080
|
return cls._build(code)
|
1027
1081
|
|
@@ -1090,9 +1144,17 @@ class JobLibCodeGen:
|
|
1090
1144
|
# We use != instead of is not because 1 is not None will print a warning:
|
1091
1145
|
# <stdin>:1: SyntaxWarning: "is not" with a literal. Did you mean "!="?
|
1092
1146
|
f'job_id = {job_id} if {job_id} != None else job_lib.get_latest_job_id()',
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1147
|
+
# For backward compatibility, use the legacy generation rule for
|
1148
|
+
# jobs submitted before 0.11.0.
|
1149
|
+
('log_dir = None\n'
|
1150
|
+
'if hasattr(job_lib, "get_log_dir_for_job"):\n'
|
1151
|
+
' log_dir = job_lib.get_log_dir_for_job(job_id)\n'
|
1152
|
+
'if log_dir is None:\n'
|
1153
|
+
' run_timestamp = job_lib.get_run_timestamp(job_id)\n'
|
1154
|
+
f' log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)'
|
1155
|
+
),
|
1156
|
+
# Add a newline to leave the if indent block above.
|
1157
|
+
f'\ntail_log_kwargs = {{"job_id": job_id, "log_dir": log_dir, "managed_job_id": {managed_job_id!r}, "follow": {follow}}}',
|
1096
1158
|
f'{_LINUX_NEW_LINE}if getattr(constants, "SKYLET_LIB_VERSION", 1) > 1: tail_log_kwargs["tail"] = {tail}',
|
1097
1159
|
f'{_LINUX_NEW_LINE}log_lib.tail_logs(**tail_log_kwargs)',
|
1098
1160
|
# After tailing, check the job status and exit with appropriate code
|
@@ -1132,12 +1194,14 @@ class JobLibCodeGen:
|
|
1132
1194
|
return cls._build(code)
|
1133
1195
|
|
1134
1196
|
@classmethod
|
1135
|
-
def
|
1136
|
-
job_ids: Optional[List[str]]) -> str:
|
1197
|
+
def get_log_dirs_for_jobs(cls, job_ids: Optional[List[str]]) -> str:
|
1137
1198
|
code = [
|
1138
1199
|
f'job_ids = {job_ids} if {job_ids} is not None '
|
1139
1200
|
'else [job_lib.get_latest_job_id()]',
|
1140
|
-
|
1201
|
+
# TODO(aylei): backward compatibility, remove after 0.12.0.
|
1202
|
+
'log_dirs = job_lib.get_log_dir_for_jobs(job_ids) if '
|
1203
|
+
'hasattr(job_lib, "get_log_dir_for_jobs") else '
|
1204
|
+
'job_lib.run_timestamp_with_globbing_payload(job_ids)',
|
1141
1205
|
'print(log_dirs, flush=True)',
|
1142
1206
|
]
|
1143
1207
|
return cls._build(code)
|
sky/task.py
CHANGED
@@ -121,27 +121,61 @@ def _fill_in_env_vars(
|
|
121
121
|
return json.loads(yaml_field_str)
|
122
122
|
|
123
123
|
|
124
|
-
def _check_docker_login_config(task_envs: Dict[str, str]
|
125
|
-
|
124
|
+
def _check_docker_login_config(task_envs: Dict[str, str],
|
125
|
+
task_secrets: Dict[str, str]) -> bool:
|
126
|
+
"""Validates a valid docker login config in task_envs and task_secrets.
|
126
127
|
|
127
|
-
|
128
|
+
Docker login variables must be specified together either in envs OR secrets,
|
129
|
+
not split across both. If any of the docker login env vars is set, all of
|
130
|
+
them must be set in the same location.
|
131
|
+
|
132
|
+
Args:
|
133
|
+
task_envs: Environment variables
|
134
|
+
task_secrets: Secret variables (None is treated as an empty dict)
|
128
135
|
|
129
136
|
Returns:
|
130
|
-
True if there is a valid docker login config
|
137
|
+
True if there is a valid docker login config.
|
131
138
|
False otherwise.
|
132
139
|
Raises:
|
133
|
-
ValueError: if
|
134
|
-
them are set.
|
140
|
+
ValueError: if docker login configuration is invalid.
|
135
141
|
"""
|
142
|
+
if task_secrets is None:
|
143
|
+
task_secrets = {}
|
144
|
+
|
136
145
|
all_keys = constants.DOCKER_LOGIN_ENV_VARS
|
137
|
-
|
138
|
-
|
146
|
+
envs_keys = all_keys & set(task_envs.keys())
|
147
|
+
secrets_keys = all_keys & set(task_secrets.keys())
|
148
|
+
|
149
|
+
# Check if any docker variables exist
|
150
|
+
if not envs_keys and not secrets_keys:
|
139
151
|
return False
|
140
|
-
|
152
|
+
|
153
|
+
# Check if variables are split across envs and secrets
|
154
|
+
if envs_keys and secrets_keys:
|
141
155
|
with ux_utils.print_exception_no_traceback():
|
142
156
|
raise ValueError(
|
143
|
-
|
144
|
-
|
157
|
+
'Docker login variables must be specified together either '
|
158
|
+
'in envs OR secrets, not split across both. '
|
159
|
+
f'Found in envs: {sorted(envs_keys)}, '
|
160
|
+
f'Found in secrets: {sorted(secrets_keys)}')
|
161
|
+
|
162
|
+
# Check if all variables are present in the chosen location
|
163
|
+
if envs_keys:
|
164
|
+
if len(envs_keys) != len(all_keys):
|
165
|
+
with ux_utils.print_exception_no_traceback():
|
166
|
+
raise ValueError(
|
167
|
+
'Docker login variables must be specified together '
|
168
|
+
'in envs. '
|
169
|
+
f'Missing from envs: {sorted(all_keys - envs_keys)}')
|
170
|
+
|
171
|
+
if secrets_keys:
|
172
|
+
if len(secrets_keys) != len(all_keys):
|
173
|
+
with ux_utils.print_exception_no_traceback():
|
174
|
+
raise ValueError(
|
175
|
+
'Docker login variables must be specified together '
|
176
|
+
'in secrets. '
|
177
|
+
f'Missing from secrets: {sorted(all_keys - secrets_keys)}')
|
178
|
+
|
145
179
|
return True
|
146
180
|
|
147
181
|
|
@@ -149,11 +183,13 @@ def _with_docker_login_config(
|
|
149
183
|
resources: Union[Set['resources_lib.Resources'],
|
150
184
|
List['resources_lib.Resources']],
|
151
185
|
task_envs: Dict[str, str],
|
186
|
+
task_secrets: Dict[str, str],
|
152
187
|
) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
|
153
|
-
if not _check_docker_login_config(task_envs):
|
188
|
+
if not _check_docker_login_config(task_envs, task_secrets):
|
154
189
|
return resources
|
155
|
-
|
156
|
-
|
190
|
+
envs = task_envs.copy()
|
191
|
+
envs.update(task_secrets)
|
192
|
+
docker_login_config = docker_utils.DockerLoginConfig.from_env_vars(envs)
|
157
193
|
|
158
194
|
def _add_docker_login_config(resources: 'resources_lib.Resources'):
|
159
195
|
docker_image = resources.extract_docker_image()
|
@@ -181,8 +217,11 @@ def _with_docker_username_for_runpod(
|
|
181
217
|
resources: Union[Set['resources_lib.Resources'],
|
182
218
|
List['resources_lib.Resources']],
|
183
219
|
task_envs: Dict[str, str],
|
220
|
+
task_secrets: Dict[str, str],
|
184
221
|
) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
|
185
|
-
|
222
|
+
envs = task_envs.copy()
|
223
|
+
envs.update(task_secrets)
|
224
|
+
docker_username_for_runpod = envs.get(
|
186
225
|
constants.RUNPOD_DOCKER_USERNAME_ENV_VAR)
|
187
226
|
|
188
227
|
# We should not call r.copy() if docker_username_for_runpod is None,
|
@@ -204,6 +243,7 @@ class Task:
|
|
204
243
|
setup: Optional[str] = None,
|
205
244
|
run: Optional[CommandOrCommandGen] = None,
|
206
245
|
envs: Optional[Dict[str, str]] = None,
|
246
|
+
secrets: Optional[Dict[str, str]] = None,
|
207
247
|
workdir: Optional[str] = None,
|
208
248
|
num_nodes: Optional[int] = None,
|
209
249
|
# Advanced:
|
@@ -254,6 +294,9 @@ class Task:
|
|
254
294
|
self-contained lambda.
|
255
295
|
envs: A dictionary of environment variables to set before running the
|
256
296
|
setup and run commands.
|
297
|
+
secrets: A dictionary of secret environment variables to set before
|
298
|
+
running the setup and run commands. These will be redacted in logs
|
299
|
+
and YAML output.
|
257
300
|
workdir: The local working directory. This directory will be synced
|
258
301
|
to a location on the remote VM(s), and ``setup`` and ``run``
|
259
302
|
commands will be run under that location (thus, they can rely on
|
@@ -275,6 +318,13 @@ class Task:
|
|
275
318
|
storage_lib.StoreType] = {}
|
276
319
|
self.setup = setup
|
277
320
|
self._envs = envs or {}
|
321
|
+
self._secrets = secrets or {}
|
322
|
+
|
323
|
+
# Validate Docker login configuration early if either envs or secrets
|
324
|
+
# contain Docker variables
|
325
|
+
if self._envs or self._secrets:
|
326
|
+
_check_docker_login_config(self._envs, self._secrets)
|
327
|
+
|
278
328
|
self.workdir = workdir
|
279
329
|
self.docker_image = (docker_image if docker_image else
|
280
330
|
'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04')
|
@@ -447,6 +497,7 @@ class Task:
|
|
447
497
|
def from_yaml_config(
|
448
498
|
config: Dict[str, Any],
|
449
499
|
env_overrides: Optional[List[Tuple[str, str]]] = None,
|
500
|
+
secrets_overrides: Optional[List[Tuple[str, str]]] = None,
|
450
501
|
) -> 'Task':
|
451
502
|
# More robust handling for 'envs': explicitly convert keys and values to
|
452
503
|
# str, since users may pass '123' as keys/values which will get parsed
|
@@ -460,6 +511,20 @@ class Task:
|
|
460
511
|
else:
|
461
512
|
new_envs[str(k)] = None
|
462
513
|
config['envs'] = new_envs
|
514
|
+
|
515
|
+
# More robust handling for 'secrets': explicitly convert keys and values
|
516
|
+
# to str, since users may pass '123' as keys/values which will get
|
517
|
+
# parsed as int causing validate_schema() to fail.
|
518
|
+
secrets = config.get('secrets')
|
519
|
+
if secrets is not None and isinstance(secrets, dict):
|
520
|
+
new_secrets: Dict[str, Optional[str]] = {}
|
521
|
+
for k, v in secrets.items():
|
522
|
+
if v is not None:
|
523
|
+
new_secrets[str(k)] = str(v)
|
524
|
+
else:
|
525
|
+
new_secrets[str(k)] = None
|
526
|
+
config['secrets'] = new_secrets
|
527
|
+
|
463
528
|
common_utils.validate_schema(config, schemas.get_task_schema(),
|
464
529
|
'Invalid task YAML: ')
|
465
530
|
if env_overrides is not None:
|
@@ -473,6 +538,12 @@ class Task:
|
|
473
538
|
new_envs.update(env_overrides)
|
474
539
|
config['envs'] = new_envs
|
475
540
|
|
541
|
+
if secrets_overrides is not None:
|
542
|
+
# Override secrets vars from CLI.
|
543
|
+
new_secrets = config.get('secrets', {})
|
544
|
+
new_secrets.update(secrets_overrides)
|
545
|
+
config['secrets'] = new_secrets
|
546
|
+
|
476
547
|
for k, v in config.get('envs', {}).items():
|
477
548
|
if v is None:
|
478
549
|
with ux_utils.print_exception_no_traceback():
|
@@ -482,6 +553,15 @@ class Task:
|
|
482
553
|
f'To set it to be empty, use an empty string ({k}: "" '
|
483
554
|
f'in task YAML or --env {k}="" in CLI).')
|
484
555
|
|
556
|
+
for k, v in config.get('secrets', {}).items():
|
557
|
+
if v is None:
|
558
|
+
with ux_utils.print_exception_no_traceback():
|
559
|
+
raise ValueError(
|
560
|
+
f'Secret variable {k!r} is None. Please set a '
|
561
|
+
'value for it in task YAML or with --secret flag. '
|
562
|
+
f'To set it to be empty, use an empty string ({k}: "" '
|
563
|
+
f'in task YAML or --secret {k}="" in CLI).')
|
564
|
+
|
485
565
|
# Fill in any Task.envs into file_mounts (src/dst paths, storage
|
486
566
|
# name/source).
|
487
567
|
if config.get('file_mounts') is not None:
|
@@ -505,6 +585,7 @@ class Task:
|
|
505
585
|
setup=config.pop('setup', None),
|
506
586
|
num_nodes=config.pop('num_nodes', None),
|
507
587
|
envs=config.pop('envs', None),
|
588
|
+
secrets=config.pop('secrets', None),
|
508
589
|
event_callback=config.pop('event_callback', None),
|
509
590
|
file_mounts_mapping=config.pop('file_mounts_mapping', None),
|
510
591
|
)
|
@@ -687,6 +768,10 @@ class Task:
|
|
687
768
|
def envs(self) -> Dict[str, str]:
|
688
769
|
return self._envs
|
689
770
|
|
771
|
+
@property
|
772
|
+
def secrets(self) -> Dict[str, str]:
|
773
|
+
return self._secrets
|
774
|
+
|
690
775
|
def update_envs(
|
691
776
|
self, envs: Union[None, List[Tuple[str, str]],
|
692
777
|
Dict[str, str]]) -> 'Task':
|
@@ -727,17 +812,70 @@ class Task:
|
|
727
812
|
# If the update_envs() is called after set_resources(), we need to
|
728
813
|
# manually update docker login config in task resources, in case the
|
729
814
|
# docker login envs are newly added.
|
730
|
-
if _check_docker_login_config(self._envs):
|
815
|
+
if _check_docker_login_config(self._envs, self._secrets):
|
731
816
|
self.resources = _with_docker_login_config(self.resources,
|
732
|
-
self._envs
|
817
|
+
self._envs,
|
818
|
+
self._secrets)
|
733
819
|
self.resources = _with_docker_username_for_runpod(
|
734
|
-
self.resources, self._envs)
|
820
|
+
self.resources, self._envs, self._secrets)
|
821
|
+
return self
|
822
|
+
|
823
|
+
def update_secrets(
|
824
|
+
self, secrets: Union[None, List[Tuple[str, str]],
|
825
|
+
Dict[str, str]]) -> 'Task':
|
826
|
+
"""Updates secret env vars for use inside the setup/run commands.
|
827
|
+
|
828
|
+
Args:
|
829
|
+
secrets: (optional) either a list of ``(secret_name, value)`` or a
|
830
|
+
dict ``{secret_name: value}``.
|
831
|
+
|
832
|
+
Returns:
|
833
|
+
self: The current task, with secrets updated.
|
834
|
+
|
835
|
+
Raises:
|
836
|
+
ValueError: if various invalid inputs errors are detected.
|
837
|
+
"""
|
838
|
+
if secrets is None:
|
839
|
+
secrets = {}
|
840
|
+
if isinstance(secrets, (list, tuple)):
|
841
|
+
keys = set(secret[0] for secret in secrets)
|
842
|
+
if len(keys) != len(secrets):
|
843
|
+
with ux_utils.print_exception_no_traceback():
|
844
|
+
raise ValueError('Duplicate secret keys provided.')
|
845
|
+
secrets = dict(secrets)
|
846
|
+
if isinstance(secrets, dict):
|
847
|
+
for key in secrets:
|
848
|
+
if not isinstance(key, str):
|
849
|
+
with ux_utils.print_exception_no_traceback():
|
850
|
+
raise ValueError('Secret keys must be strings.')
|
851
|
+
if not common_utils.is_valid_env_var(key):
|
852
|
+
with ux_utils.print_exception_no_traceback():
|
853
|
+
raise ValueError(f'Invalid secret key: {key}')
|
854
|
+
else:
|
855
|
+
with ux_utils.print_exception_no_traceback():
|
856
|
+
raise ValueError(
|
857
|
+
'secrets must be List[Tuple[str, str]] or Dict[str, str]: '
|
858
|
+
f'{secrets}')
|
859
|
+
self._secrets.update(secrets)
|
860
|
+
# Validate Docker login configuration if needed
|
861
|
+
if _check_docker_login_config(self._envs, self._secrets):
|
862
|
+
self.resources = _with_docker_login_config(self.resources,
|
863
|
+
self._envs,
|
864
|
+
self._secrets)
|
865
|
+
self.resources = _with_docker_username_for_runpod(
|
866
|
+
self.resources, self._envs, self._secrets)
|
735
867
|
return self
|
736
868
|
|
737
869
|
@property
|
738
870
|
def use_spot(self) -> bool:
|
739
871
|
return any(r.use_spot for r in self.resources)
|
740
872
|
|
873
|
+
@property
|
874
|
+
def envs_and_secrets(self) -> Dict[str, str]:
|
875
|
+
envs = self.envs.copy()
|
876
|
+
envs.update(self.secrets)
|
877
|
+
return envs
|
878
|
+
|
741
879
|
def set_inputs(self, inputs: str,
|
742
880
|
estimated_size_gigabytes: float) -> 'Task':
|
743
881
|
# E.g., 's3://bucket', 'gs://bucket', or None.
|
@@ -796,10 +934,11 @@ class Task:
|
|
796
934
|
if isinstance(resources, sky.Resources):
|
797
935
|
resources = {resources}
|
798
936
|
# TODO(woosuk): Check if the resources are None.
|
799
|
-
self.resources = _with_docker_login_config(resources, self.envs
|
937
|
+
self.resources = _with_docker_login_config(resources, self.envs,
|
938
|
+
self.secrets)
|
800
939
|
# Only have effect on RunPod.
|
801
940
|
self.resources = _with_docker_username_for_runpod(
|
802
|
-
self.resources, self.envs)
|
941
|
+
self.resources, self.envs, self.secrets)
|
803
942
|
|
804
943
|
# Evaluate if the task requires FUSE and set the requires_fuse flag
|
805
944
|
for _, storage_obj in self.storage_mounts.items():
|
@@ -1266,7 +1405,7 @@ class Task:
|
|
1266
1405
|
d[k] = v
|
1267
1406
|
return d
|
1268
1407
|
|
1269
|
-
def to_yaml_config(self) -> Dict[str, Any]:
|
1408
|
+
def to_yaml_config(self, redact_secrets: bool = True) -> Dict[str, Any]:
|
1270
1409
|
"""Returns a yaml-style dict representation of the task.
|
1271
1410
|
|
1272
1411
|
INTERNAL: this method is internal-facing.
|
@@ -1314,8 +1453,19 @@ class Task:
|
|
1314
1453
|
add_if_not_none('workdir', self.workdir)
|
1315
1454
|
add_if_not_none('event_callback', self.event_callback)
|
1316
1455
|
add_if_not_none('run', self.run)
|
1456
|
+
|
1457
|
+
# Add envs without redaction
|
1317
1458
|
add_if_not_none('envs', self.envs, no_empty=True)
|
1318
1459
|
|
1460
|
+
# Add secrets with redaction if requested
|
1461
|
+
secrets = self.secrets
|
1462
|
+
if secrets and redact_secrets:
|
1463
|
+
secrets = {
|
1464
|
+
k: '<redacted>' if isinstance(v, str) else v
|
1465
|
+
for k, v in secrets.items()
|
1466
|
+
}
|
1467
|
+
add_if_not_none('secrets', secrets, no_empty=True)
|
1468
|
+
|
1319
1469
|
add_if_not_none('file_mounts', {})
|
1320
1470
|
|
1321
1471
|
if self.file_mounts is not None:
|