skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (92)
  1. sky/__init__.py +2 -4
  2. sky/backends/backend_utils.py +7 -0
  3. sky/backends/cloud_vm_ray_backend.py +91 -96
  4. sky/cli.py +5 -6311
  5. sky/client/cli.py +66 -639
  6. sky/client/sdk.py +22 -2
  7. sky/clouds/kubernetes.py +8 -0
  8. sky/clouds/scp.py +7 -26
  9. sky/clouds/utils/scp_utils.py +177 -124
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
  14. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
  16. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
  18. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
  19. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  20. sky/dashboard/out/clusters/[cluster].html +1 -1
  21. sky/dashboard/out/clusters.html +1 -1
  22. sky/dashboard/out/config.html +1 -1
  23. sky/dashboard/out/index.html +1 -1
  24. sky/dashboard/out/infra/[context].html +1 -1
  25. sky/dashboard/out/infra.html +1 -1
  26. sky/dashboard/out/jobs/[job].html +1 -1
  27. sky/dashboard/out/jobs.html +1 -1
  28. sky/dashboard/out/users.html +1 -1
  29. sky/dashboard/out/workspace/new.html +1 -1
  30. sky/dashboard/out/workspaces/[name].html +1 -1
  31. sky/dashboard/out/workspaces.html +1 -1
  32. sky/global_user_state.py +50 -11
  33. sky/jobs/controller.py +98 -31
  34. sky/jobs/scheduler.py +37 -29
  35. sky/jobs/server/core.py +36 -3
  36. sky/jobs/state.py +69 -9
  37. sky/jobs/utils.py +11 -0
  38. sky/logs/__init__.py +17 -0
  39. sky/logs/agent.py +73 -0
  40. sky/logs/gcp.py +91 -0
  41. sky/models.py +1 -0
  42. sky/provision/__init__.py +1 -0
  43. sky/provision/instance_setup.py +35 -0
  44. sky/provision/provisioner.py +11 -0
  45. sky/provision/scp/__init__.py +15 -0
  46. sky/provision/scp/config.py +93 -0
  47. sky/provision/scp/instance.py +528 -0
  48. sky/resources.py +164 -29
  49. sky/server/common.py +21 -9
  50. sky/server/requests/payloads.py +19 -1
  51. sky/server/server.py +121 -29
  52. sky/setup_files/dependencies.py +11 -1
  53. sky/skylet/constants.py +48 -1
  54. sky/skylet/job_lib.py +83 -19
  55. sky/task.py +171 -21
  56. sky/templates/kubernetes-ray.yml.j2 +60 -4
  57. sky/templates/scp-ray.yml.j2 +3 -50
  58. sky/users/permission.py +47 -34
  59. sky/users/rbac.py +10 -1
  60. sky/users/server.py +274 -9
  61. sky/utils/command_runner.py +1 -1
  62. sky/utils/common_utils.py +16 -14
  63. sky/utils/context.py +1 -1
  64. sky/utils/controller_utils.py +12 -3
  65. sky/utils/dag_utils.py +17 -4
  66. sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
  67. sky/utils/schemas.py +83 -5
  68. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
  69. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
  70. sky/benchmark/__init__.py +0 -0
  71. sky/benchmark/benchmark_state.py +0 -295
  72. sky/benchmark/benchmark_utils.py +0 -641
  73. sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
  74. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
  75. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
  76. sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
  77. sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
  78. sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
  79. sky/skylet/providers/scp/__init__.py +0 -2
  80. sky/skylet/providers/scp/config.py +0 -149
  81. sky/skylet/providers/scp/node_provider.py +0 -578
  82. /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
  83. /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
  84. /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
  85. /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
  86. /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
  87. /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
  88. /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
  89. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
  90. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
  91. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
  92. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
sky/skylet/constants.py CHANGED
@@ -89,7 +89,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '13'
+SKYLET_VERSION = '14'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
@@ -396,6 +396,10 @@ ROLE_ASSIGNMENT_FAILURE_ERROR_MSG = (
 # persistent through PVC. See kubernetes-ray.yml.j2.
 PERSISTENT_SETUP_SCRIPT_PATH = '~/.sky/.controller_recovery_setup_commands.sh'
 PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
+# Signal file to indicate that the controller is recovering from a failure.
+# See sky/jobs/utils.py::update_managed_jobs_statuses for more details.
+PERSISTENT_RUN_RESTARTING_SIGNAL_FILE = (
+    '~/.sky/.controller_recovery_restarting_signal')

 # The placeholder for the local skypilot config path in file mounts for
 # controllers.
@@ -407,6 +411,11 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
 # Environment variable that is set to 'true' if this is a skypilot server.
 ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'

+# Environment variable that is set to 'true' if basic
+# authentication is enabled in the API server.
+ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
+SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
+
 SKYPILOT_DEFAULT_WORKSPACE = 'default'

 # BEGIN constants used for service catalog.
@@ -421,3 +430,41 @@ ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',

 # The user ID of the SkyPilot system.
 SKYPILOT_SYSTEM_USER_ID = 'skypilot-system'
+
+# The directory to store the logging configuration.
+LOGGING_CONFIG_DIR = '~/.sky/logging'
+
+# Resources constants
+TIME_UNITS = {
+    's': 1 / 60,
+    'sec': 1 / 60,
+    'm': 1,
+    'min': 1,
+    'h': 60,
+    'hr': 60,
+    'd': 24 * 60,
+    'day': 24 * 60,
+}
+
+TIME_PATTERN: str = (
+    f'^[0-9]+({"|".join([unit.lower() for unit in TIME_UNITS])})?$/i')
+
+MEMORY_SIZE_UNITS = {
+    'b': 1,
+    'k': 2**10,
+    'kb': 2**10,
+    'm': 2**20,
+    'mb': 2**20,
+    'g': 2**30,
+    'gb': 2**30,
+    't': 2**40,
+    'tb': 2**40,
+    'p': 2**50,
+    'pb': 2**50,
+}
+
+MEMORY_SIZE_PATTERN = (
+    '^[0-9]+('
+    f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}'
+    ')?$/i')
+MEMORY_SIZE_PLUS_PATTERN = f'{MEMORY_SIZE_PATTERN[:-3]}+?$/i'
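
The new TIME_UNITS and MEMORY_SIZE_UNITS tables (with their case-insensitive "/i"-style patterns) appear to back the expanded time/memory handling in sky/resources.py and sky/utils/schemas.py listed above. As a rough illustration only, the sketch below shows how such a unit table can normalize a duration string to minutes; parse_time_minutes is a hypothetical helper written for this summary, not a SkyPilot API, and it assumes a bare number already means minutes because the table is normalized to minutes.

import re

# Mirror of the TIME_UNITS table added above (values are minutes per unit).
TIME_UNITS = {
    's': 1 / 60, 'sec': 1 / 60,
    'm': 1, 'min': 1,
    'h': 60, 'hr': 60,
    'd': 24 * 60, 'day': 24 * 60,
}

def parse_time_minutes(value: str) -> float:
    """Normalize strings like '90s', '45min', or '2h' to minutes."""
    match = re.fullmatch(r'([0-9]+)([a-z]+)?', value.strip(), re.IGNORECASE)
    if match is None or (match.group(2) and
                         match.group(2).lower() not in TIME_UNITS):
        raise ValueError(f'Invalid time value: {value!r}')
    amount, unit = match.groups()
    # Assumption: a bare number is already in minutes (the table's base unit).
    return int(amount) * (TIME_UNITS[unit.lower()] if unit else 1)

assert parse_time_minutes('2h') == 120
assert parse_time_minutes('90s') == 1.5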
sky/skylet/job_lib.py CHANGED
@@ -14,7 +14,7 @@ import sqlite3
 import threading
 import time
 import typing
-from typing import Any, Dict, List, Optional, Sequence
+from typing import Any, Dict, List, Optional, Sequence, Tuple

 import colorama
 import filelock
@@ -62,6 +62,7 @@ class JobInfoLoc(enum.IntEnum):
     END_AT = 7
     RESOURCES = 8
     PID = 9
+    LOG_PATH = 10


 def create_table(cursor, conn):
@@ -101,7 +102,8 @@ def create_table(cursor, conn):
         start_at FLOAT DEFAULT -1,
         end_at FLOAT DEFAULT NULL,
         resources TEXT DEFAULT NULL,
-        pid INTEGER DEFAULT -1)""")
+        pid INTEGER DEFAULT -1,
+        log_dir TEXT DEFAULT NULL)""")

     cursor.execute("""CREATE TABLE IF NOT EXISTS pending_jobs(
         job_id INTEGER,
@@ -114,6 +116,8 @@ def create_table(cursor, conn):
     db_utils.add_column_to_table(cursor, conn, 'jobs', 'resources', 'TEXT')
     db_utils.add_column_to_table(cursor, conn, 'jobs', 'pid',
                                  'INTEGER DEFAULT -1')
+    db_utils.add_column_to_table(cursor, conn, 'jobs', 'log_dir',
+                                 'TEXT DEFAULT NULL')
     conn.commit()


@@ -335,13 +339,13 @@ def make_job_command_with_user_switching(username: str,

 @init_db
 def add_job(job_name: str, username: str, run_timestamp: str,
-            resources_str: str) -> int:
+            resources_str: str) -> Tuple[int, str]:
     """Atomically reserve the next available job id for the user."""
     assert _DB is not None
     job_submitted_at = time.time()
     # job_id will autoincrement with the null value
     _DB.cursor.execute(
-        'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0)',
+        'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null)',
         (job_name, username, job_submitted_at, JobStatus.INIT.value,
          run_timestamp, None, resources_str))
     _DB.conn.commit()
@@ -350,7 +354,41 @@ def add_job(job_name: str, username: str, run_timestamp: str,
     for row in rows:
         job_id = row[0]
     assert job_id is not None
-    return job_id
+    log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, f'{job_id}-{job_name}')
+    set_log_dir_no_lock(job_id, log_dir)
+    return job_id, log_dir
+
+
+@init_db
+def set_log_dir_no_lock(job_id: int, log_dir: str) -> None:
+    """Set the log directory for the job.
+
+    We persist the log directory for the job to allow changing the log directory
+    generation logic over versions.
+
+    Args:
+        job_id: The ID of the job.
+        log_dir: The log directory for the job.
+    """
+    assert _DB is not None
+    _DB.cursor.execute('UPDATE jobs SET log_dir=(?) WHERE job_id=(?)',
+                       (log_dir, job_id))
+    _DB.conn.commit()
+
+
+@init_db
+def get_log_dir_for_job(job_id: int) -> Optional[str]:
+    """Get the log directory for the job.
+
+    Args:
+        job_id: The ID of the job.
+    """
+    assert _DB is not None
+    rows = _DB.cursor.execute('SELECT log_dir FROM jobs WHERE job_id=(?)',
+                              (job_id,))
+    for row in rows:
+        return row[0]
+    return None


 @init_db
@@ -758,6 +796,14 @@ def fail_all_jobs_in_progress() -> None:


 def update_status() -> None:
+    # This signal file suggests that the controller is recovering from a
+    # failure. See sky/jobs/utils.py::update_managed_jobs_statuses for more
+    # details. When recovering, we should not update the job status to failed
+    # driver as they will be recovered later.
+    if os.path.exists(
+            os.path.expanduser(
+                constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
+        return
     # This will be called periodically by the skylet to update the status
     # of the jobs in the database, to avoid stale job status.
     nonterminal_jobs = _get_jobs(user_hash=None,
@@ -970,8 +1016,8 @@ def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:


 @init_db
-def run_timestamp_with_globbing_payload(job_ids: List[Optional[str]]) -> str:
-    """Returns the relative paths to the log files for job with globbing."""
+def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
+    """Returns the relative paths to the log files for jobs with globbing."""
     assert _DB is not None
     query_str = ' OR '.join(['job_id GLOB (?)'] * len(job_ids))
     _DB.cursor.execute(
@@ -979,12 +1025,16 @@ def run_timestamp_with_globbing_payload(job_ids: List[Optional[str]]) -> str:
         SELECT * FROM jobs
         WHERE {query_str}""", job_ids)
     rows = _DB.cursor.fetchall()
-    run_timestamps = {}
+    job_to_dir = {}
     for row in rows:
         job_id = row[JobInfoLoc.JOB_ID.value]
-        run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
-        run_timestamps[str(job_id)] = run_timestamp
-    return message_utils.encode_payload(run_timestamps)
+        if row[JobInfoLoc.LOG_PATH.value]:
+            job_to_dir[str(job_id)] = row[JobInfoLoc.LOG_PATH.value]
+        else:
+            run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
+            job_to_dir[str(job_id)] = os.path.join(constants.SKY_LOGS_DIRECTORY,
+                                                   run_timestamp)
+    return message_utils.encode_payload(job_to_dir)


 class JobLibCodeGen:
@@ -1016,12 +1066,16 @@ class JobLibCodeGen:
            '\nif int(constants.SKYLET_VERSION) < 9: '
            'raise RuntimeError("SkyPilot runtime is too old, which does not '
            'support submitting jobs.")',
-           '\njob_id = job_lib.add_job('
+           '\nresult = job_lib.add_job('
            f'{job_name!r},'
            f'{username!r},'
            f'{run_timestamp!r},'
            f'{resources_str!r})',
-           'print("Job ID: " + str(job_id), flush=True)',
+           ('\nif isinstance(result, tuple):'
+            '\n print("Job ID: " + str(result[0]), flush=True)'
+            '\n print("Log Dir: " + str(result[1]), flush=True)'
+            '\nelse:'
+            '\n print("Job ID: " + str(result), flush=True)'),
        ]
        return cls._build(code)

@@ -1090,9 +1144,17 @@ class JobLibCodeGen:
            # We use != instead of is not because 1 is not None will print a warning:
            # <stdin>:1: SyntaxWarning: "is not" with a literal. Did you mean "!="?
            f'job_id = {job_id} if {job_id} != None else job_lib.get_latest_job_id()',
-           'run_timestamp = job_lib.get_run_timestamp(job_id)',
-           f'log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)',
-           f'tail_log_kwargs = {{"job_id": job_id, "log_dir": log_dir, "managed_job_id": {managed_job_id!r}, "follow": {follow}}}',
+           # For backward compatibility, use the legacy generation rule for
+           # jobs submitted before 0.11.0.
+           ('log_dir = None\n'
+            'if hasattr(job_lib, "get_log_dir_for_job"):\n'
+            ' log_dir = job_lib.get_log_dir_for_job(job_id)\n'
+            'if log_dir is None:\n'
+            ' run_timestamp = job_lib.get_run_timestamp(job_id)\n'
+            f' log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)'
+           ),
+           # Add a newline to leave the if indent block above.
+           f'\ntail_log_kwargs = {{"job_id": job_id, "log_dir": log_dir, "managed_job_id": {managed_job_id!r}, "follow": {follow}}}',
            f'{_LINUX_NEW_LINE}if getattr(constants, "SKYLET_LIB_VERSION", 1) > 1: tail_log_kwargs["tail"] = {tail}',
            f'{_LINUX_NEW_LINE}log_lib.tail_logs(**tail_log_kwargs)',
            # After tailing, check the job status and exit with appropriate code
@@ -1132,12 +1194,14 @@ class JobLibCodeGen:
        return cls._build(code)

    @classmethod
-   def get_run_timestamp_with_globbing(cls,
-                                       job_ids: Optional[List[str]]) -> str:
+   def get_log_dirs_for_jobs(cls, job_ids: Optional[List[str]]) -> str:
        code = [
            f'job_ids = {job_ids} if {job_ids} is not None '
            'else [job_lib.get_latest_job_id()]',
-           'log_dirs = job_lib.run_timestamp_with_globbing_payload(job_ids)',
+           # TODO(aylei): backward compatibility, remove after 0.12.0.
+           'log_dirs = job_lib.get_log_dir_for_jobs(job_ids) if '
+           'hasattr(job_lib, "get_log_dir_for_jobs") else '
+           'job_lib.run_timestamp_with_globbing_payload(job_ids)',
            'print(log_dirs, flush=True)',
        ]
        return cls._build(code)
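
With the changes above, add_job now persists a per-job log directory and returns a (job_id, log_dir) tuple, and the generated submission code prints an extra "Log Dir: ..." line on new runtimes while older skylets still print only the job id, so a caller reading this output has to accept both shapes. The helper below is purely illustrative; the real parsing happens inside SkyPilot's backend, which is not shown in this diff.

from typing import Optional, Tuple

def parse_add_job_output(stdout: str) -> Tuple[int, Optional[str]]:
    """Extract the job id and, if printed, the log dir from add_job output."""
    job_id: Optional[int] = None
    log_dir: Optional[str] = None
    for line in stdout.splitlines():
        if line.startswith('Job ID: '):
            job_id = int(line[len('Job ID: '):])
        elif line.startswith('Log Dir: '):
            # Only emitted when add_job returned a (job_id, log_dir) tuple,
            # i.e. on clusters running the newer skylet; absent on older ones.
            log_dir = line[len('Log Dir: '):]
    if job_id is None:
        raise ValueError('No "Job ID:" line found in submission output')
    return job_id, log_dir

# New runtime: both lines are present (the path here is just an example).
print(parse_add_job_output('Job ID: 42\nLog Dir: ~/sky_logs/42-train'))
# Old runtime: only the job id is printed; log_dir falls back to None.
print(parse_add_job_output('Job ID: 42'))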
sky/task.py CHANGED
@@ -121,27 +121,61 @@ def _fill_in_env_vars(
     return json.loads(yaml_field_str)


-def _check_docker_login_config(task_envs: Dict[str, str]) -> bool:
-    """Checks if there is a valid docker login config in task_envs.
+def _check_docker_login_config(task_envs: Dict[str, str],
+                               task_secrets: Dict[str, str]) -> bool:
+    """Validates a valid docker login config in task_envs and task_secrets.

-    If any of the docker login env vars is set, all of them must be set.
+    Docker login variables must be specified together either in envs OR secrets,
+    not split across both. If any of the docker login env vars is set, all of
+    them must be set in the same location.
+
+    Args:
+        task_envs: Environment variables
+        task_secrets: Secret variables (optional, defaults to empty dict)

     Returns:
-        True if there is a valid docker login config in task_envs.
+        True if there is a valid docker login config.
         False otherwise.
     Raises:
-        ValueError: if any of the docker login env vars is set, but not all of
-            them are set.
+        ValueError: if docker login configuration is invalid.
     """
+    if task_secrets is None:
+        task_secrets = {}
+
     all_keys = constants.DOCKER_LOGIN_ENV_VARS
-    existing_keys = all_keys & set(task_envs.keys())
-    if not existing_keys:
+    envs_keys = all_keys & set(task_envs.keys())
+    secrets_keys = all_keys & set(task_secrets.keys())
+
+    # Check if any docker variables exist
+    if not envs_keys and not secrets_keys:
         return False
-    if len(existing_keys) != len(all_keys):
+
+    # Check if variables are split across envs and secrets
+    if envs_keys and secrets_keys:
         with ux_utils.print_exception_no_traceback():
             raise ValueError(
-                f'If any of {", ".join(all_keys)} is set, all of them must '
-                f'be set. Missing envs: {all_keys - existing_keys}')
+                'Docker login variables must be specified together either '
+                'in envs OR secrets, not split across both. '
+                f'Found in envs: {sorted(envs_keys)}, '
+                f'Found in secrets: {sorted(secrets_keys)}')
+
+    # Check if all variables are present in the chosen location
+    if envs_keys:
+        if len(envs_keys) != len(all_keys):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'Docker login variables must be specified together '
+                    'in envs. '
+                    f'Missing from envs: {sorted(all_keys - envs_keys)}')
+
+    if secrets_keys:
+        if len(secrets_keys) != len(all_keys):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'Docker login variables must be specified together '
+                    'in secrets. '
+                    f'Missing from secrets: {sorted(all_keys - secrets_keys)}')
+
     return True


@@ -149,11 +183,13 @@ def _with_docker_login_config(
     resources: Union[Set['resources_lib.Resources'],
                      List['resources_lib.Resources']],
     task_envs: Dict[str, str],
+    task_secrets: Dict[str, str],
 ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
-    if not _check_docker_login_config(task_envs):
+    if not _check_docker_login_config(task_envs, task_secrets):
         return resources
-    docker_login_config = docker_utils.DockerLoginConfig.from_env_vars(
-        task_envs)
+    envs = task_envs.copy()
+    envs.update(task_secrets)
+    docker_login_config = docker_utils.DockerLoginConfig.from_env_vars(envs)

     def _add_docker_login_config(resources: 'resources_lib.Resources'):
         docker_image = resources.extract_docker_image()
@@ -181,8 +217,11 @@ def _with_docker_username_for_runpod(
     resources: Union[Set['resources_lib.Resources'],
                      List['resources_lib.Resources']],
     task_envs: Dict[str, str],
+    task_secrets: Dict[str, str],
 ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
-    docker_username_for_runpod = task_envs.get(
+    envs = task_envs.copy()
+    envs.update(task_secrets)
+    docker_username_for_runpod = envs.get(
         constants.RUNPOD_DOCKER_USERNAME_ENV_VAR)

     # We should not call r.copy() if docker_username_for_runpod is None,
@@ -204,6 +243,7 @@ class Task:
         setup: Optional[str] = None,
         run: Optional[CommandOrCommandGen] = None,
         envs: Optional[Dict[str, str]] = None,
+        secrets: Optional[Dict[str, str]] = None,
         workdir: Optional[str] = None,
         num_nodes: Optional[int] = None,
         # Advanced:
@@ -254,6 +294,9 @@ class Task:
             self-contained lambda.
           envs: A dictionary of environment variables to set before running the
             setup and run commands.
+          secrets: A dictionary of secret environment variables to set before
+            running the setup and run commands. These will be redacted in logs
+            and YAML output.
           workdir: The local working directory. This directory will be synced
             to a location on the remote VM(s), and ``setup`` and ``run``
             commands will be run under that location (thus, they can rely on
@@ -275,6 +318,13 @@ class Task:
                                   storage_lib.StoreType] = {}
         self.setup = setup
         self._envs = envs or {}
+        self._secrets = secrets or {}
+
+        # Validate Docker login configuration early if both envs and secrets
+        # contain Docker variables
+        if self._envs or self._secrets:
+            _check_docker_login_config(self._envs, self._secrets)
+
         self.workdir = workdir
         self.docker_image = (docker_image if docker_image else
                              'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04')
@@ -447,6 +497,7 @@ class Task:
     def from_yaml_config(
         config: Dict[str, Any],
        env_overrides: Optional[List[Tuple[str, str]]] = None,
+       secrets_overrides: Optional[List[Tuple[str, str]]] = None,
     ) -> 'Task':
         # More robust handling for 'envs': explicitly convert keys and values to
         # str, since users may pass '123' as keys/values which will get parsed
@@ -460,6 +511,20 @@ class Task:
                 else:
                     new_envs[str(k)] = None
             config['envs'] = new_envs
+
+        # More robust handling for 'secrets': explicitly convert keys and values
+        # to str, since users may pass '123' as keys/values which will get
+        # parsed as int causing validate_schema() to fail.
+        secrets = config.get('secrets')
+        if secrets is not None and isinstance(secrets, dict):
+            new_secrets: Dict[str, Optional[str]] = {}
+            for k, v in secrets.items():
+                if v is not None:
+                    new_secrets[str(k)] = str(v)
+                else:
+                    new_secrets[str(k)] = None
+            config['secrets'] = new_secrets
+
         common_utils.validate_schema(config, schemas.get_task_schema(),
                                      'Invalid task YAML: ')
         if env_overrides is not None:
@@ -473,6 +538,12 @@ class Task:
             new_envs.update(env_overrides)
             config['envs'] = new_envs

+        if secrets_overrides is not None:
+            # Override secrets vars from CLI.
+            new_secrets = config.get('secrets', {})
+            new_secrets.update(secrets_overrides)
+            config['secrets'] = new_secrets
+
         for k, v in config.get('envs', {}).items():
             if v is None:
                 with ux_utils.print_exception_no_traceback():
@@ -482,6 +553,15 @@ class Task:
                         f'To set it to be empty, use an empty string ({k}: "" '
                         f'in task YAML or --env {k}="" in CLI).')

+        for k, v in config.get('secrets', {}).items():
+            if v is None:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        f'Secret variable {k!r} is None. Please set a '
+                        'value for it in task YAML or with --secret flag. '
+                        f'To set it to be empty, use an empty string ({k}: "" '
+                        f'in task YAML or --secret {k}="" in CLI).')
+
         # Fill in any Task.envs into file_mounts (src/dst paths, storage
         # name/source).
         if config.get('file_mounts') is not None:
@@ -505,6 +585,7 @@ class Task:
             setup=config.pop('setup', None),
             num_nodes=config.pop('num_nodes', None),
             envs=config.pop('envs', None),
+            secrets=config.pop('secrets', None),
             event_callback=config.pop('event_callback', None),
             file_mounts_mapping=config.pop('file_mounts_mapping', None),
         )
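
The from_yaml_config hunks above add a secrets section that is normalized and validated the same way as envs, plus a secrets_overrides argument mirroring env_overrides (presumably fed by a --secret CLI flag, given the error message above). Below is a rough sketch of loading a task config with secrets; the keys and values are made up for illustration, and the exact call signature may differ from the public API. The secrets property used at the end is added in the next hunk.

import sky

config = {
    'name': 'train',
    'run': 'python train.py',
    'envs': {'MODEL_NAME': 'resnet50'},           # kept as-is when dumped
    'secrets': {'WANDB_API_KEY': 'dummy-value'},  # redacted when dumped
}

task = sky.Task.from_yaml_config(
    config,
    # Mirrors env_overrides: later values win over the YAML ones.
    secrets_overrides=[('WANDB_API_KEY', 'overridden-value')],
)
print(task.secrets)  # {'WANDB_API_KEY': 'overridden-value'}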
@@ -687,6 +768,10 @@ class Task:
     def envs(self) -> Dict[str, str]:
         return self._envs

+    @property
+    def secrets(self) -> Dict[str, str]:
+        return self._secrets
+
     def update_envs(
             self, envs: Union[None, List[Tuple[str, str]],
                               Dict[str, str]]) -> 'Task':
@@ -727,17 +812,70 @@ class Task:
         # If the update_envs() is called after set_resources(), we need to
         # manually update docker login config in task resources, in case the
         # docker login envs are newly added.
-        if _check_docker_login_config(self._envs):
+        if _check_docker_login_config(self._envs, self._secrets):
             self.resources = _with_docker_login_config(self.resources,
-                                                        self._envs)
+                                                        self._envs,
+                                                        self._secrets)
         self.resources = _with_docker_username_for_runpod(
-            self.resources, self._envs)
+            self.resources, self._envs, self._secrets)
+        return self
+
+    def update_secrets(
+            self, secrets: Union[None, List[Tuple[str, str]],
+                                 Dict[str, str]]) -> 'Task':
+        """Updates secret env vars for use inside the setup/run commands.
+
+        Args:
+          secrets: (optional) either a list of ``(secret_name, value)`` or a
+            dict ``{secret_name: value}``.
+
+        Returns:
+          self: The current task, with secrets updated.
+
+        Raises:
+          ValueError: if various invalid inputs errors are detected.
+        """
+        if secrets is None:
+            secrets = {}
+        if isinstance(secrets, (list, tuple)):
+            keys = set(secret[0] for secret in secrets)
+            if len(keys) != len(secrets):
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError('Duplicate secret keys provided.')
+            secrets = dict(secrets)
+        if isinstance(secrets, dict):
+            for key in secrets:
+                if not isinstance(key, str):
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError('Secret keys must be strings.')
+                if not common_utils.is_valid_env_var(key):
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError(f'Invalid secret key: {key}')
+        else:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'secrets must be List[Tuple[str, str]] or Dict[str, str]: '
+                    f'{secrets}')
+        self._secrets.update(secrets)
+        # Validate Docker login configuration if needed
+        if _check_docker_login_config(self._envs, self._secrets):
+            self.resources = _with_docker_login_config(self.resources,
+                                                        self._envs,
+                                                        self._secrets)
+        self.resources = _with_docker_username_for_runpod(
+            self.resources, self._envs, self._secrets)
         return self

     @property
     def use_spot(self) -> bool:
         return any(r.use_spot for r in self.resources)

+    @property
+    def envs_and_secrets(self) -> Dict[str, str]:
+        envs = self.envs.copy()
+        envs.update(self.secrets)
+        return envs
+
     def set_inputs(self, inputs: str,
                    estimated_size_gigabytes: float) -> 'Task':
         # E.g., 's3://bucket', 'gs://bucket', or None.
@@ -796,10 +934,11 @@ class Task:
         if isinstance(resources, sky.Resources):
             resources = {resources}
         # TODO(woosuk): Check if the resources are None.
-        self.resources = _with_docker_login_config(resources, self.envs)
+        self.resources = _with_docker_login_config(resources, self.envs,
+                                                    self.secrets)
         # Only have effect on RunPod.
         self.resources = _with_docker_username_for_runpod(
-            self.resources, self.envs)
+            self.resources, self.envs, self.secrets)

         # Evaluate if the task requires FUSE and set the requires_fuse flag
         for _, storage_obj in self.storage_mounts.items():
@@ -1266,7 +1405,7 @@ class Task:
                 d[k] = v
         return d

-    def to_yaml_config(self) -> Dict[str, Any]:
+    def to_yaml_config(self, redact_secrets: bool = True) -> Dict[str, Any]:
        """Returns a yaml-style dict representation of the task.

        INTERNAL: this method is internal-facing.
@@ -1314,8 +1453,19 @@ class Task:
         add_if_not_none('workdir', self.workdir)
         add_if_not_none('event_callback', self.event_callback)
         add_if_not_none('run', self.run)
+
+        # Add envs without redaction
         add_if_not_none('envs', self.envs, no_empty=True)

+        # Add secrets with redaction if requested
+        secrets = self.secrets
+        if secrets and redact_secrets:
+            secrets = {
+                k: '<redacted>' if isinstance(v, str) else v
+                for k, v in secrets.items()
+            }
+        add_if_not_none('secrets', secrets, no_empty=True)
+
         add_if_not_none('file_mounts', {})

         if self.file_mounts is not None:
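
Taken together, the Task changes above add a secrets channel that behaves like envs at runtime (envs_and_secrets merges the two) but is redacted by default when the task is dumped back to YAML, and docker login variables (constants.DOCKER_LOGIN_ENV_VARS) must now live entirely in envs or entirely in secrets, never split across both. A small sketch of the resulting workflow, based only on the methods shown in this diff; the token name and values are placeholders and to_yaml_config is internal-facing:

import sky

task = sky.Task(run='echo "token is $HF_TOKEN"', envs={'STAGE': 'dev'})
task.update_secrets({'HF_TOKEN': 'hf_dummy_token'})

# At runtime the two mappings are merged into a single environment.
assert task.envs_and_secrets == {'STAGE': 'dev', 'HF_TOKEN': 'hf_dummy_token'}

# When serialized back to YAML, secrets are redacted unless explicitly kept.
print(task.to_yaml_config()['secrets'])                      # {'HF_TOKEN': '<redacted>'}
print(task.to_yaml_config(redact_secrets=False)['secrets'])  # {'HF_TOKEN': 'hf_dummy_token'}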