skypilot-nightly 1.0.0.dev20250615__py3-none-any.whl → 1.0.0.dev20250617__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- sky/__init__.py +2 -4
- sky/backends/cloud_vm_ray_backend.py +43 -60
- sky/cli.py +55 -637
- sky/client/cli.py +55 -637
- sky/clouds/kubernetes.py +3 -0
- sky/clouds/scp.py +7 -26
- sky/clouds/utils/scp_utils.py +177 -124
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
- sky/dashboard/out/_next/static/{R07f8gwfXT1U0zRznq4Lg → vA3PPpkBwpRTRNBHFYAw_}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +98 -31
- sky/jobs/scheduler.py +37 -29
- sky/jobs/server/core.py +36 -3
- sky/jobs/state.py +69 -9
- sky/jobs/utils.py +11 -0
- sky/provision/__init__.py +1 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +528 -0
- sky/resources.py +164 -29
- sky/skylet/constants.py +39 -0
- sky/skylet/job_lib.py +8 -0
- sky/task.py +171 -21
- sky/templates/kubernetes-ray.yml.j2 +51 -4
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/users/permission.py +19 -36
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +16 -14
- sky/utils/context.py +1 -1
- sky/utils/controller_utils.py +12 -3
- sky/utils/dag_utils.py +17 -4
- sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
- sky/utils/schemas.py +43 -5
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/RECORD +54 -57
- sky/benchmark/__init__.py +0 -0
- sky/benchmark/benchmark_state.py +0 -295
- sky/benchmark/benchmark_utils.py +0 -641
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- /sky/dashboard/out/_next/static/{R07f8gwfXT1U0zRznq4Lg → vA3PPpkBwpRTRNBHFYAw_}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/top_level.txt +0 -0
sky/task.py
CHANGED
@@ -121,27 +121,61 @@ def _fill_in_env_vars(
     return json.loads(yaml_field_str)
 
 
-def _check_docker_login_config(task_envs: Dict[str, str]
+def _check_docker_login_config(task_envs: Dict[str, str],
+                               task_secrets: Dict[str, str]) -> bool:
+    """Validates a valid docker login config in task_envs and task_secrets.
 
+    Docker login variables must be specified together either in envs OR secrets,
+    not split across both. If any of the docker login env vars is set, all of
+    them must be set in the same location.
+
+    Args:
+        task_envs: Environment variables
+        task_secrets: Secret variables (optional, defaults to empty dict)
 
     Returns:
-        True if there is a valid docker login config
+        True if there is a valid docker login config.
         False otherwise.
     Raises:
-        ValueError: if
-            them are set.
+        ValueError: if docker login configuration is invalid.
     """
+    if task_secrets is None:
+        task_secrets = {}
+
     all_keys = constants.DOCKER_LOGIN_ENV_VARS
+    envs_keys = all_keys & set(task_envs.keys())
+    secrets_keys = all_keys & set(task_secrets.keys())
+
+    # Check if any docker variables exist
+    if not envs_keys and not secrets_keys:
         return False
+
+    # Check if variables are split across envs and secrets
+    if envs_keys and secrets_keys:
         with ux_utils.print_exception_no_traceback():
             raise ValueError(
+                'Docker login variables must be specified together either '
+                'in envs OR secrets, not split across both. '
+                f'Found in envs: {sorted(envs_keys)}, '
+                f'Found in secrets: {sorted(secrets_keys)}')
+
+    # Check if all variables are present in the chosen location
+    if envs_keys:
+        if len(envs_keys) != len(all_keys):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'Docker login variables must be specified together '
+                    'in envs. '
+                    f'Missing from envs: {sorted(all_keys - envs_keys)}')
+
+    if secrets_keys:
+        if len(secrets_keys) != len(all_keys):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'Docker login variables must be specified together '
+                    'in secrets. '
+                    f'Missing from secrets: {sorted(all_keys - secrets_keys)}')
+
     return True
 
 
@@ -149,11 +183,13 @@ def _with_docker_login_config(
         resources: Union[Set['resources_lib.Resources'],
                          List['resources_lib.Resources']],
         task_envs: Dict[str, str],
+        task_secrets: Dict[str, str],
 ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
-    if not _check_docker_login_config(task_envs):
+    if not _check_docker_login_config(task_envs, task_secrets):
         return resources
+    envs = task_envs.copy()
+    envs.update(task_secrets)
+    docker_login_config = docker_utils.DockerLoginConfig.from_env_vars(envs)
 
     def _add_docker_login_config(resources: 'resources_lib.Resources'):
         docker_image = resources.extract_docker_image()
@@ -181,8 +217,11 @@ def _with_docker_username_for_runpod(
         resources: Union[Set['resources_lib.Resources'],
                          List['resources_lib.Resources']],
         task_envs: Dict[str, str],
+        task_secrets: Dict[str, str],
 ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
+    envs = task_envs.copy()
+    envs.update(task_secrets)
+    docker_username_for_runpod = envs.get(
         constants.RUNPOD_DOCKER_USERNAME_ENV_VAR)
 
     # We should not call r.copy() if docker_username_for_runpod is None,
@@ -204,6 +243,7 @@ class Task:
                 setup: Optional[str] = None,
                 run: Optional[CommandOrCommandGen] = None,
                 envs: Optional[Dict[str, str]] = None,
+                secrets: Optional[Dict[str, str]] = None,
                 workdir: Optional[str] = None,
                 num_nodes: Optional[int] = None,
                 # Advanced:
@@ -254,6 +294,9 @@ class Task:
            self-contained lambda.
          envs: A dictionary of environment variables to set before running the
            setup and run commands.
+          secrets: A dictionary of secret environment variables to set before
+            running the setup and run commands. These will be redacted in logs
+            and YAML output.
          workdir: The local working directory. This directory will be synced
            to a location on the remote VM(s), and ``setup`` and ``run``
            commands will be run under that location (thus, they can rely on
@@ -275,6 +318,13 @@ class Task:
                                  storage_lib.StoreType] = {}
         self.setup = setup
         self._envs = envs or {}
+        self._secrets = secrets or {}
+
+        # Validate Docker login configuration early if both envs and secrets
+        # contain Docker variables
+        if self._envs or self._secrets:
+            _check_docker_login_config(self._envs, self._secrets)
+
         self.workdir = workdir
         self.docker_image = (docker_image if docker_image else
                              'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04')
@@ -447,6 +497,7 @@ class Task:
     def from_yaml_config(
         config: Dict[str, Any],
         env_overrides: Optional[List[Tuple[str, str]]] = None,
+        secrets_overrides: Optional[List[Tuple[str, str]]] = None,
     ) -> 'Task':
        # More robust handling for 'envs': explicitly convert keys and values to
        # str, since users may pass '123' as keys/values which will get parsed
@@ -460,6 +511,20 @@ class Task:
            else:
                new_envs[str(k)] = None
        config['envs'] = new_envs
+
+       # More robust handling for 'secrets': explicitly convert keys and values
+       # to str, since users may pass '123' as keys/values which will get
+       # parsed as int causing validate_schema() to fail.
+       secrets = config.get('secrets')
+       if secrets is not None and isinstance(secrets, dict):
+           new_secrets: Dict[str, Optional[str]] = {}
+           for k, v in secrets.items():
+               if v is not None:
+                   new_secrets[str(k)] = str(v)
+               else:
+                   new_secrets[str(k)] = None
+           config['secrets'] = new_secrets
+
        common_utils.validate_schema(config, schemas.get_task_schema(),
                                     'Invalid task YAML: ')
        if env_overrides is not None:
@@ -473,6 +538,12 @@ class Task:
            new_envs.update(env_overrides)
            config['envs'] = new_envs
 
+       if secrets_overrides is not None:
+           # Override secrets vars from CLI.
+           new_secrets = config.get('secrets', {})
+           new_secrets.update(secrets_overrides)
+           config['secrets'] = new_secrets
+
        for k, v in config.get('envs', {}).items():
            if v is None:
                with ux_utils.print_exception_no_traceback():
@@ -482,6 +553,15 @@ class Task:
                        f'To set it to be empty, use an empty string ({k}: "" '
                        f'in task YAML or --env {k}="" in CLI).')
 
+       for k, v in config.get('secrets', {}).items():
+           if v is None:
+               with ux_utils.print_exception_no_traceback():
+                   raise ValueError(
+                       f'Secret variable {k!r} is None. Please set a '
+                       'value for it in task YAML or with --secret flag. '
+                       f'To set it to be empty, use an empty string ({k}: "" '
+                       f'in task YAML or --secret {k}="" in CLI).')
+
        # Fill in any Task.envs into file_mounts (src/dst paths, storage
        # name/source).
        if config.get('file_mounts') is not None:
@@ -505,6 +585,7 @@ class Task:
            setup=config.pop('setup', None),
            num_nodes=config.pop('num_nodes', None),
            envs=config.pop('envs', None),
+           secrets=config.pop('secrets', None),
            event_callback=config.pop('event_callback', None),
            file_mounts_mapping=config.pop('file_mounts_mapping', None),
        )
@@ -687,6 +768,10 @@ class Task:
    def envs(self) -> Dict[str, str]:
        return self._envs
 
+   @property
+   def secrets(self) -> Dict[str, str]:
+       return self._secrets
+
    def update_envs(
            self, envs: Union[None, List[Tuple[str, str]],
                              Dict[str, str]]) -> 'Task':
@@ -727,17 +812,70 @@ class Task:
        # If the update_envs() is called after set_resources(), we need to
        # manually update docker login config in task resources, in case the
        # docker login envs are newly added.
-       if _check_docker_login_config(self._envs):
+       if _check_docker_login_config(self._envs, self._secrets):
            self.resources = _with_docker_login_config(self.resources,
-                                                       self._envs)
+                                                       self._envs,
+                                                       self._secrets)
        self.resources = _with_docker_username_for_runpod(
-           self.resources, self._envs)
+           self.resources, self._envs, self._secrets)
+       return self
+
+   def update_secrets(
+           self, secrets: Union[None, List[Tuple[str, str]],
+                                Dict[str, str]]) -> 'Task':
+       """Updates secret env vars for use inside the setup/run commands.
+
+       Args:
+         secrets: (optional) either a list of ``(secret_name, value)`` or a
+           dict ``{secret_name: value}``.
+
+       Returns:
+         self: The current task, with secrets updated.
+
+       Raises:
+         ValueError: if various invalid inputs errors are detected.
+       """
+       if secrets is None:
+           secrets = {}
+       if isinstance(secrets, (list, tuple)):
+           keys = set(secret[0] for secret in secrets)
+           if len(keys) != len(secrets):
+               with ux_utils.print_exception_no_traceback():
+                   raise ValueError('Duplicate secret keys provided.')
+           secrets = dict(secrets)
+       if isinstance(secrets, dict):
+           for key in secrets:
+               if not isinstance(key, str):
+                   with ux_utils.print_exception_no_traceback():
+                       raise ValueError('Secret keys must be strings.')
+               if not common_utils.is_valid_env_var(key):
+                   with ux_utils.print_exception_no_traceback():
+                       raise ValueError(f'Invalid secret key: {key}')
+       else:
+           with ux_utils.print_exception_no_traceback():
+               raise ValueError(
+                   'secrets must be List[Tuple[str, str]] or Dict[str, str]: '
+                   f'{secrets}')
+       self._secrets.update(secrets)
+       # Validate Docker login configuration if needed
+       if _check_docker_login_config(self._envs, self._secrets):
+           self.resources = _with_docker_login_config(self.resources,
+                                                       self._envs,
+                                                       self._secrets)
+       self.resources = _with_docker_username_for_runpod(
+           self.resources, self._envs, self._secrets)
        return self
 
    @property
    def use_spot(self) -> bool:
        return any(r.use_spot for r in self.resources)
 
+   @property
+   def envs_and_secrets(self) -> Dict[str, str]:
+       envs = self.envs.copy()
+       envs.update(self.secrets)
+       return envs
+
    def set_inputs(self, inputs: str,
                   estimated_size_gigabytes: float) -> 'Task':
        # E.g., 's3://bucket', 'gs://bucket', or None.
@@ -796,10 +934,11 @@ class Task:
        if isinstance(resources, sky.Resources):
            resources = {resources}
        # TODO(woosuk): Check if the resources are None.
-       self.resources = _with_docker_login_config(resources, self.envs)
+       self.resources = _with_docker_login_config(resources, self.envs,
+                                                   self.secrets)
        # Only have effect on RunPod.
        self.resources = _with_docker_username_for_runpod(
-           self.resources, self.envs)
+           self.resources, self.envs, self.secrets)
 
        # Evaluate if the task requires FUSE and set the requires_fuse flag
        for _, storage_obj in self.storage_mounts.items():
@@ -1266,7 +1405,7 @@ class Task:
                d[k] = v
        return d
 
-   def to_yaml_config(self) -> Dict[str, Any]:
+   def to_yaml_config(self, redact_secrets: bool = True) -> Dict[str, Any]:
        """Returns a yaml-style dict representation of the task.
 
        INTERNAL: this method is internal-facing.
@@ -1314,8 +1453,19 @@ class Task:
        add_if_not_none('workdir', self.workdir)
        add_if_not_none('event_callback', self.event_callback)
        add_if_not_none('run', self.run)
+
+       # Add envs without redaction
        add_if_not_none('envs', self.envs, no_empty=True)
 
+       # Add secrets with redaction if requested
+       secrets = self.secrets
+       if secrets and redact_secrets:
+           secrets = {
+               k: '<redacted>' if isinstance(v, str) else v
+               for k, v in secrets.items()
+           }
+       add_if_not_none('secrets', secrets, no_empty=True)
+
        add_if_not_none('file_mounts', {})
 
        if self.file_mounts is not None:
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -632,19 +632,66 @@ available_node_types:
           {% if high_availability %}
             mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
             if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
+              SKYPILOT_HA_RECOVERY_LOG="/tmp/ha_recovery.log"
+              echo "Starting HA recovery at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
+              start_time=$SECONDS
+              retry_count=0
+
+              # Wait for Ray to be ready, as the following commands is depending on Ray.
+              GET_RAY_STATUS_CMD=$({{sky_python_cmd}} -c 'from sky.provision import instance_setup; print(instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND)')
+              while true; do
+                retry_count=$((retry_count + 1))
+                current_duration=$(( SECONDS - start_time ))
+                echo "Attempt $retry_count to get Ray status after $current_duration seconds..." >> $SKYPILOT_HA_RECOVERY_LOG
+
+                bash --login -c "$GET_RAY_STATUS_CMD"
+                if [ $? -eq 0 ]; then
+                  wait_duration=$(( SECONDS - start_time ))
+                  echo "Ray ready after waiting $wait_duration seconds (took $retry_count attempts)" >> $SKYPILOT_HA_RECOVERY_LOG
+                  break
+                fi
+                echo "Waiting for Ray to be ready..." >> $SKYPILOT_HA_RECOVERY_LOG
+                sleep 2
+              done
+
               # ! Keep this aligned with `CloudVmRayBackend._setup()`
-              # Suppose all `task.setup` are the same for
+              # Suppose all `task.setup` are the same for sky serve / managed jobs controller task.
               # So be careful for compatibility issue once you change it.
               chmod +x {{k8s_high_availability_deployment_setup_script_path}}
               /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && {{k8s_high_availability_deployment_setup_script_path}} > /tmp/controller_recovery_setup_commands.log 2>&1"
-              echo "=== Controller setup commands completed for recovery ==="
+              echo "=== Controller setup commands completed for recovery at $(date) ===" >> $SKYPILOT_HA_RECOVERY_LOG
+
+              touch {{k8s_high_availability_restarting_signal_file}}
+              # Get all in-progress jobs from managed jobs controller. We skip any jobs that are already done.
+              # Also, skip the jobs that are waiting to be scheduled as those does not have a controller process running.
+              # For SkyServe, this will be None and every service will be recovered. This is because SkyServe
+              # will delete the service from the database after it is terminated so everything in the database is running.
+              ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs = state.get_managed_jobs(); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
+              if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
+                read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
+              fi
               for file in {{k8s_high_availability_deployment_run_script_dir}}/*; do
+                # This is the cluster job id on managed jobs controller, but it is guaranteed to be the same as the managed job id,
+                # so we directly use it here. See `CloudVmRayBackend._exec_code_on_head::_dump_code_to_file` for more details.
+                JOB_ID=$(basename $file | sed 's/sky_job_//')
+                # If the list of in-progress jobs is not None (meaning this is a managed job HA controller) and job is not in-progress, skip.
+                if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
+                  if [[ ! " ${ALL_IN_PROGRESS_JOBS_SEQ[@]} " =~ " ${JOB_ID} " ]]; then
+                    continue
+                  fi
+                fi
                 # ! Keep this aligned with `CloudVmRayBackend._execute()`
                 chmod +x $file
+                # TODO(tian): This logic may run a lot of things if the jobs controller previously had many jobs.
+                # We should do more tests and make sure it will scale well.
                 /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && $file > /tmp/task_run_$(basename $file).log 2>&1"
-                echo "=== Controller task run for service (file: $file) completed for recovery ==="
+                echo "=== Controller task run for service / job (file: $file) completed for recovery at $(date) ===" >> $SKYPILOT_HA_RECOVERY_LOG
               done
+              rm {{k8s_high_availability_restarting_signal_file}}
+
+              duration=$(( SECONDS - start_time ))
+              echo "HA recovery completed at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
+              echo "Total recovery time: $duration seconds" >> $SKYPILOT_HA_RECOVERY_LOG
             fi
 
             touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
sky/templates/scp-ray.yml.j2
CHANGED
@@ -7,7 +7,7 @@ idle_timeout_minutes: 60
 
 provider:
   type: external
-  module: sky.
+  module: sky.provision.scp
   region: {{region}}
   cache_stopped_nodes: True
 
@@ -24,19 +24,6 @@ available_node_types:
       InstanceType: {{instance_type}}
       imageId: {{image_id}}
       diskSize: {{disk_size}}
-{% if num_nodes > 1 %}
-  ray_worker_default:
-    min_workers: {{num_nodes - 1}}
-    max_workers: {{num_nodes - 1}}
-    resources: {}
-    node_config:
-      AuthorizedKey: |
-        skypilot:ssh_public_key_content
-      InstanceType: {{instance_type}}
-      imageId: {{image_id}}
-      diskSize: {{disk_size}}
-
-{%- endif %}
 
 head_node_type: ray_head_default
 
@@ -50,10 +37,6 @@ file_mounts: {
 {%- endfor %}
 }
 
-rsync_exclude: []
-
-initialization_commands: []
-
 # List of shell commands to run to set up nodes.
 # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
 # connection, which is expensive. Try your best to co-locate commands into fewer
@@ -77,36 +60,6 @@ setup_commands:
   sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
   mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
   [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
-  {{ ssh_max_sessions_config }}
-
-# Command to start ray on the head node. You don't need to change this.
-# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
-# connection, which is expensive. Try your best to co-locate commands into fewer
-# items! The same comment applies for worker_start_ray_commands.
-#
-# Increment the following for catching performance bugs easier:
-# current num items (num SSH connections): 1
-head_start_ray_commands:
-  # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
-  # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
-  # all the sessions to be reloaded. This is a workaround.
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-    {{dump_port_command}}; {{ray_head_wait_initialized_command}}
-
-{%- if num_nodes > 1 %}
-worker_start_ray_commands:
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-{%- else %}
-worker_start_ray_commands: []
-{%- endif %}
-
-head_node: {}
-worker_nodes: {}
 
-#
-
-worker_setup_commands: []
-cluster_synced_files: []
-file_mounts_sync_continuously: False
+# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list it here anymore.
sky/users/permission.py
CHANGED
@@ -30,36 +30,28 @@ class PermissionService:
     """Permission service for SkyPilot API Server."""
 
     def __init__(self):
-                                      'model.conf')
-            enforcer = casbin.Enforcer(model_path, adapter)
-            self.enforcer = enforcer
-        else:
-            self.enforcer = _enforcer_instance.enforcer
-        else:
-            self.enforcer = _enforcer_instance.enforcer
-        with _policy_lock():
-            self._maybe_initialize_policies()
+        global _enforcer_instance
+        if _enforcer_instance is None:
+            # For different threads, we share the same enforcer instance.
+            with _lock:
+                if _enforcer_instance is None:
+                    _enforcer_instance = self
+                    engine = global_user_state.initialize_and_get_db()
+                    adapter = sqlalchemy_adapter.Adapter(engine)
+                    model_path = os.path.join(os.path.dirname(__file__),
+                                              'model.conf')
+                    enforcer = casbin.Enforcer(model_path, adapter)
+                    self.enforcer = enforcer
+                else:
+                    self.enforcer = _enforcer_instance.enforcer
+        else:
+            self.enforcer = _enforcer_instance.enforcer
+        with _policy_lock():
+            self._maybe_initialize_policies()
 
     def _maybe_initialize_policies(self) -> None:
         """Initialize policies if they don't already exist."""
+        # TODO(zhwu): we should avoid running this on client side.
         logger.debug(f'Initializing policies in process: {os.getpid()}')
         self._load_policy_no_lock()
 
@@ -138,7 +130,6 @@ class PermissionService:
 
     def add_user_if_not_exists(self, user_id: str) -> None:
         """Add user role relationship."""
-        self._lazy_initialize()
         with _policy_lock():
             self._add_user_if_not_exists_no_lock(user_id)
 
@@ -158,7 +149,6 @@ class PermissionService:
 
     def update_role(self, user_id: str, new_role: str) -> None:
         """Update user role relationship."""
-        self._lazy_initialize()
         with _policy_lock():
             # Get current roles
             self._load_policy_no_lock()
@@ -191,7 +181,6 @@ class PermissionService:
         Returns:
             A list of role names that the user has.
         """
-        self._lazy_initialize()
         self._load_policy_no_lock()
         return self.enforcer.get_roles_for_user(user_id)
 
@@ -204,7 +193,6 @@ class PermissionService:
         # it is a hot path in every request. It is ok to have a stale policy,
         # as long as it is eventually consistent.
         # self._load_policy_no_lock()
-        self._lazy_initialize()
         return self.enforcer.enforce(user_id, path, method)
 
     def _load_policy_no_lock(self):
@@ -213,7 +201,6 @@ class PermissionService:
 
     def load_policy(self):
         """Load policy from storage with lock."""
-        self._lazy_initialize()
         with _policy_lock():
             self._load_policy_no_lock()
 
@@ -229,7 +216,6 @@ class PermissionService:
         For public workspaces, the permission is granted via a wildcard policy
         ('*').
         """
-        self._lazy_initialize()
         if os.getenv(constants.ENV_VAR_IS_SKYPILOT_SERVER) is None:
             # When it is not on API server, we allow all users to access all
             # workspaces, as the workspace check has been done on API server.
@@ -257,7 +243,6 @@ class PermissionService:
         For public workspaces, this should be ['*'].
         For private workspaces, this should be specific user IDs.
         """
-        self._lazy_initialize()
         with _policy_lock():
             for user in users:
                 logger.debug(f'Adding workspace policy: user={user}, '
@@ -275,7 +260,6 @@ class PermissionService:
         For public workspaces, this should be ['*'].
         For private workspaces, this should be specific user IDs.
         """
-        self._lazy_initialize()
         with _policy_lock():
             self._load_policy_no_lock()
             # Remove all existing policies for this workspace
@@ -289,7 +273,6 @@ class PermissionService:
 
     def remove_workspace_policy(self, workspace_name: str) -> None:
         """Remove workspace policy."""
-        self._lazy_initialize()
         with _policy_lock():
             self.enforcer.remove_filtered_policy(1, workspace_name)
             self.enforcer.save_policy()
sky/utils/command_runner.py
CHANGED
@@ -561,7 +561,7 @@ class SSHCommandRunner(CommandRunner):
         if self.ssh_control_name is not None:
             control_path = _ssh_control_path(self.ssh_control_name)
             if control_path is not None:
-                # Suppress the `Exit request sent.` output for this
+                # Suppress the `Exit request sent.` output for this command
                 # which would interrupt the CLI spinner.
                 cmd = (f'ssh -O exit -S {control_path}/%C '
                        f'{self.ssh_user}@{self.ip} > /dev/null 2>&1')
|